While fuzzing an arm64 kernel, Alexander Potapenko reported:
| BUG: KCSAN: data-race in ktime_get_mono_fast_ns / timekeeping_update
|
| write to 0xffffffc082e74248 of 56 bytes by interrupt on cpu 0:
| update_fast_timekeeper kernel/time/timekeeping.c:430 [inline]
| timekeeping_update+0x1d8/0x2d8 kernel/time/timekeeping.c:768
| timekeeping_advance+0x9e8/0xb78 kernel/time/timekeeping.c:2344
| update_wall_time+0x18/0x38 kernel/time/timekeeping.c:2360
| tick_do_update_jiffies64+0x164/0x1b0 kernel/time/tick-sched.c:149
| tick_nohz_handler+0xa4/0x2a8 kernel/time/tick-sched.c:232
| __hrtimer_run_queues+0x198/0x33c kernel/time/hrtimer.c:1691
| hrtimer_interrupt+0x16c/0x630 kernel/time/hrtimer.c:1817
| timer_handler drivers/clocksource/arm_arch_timer.c:674 [inline]
| arch_timer_handler_phys+0x60/0x74 drivers/clocksource/arm_arch_timer.c:692
| handle_percpu_devid_irq+0xd8/0x1ec kernel/irq/chip.c:942
| generic_handle_irq_desc include/linux/irqdesc.h:173 [inline]
| handle_irq_desc kernel/irq/irqdesc.c:692 [inline]
| generic_handle_domain_irq+0x5c/0x7c kernel/irq/irqdesc.c:748
| gic_handle_irq+0x78/0x1b0 drivers/irqchip/irq-gic-v3.c:843
| call_on_irq_stack+0x24/0x4c arch/arm64/kernel/entry.S:889
| do_interrupt_handler+0x74/0xa8 arch/arm64/kernel/entry-common.c:310
| __el1_irq arch/arm64/kernel/entry-common.c:536 [inline]
| el1_interrupt+0x34/0x54 arch/arm64/kernel/entry-common.c:551
| el1h_64_irq_handler+0x18/0x24 arch/arm64/kernel/entry-common.c:556
| el1h_64_irq+0x64/0x68 arch/arm64/kernel/entry.S:594
| __daif_local_irq_enable arch/arm64/include/asm/irqflags.h:26 [inline]
| arch_local_irq_enable arch/arm64/include/asm/irqflags.h:48 [inline]
| kvm_arch_vcpu_ioctl_run+0x8d4/0xf48 arch/arm64/kvm/arm.c:1259
| kvm_vcpu_ioctl+0x650/0x894 virt/kvm/kvm_main.c:4487
| __do_sys_ioctl fs/ioctl.c:51 [inline]
| __se_sys_ioctl fs/ioctl.c:893 [inline]
| __arm64_sys_ioctl+0xf8/0x170 fs/ioctl.c:893
| [...]
|
| read to 0xffffffc082e74258 of 8 bytes by task 5260 on cpu 1:
| __ktime_get_fast_ns kernel/time/timekeeping.c:372 [inline]
| ktime_get_mono_fast_ns+0x88/0x174 kernel/time/timekeeping.c:489
| init_srcu_struct_fields+0x40c/0x530 kernel/rcu/srcutree.c:263
| init_srcu_struct+0x14/0x20 kernel/rcu/srcutree.c:311
| kvm_dev_ioctl+0x304/0x908 virt/kvm/kvm_main.c:1185
| __do_sys_ioctl fs/ioctl.c:51 [inline]
| __se_sys_ioctl fs/ioctl.c:893 [inline]
| __arm64_sys_ioctl+0xf8/0x170 fs/ioctl.c:893
| [...]
|
| value changed: 0x000002f875d33266 -> 0x000002f877416866
|
| Reported by Kernel Concurrency Sanitizer on:
| CPU: 1 UID: 0 PID: 5260 Comm: syz.2.7483 Not tainted 6.12.0-rc3-dirty #78
This is a false positive data race between a seqcount latch writer and a
reader accessing stale data.
Unlike the regular seqlock interface, the seqcount_latch interface for
latch writers never has a well-defined critical section.
To support KCSAN, optimistically declare that a fixed number of
memory accesses after raw_write_seqcount_latch() are "atomic". Latch
readers follow a similar pattern to the regular seqlock interface. This
effectively tells KCSAN that data races on accesses to seqcount_latch
protected data should be ignored.
Reviewing current raw_write_seqcount_latch() callers, the most common
patterns involve only a few memory accesses, either a single plain C
assignment or a memcpy; therefore, the value of 8 memory accesses after
raw_write_seqcount_latch() is chosen to (a) avoid most false positives,
and (b) avoid an excessive number of false negatives (which would result
from inadvertently declaring most accesses in the proximity of
update_fast_timekeeper() as "atomic").
Reported-by: Alexander Potapenko <glider@google.com>
Tested-by: Alexander Potapenko <glider@google.com>
Fixes: 88ecd153be95 ("seqlock, kcsan: Add annotations for KCSAN")
Signed-off-by: Marco Elver <elver@google.com>
---
include/linux/seqlock.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index fffeb754880f..e24cf144276e 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -614,6 +614,7 @@ typedef struct {
*/
static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
+ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
/*
* Pairs with the first smp_wmb() in raw_write_seqcount_latch().
* Due to the dependent load, a full smp_rmb() is not needed.
@@ -631,6 +632,7 @@ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *
static __always_inline int
raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
+ kcsan_atomic_next(0);
smp_rmb();
return unlikely(READ_ONCE(s->seqcount.sequence) != start);
}
@@ -721,6 +723,13 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
smp_wmb(); /* prior stores before incrementing "sequence" */
s->seqcount.sequence++;
smp_wmb(); /* increment "sequence" before following stores */
+
+ /*
+ * Latch writers do not have a well-defined critical section, but to
+ * avoid most false positives, at the cost of false negatives, assume
+ * the next few memory accesses belong to the latch writer.
+ */
+ kcsan_atomic_next(8);
}
#define __SEQLOCK_UNLOCKED(lockname) \
--
2.47.0.163.g1226f6d8fa-goog
On Tue, Oct 29, 2024 at 09:36:29AM +0100, Marco Elver wrote:
> Reviewing current raw_write_seqcount_latch() callers, the most common
> patterns involve only few memory accesses, either a single plain C
> assignment, or memcpy;
Then I assume you've encountered latch_tree_{insert,erase}() in your
travels, right?
Also, I note that update_clock_read_data() seems to do things
'backwards' and will completely elide your proposed annotation.
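(For reference, the current ordering -- visible as the removed lines in
the sched_clock.c diff later in this thread -- is roughly:

	static void update_clock_read_data(struct clock_read_data *rd)
	{
		/* update the backup (odd) copy with the new data */
		cd.read_data[1] = *rd;	/* store issued before any latch increment */

		/* steer readers towards the odd copy */
		raw_write_seqcount_latch(&cd.seq);

		/* now its safe for us to update the normal (even) copy */
		cd.read_data[0] = *rd;

		/* switch readers back to the even copy */
		raw_write_seqcount_latch(&cd.seq);
	}

so a kcsan_atomic_next() placed inside raw_write_seqcount_latch() never
covers the first store.)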
> therefore, the value of 8 memory accesses after
> raw_write_seqcount_latch() is chosen to (a) avoid most false positives,
> and (b) avoid excessive number of false negatives (due to inadvertently
> declaring most accesses in the proximity of update_fast_timekeeper() as
> "atomic").
The above latch'ed RB-trees can certainly exceed this magical number 8.
> Reported-by: Alexander Potapenko <glider@google.com>
> Tested-by: Alexander Potapenko <glider@google.com>
> Fixes: 88ecd153be95 ("seqlock, kcsan: Add annotations for KCSAN")
> Signed-off-by: Marco Elver <elver@google.com>
> ---
> include/linux/seqlock.h | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> index fffeb754880f..e24cf144276e 100644
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -614,6 +614,7 @@ typedef struct {
> */
> static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
> {
> + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
> /*
> * Pairs with the first smp_wmb() in raw_write_seqcount_latch().
> * Due to the dependent load, a full smp_rmb() is not needed.
> @@ -631,6 +632,7 @@ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *
> static __always_inline int
> raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
> {
> + kcsan_atomic_next(0);
> smp_rmb();
> return unlikely(READ_ONCE(s->seqcount.sequence) != start);
> }
> @@ -721,6 +723,13 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
> smp_wmb(); /* prior stores before incrementing "sequence" */
> s->seqcount.sequence++;
> smp_wmb(); /* increment "sequence" before following stores */
> +
> + /*
> + * Latch writers do not have a well-defined critical section, but to
> + * avoid most false positives, at the cost of false negatives, assume
> + * the next few memory accesses belong to the latch writer.
> + */
> + kcsan_atomic_next(8);
> }
Given there are so very few latch users, would it make sense to
introduce a raw_write_seqcount_latch_end() callback that does
kcsan_atomic_next(0) ? -- or something along those lines? Then you won't
have to assume such a small number.
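A minimal sketch of that idea (a hypothetical helper, not necessarily
what would eventually be proposed verbatim):

	static inline void raw_write_seqcount_latch_end(seqcount_latch_t *s)
	{
		/* Close the KCSAN "atomic" window opened by raw_write_seqcount_latch(). */
		kcsan_atomic_next(0);
	}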
On Tue, 29 Oct 2024 at 12:49, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Oct 29, 2024 at 09:36:29AM +0100, Marco Elver wrote:
> > Reviewing current raw_write_seqcount_latch() callers, the most common
> > patterns involve only few memory accesses, either a single plain C
> > assignment, or memcpy;
>
> Then I assume you've encountered latch_tree_{insert,erase}() in your
> travels, right?
Oops. That one certainly exceeds the "8 memory accesses".
> Also, I note that update_clock_read_data() seems to do things
> 'backwards' and will completely elide your proposed annotation.
Hmm, for the first access, yes. This particular oddity could be
"fixed" by surrounding the accesses by
kcsan_nestable_atomic_begin/end(). I don't know if it warrants adding
a raw_write_seqcount_latch_begin().
Preferences?
> > therefore, the value of 8 memory accesses after
> > raw_write_seqcount_latch() is chosen to (a) avoid most false positives,
> > and (b) avoid excessive number of false negatives (due to inadvertently
> > declaring most accesses in the proximity of update_fast_timekeeper() as
> > "atomic").
>
> The above latch'ed RB-trees can certainly exceed this magical number 8.
>
> > Reported-by: Alexander Potapenko <glider@google.com>
> > Tested-by: Alexander Potapenko <glider@google.com>
> > Fixes: 88ecd153be95 ("seqlock, kcsan: Add annotations for KCSAN")
> > Signed-off-by: Marco Elver <elver@google.com>
> > ---
> > include/linux/seqlock.h | 9 +++++++++
> > 1 file changed, 9 insertions(+)
> >
> > diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> > index fffeb754880f..e24cf144276e 100644
> > --- a/include/linux/seqlock.h
> > +++ b/include/linux/seqlock.h
> > @@ -614,6 +614,7 @@ typedef struct {
> > */
> > static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
> > {
> > + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
> > /*
> > * Pairs with the first smp_wmb() in raw_write_seqcount_latch().
> > * Due to the dependent load, a full smp_rmb() is not needed.
> > @@ -631,6 +632,7 @@ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *
> > static __always_inline int
> > raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
> > {
> > + kcsan_atomic_next(0);
> > smp_rmb();
> > return unlikely(READ_ONCE(s->seqcount.sequence) != start);
> > }
> > @@ -721,6 +723,13 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
> > smp_wmb(); /* prior stores before incrementing "sequence" */
> > s->seqcount.sequence++;
> > smp_wmb(); /* increment "sequence" before following stores */
> > +
> > + /*
> > + * Latch writers do not have a well-defined critical section, but to
> > + * avoid most false positives, at the cost of false negatives, assume
> > + * the next few memory accesses belong to the latch writer.
> > + */
> > + kcsan_atomic_next(8);
> > }
>
> Given there are so very few latch users, would it make sense to
> introduce a raw_write_seqcount_latch_end() callback that does
> kcsan_atomic_next(0) ? -- or something along those lines? Then you won't
> have to assume such a small number.
That's something I considered, but thought I'd try the unintrusive
version first. But since you proposed it here, I'd much prefer that,
too. ;-)
Let me try that.
Thanks,
-- Marco
On Tue, Oct 29, 2024 at 02:05:38PM +0100, Marco Elver wrote:
> On Tue, 29 Oct 2024 at 12:49, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Oct 29, 2024 at 09:36:29AM +0100, Marco Elver wrote:
> > > Reviewing current raw_write_seqcount_latch() callers, the most common
> > > patterns involve only few memory accesses, either a single plain C
> > > assignment, or memcpy;
> >
> > Then I assume you've encountered latch_tree_{insert,erase}() in your
> > travels, right?
>
> Oops. That once certainly exceeds the "8 memory accesses".
>
> > Also, I note that update_clock_read_data() seems to do things
> > 'backwards' and will completely elide your proposed annotation.
>
> Hmm, for the first access, yes. This particular oddity could be
> "fixed" by surrounding the accesses by
> kcsan_nestable_atomic_begin/end(). I don't know if it warrants adding
> a raw_write_seqcount_latch_begin().
>
> Preferences?
I *think* it is doable to flip it around to the 'normal' order, but
given I've been near cross-eyed with a head-ache these past two days,
I'm not going to attempt a patch for you, since I'm bound to get it
wrong :/
> > > therefore, the value of 8 memory accesses after
> > > raw_write_seqcount_latch() is chosen to (a) avoid most false positives,
> > > and (b) avoid excessive number of false negatives (due to inadvertently
> > > declaring most accesses in the proximity of update_fast_timekeeper() as
> > > "atomic").
> >
> > The above latch'ed RB-trees can certainly exceed this magical number 8.
> >
> > > Reported-by: Alexander Potapenko <glider@google.com>
> > > Tested-by: Alexander Potapenko <glider@google.com>
> > > Fixes: 88ecd153be95 ("seqlock, kcsan: Add annotations for KCSAN")
> > > Signed-off-by: Marco Elver <elver@google.com>
> > > ---
> > > include/linux/seqlock.h | 9 +++++++++
> > > 1 file changed, 9 insertions(+)
> > >
> > > diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> > > index fffeb754880f..e24cf144276e 100644
> > > --- a/include/linux/seqlock.h
> > > +++ b/include/linux/seqlock.h
> > > @@ -614,6 +614,7 @@ typedef struct {
> > > */
> > > static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
> > > {
> > > + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
> > > /*
> > > * Pairs with the first smp_wmb() in raw_write_seqcount_latch().
> > > * Due to the dependent load, a full smp_rmb() is not needed.
> > > @@ -631,6 +632,7 @@ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *
> > > static __always_inline int
> > > raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
> > > {
> > > + kcsan_atomic_next(0);
> > > smp_rmb();
> > > return unlikely(READ_ONCE(s->seqcount.sequence) != start);
> > > }
> > > @@ -721,6 +723,13 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
> > > smp_wmb(); /* prior stores before incrementing "sequence" */
> > > s->seqcount.sequence++;
> > > smp_wmb(); /* increment "sequence" before following stores */
> > > +
> > > + /*
> > > + * Latch writers do not have a well-defined critical section, but to
> > > + * avoid most false positives, at the cost of false negatives, assume
> > > + * the next few memory accesses belong to the latch writer.
> > > + */
> > > + kcsan_atomic_next(8);
> > > }
> >
> > Given there are so very few latch users, would it make sense to
> > introduce a raw_write_seqcount_latch_end() callback that does
> > kcsan_atomic_next(0) ? -- or something along those lines? Then you won't
> > have to assume such a small number.
>
> That's something I considered, but thought I'd try the unintrusive
> version first. But since you proposed it here, I'd much prefer that,
> too. ;-)
> Let me try that.
>
> Thanks,
> -- Marco
On Tue, Oct 29, 2024 at 02:46PM +0100, Peter Zijlstra wrote:
> On Tue, Oct 29, 2024 at 02:05:38PM +0100, Marco Elver wrote:
> > On Tue, 29 Oct 2024 at 12:49, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Tue, Oct 29, 2024 at 09:36:29AM +0100, Marco Elver wrote:
> > > > Reviewing current raw_write_seqcount_latch() callers, the most common
> > > > patterns involve only few memory accesses, either a single plain C
> > > > assignment, or memcpy;
> > >
> > > Then I assume you've encountered latch_tree_{insert,erase}() in your
> > > travels, right?
> >
> > Oops. That once certainly exceeds the "8 memory accesses".
> >
> > > Also, I note that update_clock_read_data() seems to do things
> > > 'backwards' and will completely elide your proposed annotation.
> >
> > Hmm, for the first access, yes. This particular oddity could be
> > "fixed" by surrounding the accesses by
> > kcsan_nestable_atomic_begin/end(). I don't know if it warrants adding
> > a raw_write_seqcount_latch_begin().
> >
> > Preferences?
>
> I *think* it is doable to flip it around to the 'normal' order, but
> given I've been near cross-eyed with a head-ache these past two days,
> I'm not going to attempt a patch for you, since I'm bound to get it
> wrong :/
Something like this?
------ >8 ------
Author: Marco Elver <elver@google.com>
Date: Tue Oct 29 21:16:21 2024 +0100
time/sched_clock: Swap update_clock_read_data() latch writes
Swap the writes to the odd and even copies to make the writer critical
section look like all other seqcount_latch writers.
With that, we can also add the raw_write_seqcount_latch_end() to clearly
denote the end of the writer section.
Signed-off-by: Marco Elver <elver@google.com>
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68d6c1190ac7..311c90a0e86e 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -119,9 +119,6 @@ unsigned long long notrace sched_clock(void)
*/
static void update_clock_read_data(struct clock_read_data *rd)
{
- /* update the backup (odd) copy with the new data */
- cd.read_data[1] = *rd;
-
/* steer readers towards the odd copy */
raw_write_seqcount_latch(&cd.seq);
@@ -130,6 +127,11 @@ static void update_clock_read_data(struct clock_read_data *rd)
/* switch readers back to the even copy */
raw_write_seqcount_latch(&cd.seq);
+
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ raw_write_seqcount_latch_end(&cd.seq);
}
/*
------ >8 ------
I also noticed your d16317de9b41 ("seqlock/latch: Provide
raw_read_seqcount_latch_retry()") to get rid of explicit instrumentation
in noinstr.
Not sure how to resolve that. We have that objtool support to erase
calls in noinstr code (is_profiling_func), but that's x86 only.
I could also make kcsan_atomic_next(0) noinstr-compatible by checking if
the ret IP is in noinstr, and immediately returning if it is.
Preferences?
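A rough sketch of that second option (hypothetical only; it assumes the
__noinstr_text_start/__noinstr_text_end section symbols from
<asm-generic/sections.h>, and it is not what was eventually pursued):

	/* Bail out of kcsan_atomic_next() when called from .noinstr.text. */
	static __always_inline bool ip_in_noinstr(unsigned long ip)
	{
		return ip >= (unsigned long)__noinstr_text_start &&
		       ip <  (unsigned long)__noinstr_text_end;
	}

	void kcsan_atomic_next(int n)
	{
		if (ip_in_noinstr(_RET_IP_))
			return;
		/* ... existing implementation ... */
	}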
On Tue, Oct 29, 2024 at 09:49:21PM +0100, Marco Elver wrote:
> Something like this?
>
> ------ >8 ------
>
> Author: Marco Elver <elver@google.com>
> Date: Tue Oct 29 21:16:21 2024 +0100
>
> time/sched_clock: Swap update_clock_read_data() latch writes
>
> Swap the writes to the odd and even copies to make the writer critical
> section look like all other seqcount_latch writers.
>
> With that, we can also add the raw_write_seqcount_latch_end() to clearly
> denote the end of the writer section.
>
> Signed-off-by: Marco Elver <elver@google.com>
>
> diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
> index 68d6c1190ac7..311c90a0e86e 100644
> --- a/kernel/time/sched_clock.c
> +++ b/kernel/time/sched_clock.c
> @@ -119,9 +119,6 @@ unsigned long long notrace sched_clock(void)
> */
> static void update_clock_read_data(struct clock_read_data *rd)
> {
> - /* update the backup (odd) copy with the new data */
> - cd.read_data[1] = *rd;
> -
> /* steer readers towards the odd copy */
> raw_write_seqcount_latch(&cd.seq);
>
> @@ -130,6 +127,11 @@ static void update_clock_read_data(struct clock_read_data *rd)
>
> /* switch readers back to the even copy */
> raw_write_seqcount_latch(&cd.seq);
> +
> + /* update the backup (odd) copy with the new data */
> + cd.read_data[1] = *rd;
> +
> + raw_write_seqcount_latch_end(&cd.seq);
> }
>
> /*
That looks about right :-)
> ------ >8 ------
>
> I also noticed your d16317de9b41 ("seqlock/latch: Provide
> raw_read_seqcount_latch_retry()") to get rid of explicit instrumentation
> in noinstr.
>
> Not sure how to resolve that. We have that objtool support to erase
> calls in noinstr code (is_profiling_func), but that's x86 only.
>
> I could also make kcsan_atomic_next(0) noinstr compatible by checking if
> the ret IP is in noinstr, and immediately return if it is.
>
> Preferences?
Something like this perhaps?
---
arch/x86/kernel/tsc.c | 5 +++--
include/linux/rbtree_latch.h | 14 ++++++++------
include/linux/seqlock.h | 31 ++++++++++++++++++++++++++++++-
kernel/printk/printk.c | 9 +++++----
kernel/time/sched_clock.c | 20 ++++++++++++--------
kernel/time/timekeeping.c | 10 ++++++----
6 files changed, 64 insertions(+), 25 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index dfe6847fd99e..67aeaba4ba9c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -174,10 +174,11 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts
c2n = per_cpu_ptr(&cyc2ns, cpu);
- raw_write_seqcount_latch(&c2n->seq);
+ write_seqcount_latch_begin(&c2n->seq);
c2n->data[0] = data;
- raw_write_seqcount_latch(&c2n->seq);
+ write_seqcount_latch(&c2n->seq);
c2n->data[1] = data;
+ write_seqcount_latch_end(&c2n->seq);
}
static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
index 6a0999c26c7c..bc992c61b7ce 100644
--- a/include/linux/rbtree_latch.h
+++ b/include/linux/rbtree_latch.h
@@ -145,10 +145,11 @@ latch_tree_insert(struct latch_tree_node *node,
struct latch_tree_root *root,
const struct latch_tree_ops *ops)
{
- raw_write_seqcount_latch(&root->seq);
+ write_seqcount_latch_begin(&root->seq);
__lt_insert(node, root, 0, ops->less);
- raw_write_seqcount_latch(&root->seq);
+ write_seqcount_latch(&root->seq);
__lt_insert(node, root, 1, ops->less);
+ write_seqcount_latch_end(&root->seq);
}
/**
@@ -172,10 +173,11 @@ latch_tree_erase(struct latch_tree_node *node,
struct latch_tree_root *root,
const struct latch_tree_ops *ops)
{
- raw_write_seqcount_latch(&root->seq);
+ write_seqcount_latch_begin(&root->seq);
__lt_erase(node, root, 0);
- raw_write_seqcount_latch(&root->seq);
+ write_seqcount_latch(&root->seq);
__lt_erase(node, root, 1);
+ write_seqcount_latch_end(&root->seq);
}
/**
@@ -204,9 +206,9 @@ latch_tree_find(void *key, struct latch_tree_root *root,
unsigned int seq;
do {
- seq = raw_read_seqcount_latch(&root->seq);
+ seq = read_seqcount_latch(&root->seq);
node = __lt_find(key, root, seq & 1, ops->comp);
- } while (raw_read_seqcount_latch_retry(&root->seq, seq));
+ } while (read_seqcount_latch_retry(&root->seq, seq));
return node;
}
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index fffeb754880f..9c2751087185 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -621,6 +621,12 @@ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *
return READ_ONCE(s->seqcount.sequence);
}
+static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s)
+{
+ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
+ return raw_read_seqcount_latch(s);
+}
+
/**
* raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
* @s: Pointer to seqcount_latch_t
@@ -635,6 +641,13 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
return unlikely(READ_ONCE(s->seqcount.sequence) != start);
}
+static __always_inline int
+read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
+{
+ kcsan_atomic_next(0);
+ return raw_read_seqcount_latch_retry(s, start);
+}
+
/**
* raw_write_seqcount_latch() - redirect latch readers to even/odd copy
* @s: Pointer to seqcount_latch_t
@@ -716,13 +729,29 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
* When data is a dynamic data structure; one should use regular RCU
* patterns to manage the lifetimes of the objects within.
*/
-static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
+static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s)
{
smp_wmb(); /* prior stores before incrementing "sequence" */
s->seqcount.sequence++;
smp_wmb(); /* increment "sequence" before following stores */
}
+static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s)
+{
+ kcsan_nestable_atomic_begin();
+ raw_write_seqcount_latch(s);
+}
+
+static __always_inline void write_seqcount_latch(seqcount_latch_t *s)
+{
+ raw_write_seqcount_latch(s);
+}
+
+static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s)
+{
+ kcsan_nestable_atomic_end();
+}
+
#define __SEQLOCK_UNLOCKED(lockname) \
{ \
.seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index beb808f4c367..19911c8fa7b6 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -560,10 +560,11 @@ bool printk_percpu_data_ready(void)
/* Must be called under syslog_lock. */
static void latched_seq_write(struct latched_seq *ls, u64 val)
{
- raw_write_seqcount_latch(&ls->latch);
+ write_seqcount_latch_begin(&ls->latch);
ls->val[0] = val;
- raw_write_seqcount_latch(&ls->latch);
+ write_seqcount_latch(&ls->latch);
ls->val[1] = val;
+ write_seqcount_latch_end(&ls->latch);
}
/* Can be called from any context. */
@@ -574,10 +575,10 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls)
u64 val;
do {
- seq = raw_read_seqcount_latch(&ls->latch);
+ seq = read_seqcount_latch(&ls->latch);
idx = seq & 0x1;
val = ls->val[idx];
- } while (raw_read_seqcount_latch_retry(&ls->latch, seq));
+ } while (read_seqcount_latch_retry(&ls->latch, seq));
return val;
}
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68d6c1190ac7..4958b40ba6c9 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -71,13 +71,13 @@ static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
{
- *seq = raw_read_seqcount_latch(&cd.seq);
+ *seq = read_seqcount_latch(&cd.seq);
return cd.read_data + (*seq & 1);
}
notrace int sched_clock_read_retry(unsigned int seq)
{
- return raw_read_seqcount_latch_retry(&cd.seq, seq);
+ return read_seqcount_latch_retry(&cd.seq, seq);
}
unsigned long long noinstr sched_clock_noinstr(void)
@@ -102,7 +102,9 @@ unsigned long long notrace sched_clock(void)
{
unsigned long long ns;
preempt_disable_notrace();
+ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
ns = sched_clock_noinstr();
+ kcsan_atomic_next(0);
preempt_enable_notrace();
return ns;
}
@@ -119,17 +121,19 @@ unsigned long long notrace sched_clock(void)
*/
static void update_clock_read_data(struct clock_read_data *rd)
{
- /* update the backup (odd) copy with the new data */
- cd.read_data[1] = *rd;
-
/* steer readers towards the odd copy */
- raw_write_seqcount_latch(&cd.seq);
+ write_seqcount_latch_begin(&cd.seq);
/* now its safe for us to update the normal (even) copy */
cd.read_data[0] = *rd;
/* switch readers back to the even copy */
- raw_write_seqcount_latch(&cd.seq);
+ write_seqcount_latch(&cd.seq);
+
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ write_seqcount_latch_end(&cd.seq);
}
/*
@@ -267,7 +271,7 @@ void __init generic_sched_clock_init(void)
*/
static u64 notrace suspended_sched_clock_read(void)
{
- unsigned int seq = raw_read_seqcount_latch(&cd.seq);
+ unsigned int seq = read_seqcount_latch(&cd.seq);
return cd.read_data[seq & 1].epoch_cyc;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409bf311..2ca26bfeb8f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -424,16 +424,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
struct tk_read_base *base = tkf->base;
/* Force readers off to base[1] */
- raw_write_seqcount_latch(&tkf->seq);
+ write_seqcount_latch_begin(&tkf->seq);
/* Update base[0] */
memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
- raw_write_seqcount_latch(&tkf->seq);
+ write_seqcount_latch(&tkf->seq);
/* Update base[1] */
memcpy(base + 1, base, sizeof(*base));
+
+ write_seqcount_latch_end(&tkf->seq);
}
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
@@ -443,11 +445,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
u64 now;
do {
- seq = raw_read_seqcount_latch(&tkf->seq);
+ seq = read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
now = ktime_to_ns(tkr->base);
now += __timekeeping_get_ns(tkr);
- } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
+ } while (read_seqcount_latch_retry(&tkf->seq, seq));
return now;
}
On Wed, Oct 30, 2024 at 09:48:15PM +0100, Peter Zijlstra wrote:
> diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
> index 68d6c1190ac7..4958b40ba6c9 100644
> --- a/kernel/time/sched_clock.c
> +++ b/kernel/time/sched_clock.c
> @@ -102,7 +102,9 @@ unsigned long long notrace sched_clock(void)
> {
> unsigned long long ns;
> preempt_disable_notrace();
> + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
> ns = sched_clock_noinstr();
> + kcsan_atomic_next(0);
> preempt_enable_notrace();
> return ns;
> }
You might want to consider also folding something like this in.
That should give this instrumented version instrumentation :-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 68d6c1190ac7..db26f343233f 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -80,7 +80,7 @@ notrace int sched_clock_read_retry(unsigned int seq)
return raw_read_seqcount_latch_retry(&cd.seq, seq);
}
-unsigned long long noinstr sched_clock_noinstr(void)
+static __always_inline unsigned long long __sched_clock(void)
{
struct clock_read_data *rd;
unsigned int seq;
@@ -98,11 +98,16 @@ unsigned long long noinstr sched_clock_noinstr(void)
return res;
}
+unsigned long long noinstr sched_clock_noinstr(void)
+{
+ return __sched_clock();
+}
+
unsigned long long notrace sched_clock(void)
{
unsigned long long ns;
preempt_disable_notrace();
- ns = sched_clock_noinstr();
+ ns = __sched_clock();
preempt_enable_notrace();
return ns;
}
On Wed, 30 Oct 2024 at 21:48, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Oct 29, 2024 at 09:49:21PM +0100, Marco Elver wrote:
>
> > Something like this?
> >
> > ------ >8 ------
> >
> > Author: Marco Elver <elver@google.com>
> > Date: Tue Oct 29 21:16:21 2024 +0100
> >
> > time/sched_clock: Swap update_clock_read_data() latch writes
> >
> > Swap the writes to the odd and even copies to make the writer critical
> > section look like all other seqcount_latch writers.
> >
> > With that, we can also add the raw_write_seqcount_latch_end() to clearly
> > denote the end of the writer section.
> >
> > Signed-off-by: Marco Elver <elver@google.com>
> >
> > diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
> > index 68d6c1190ac7..311c90a0e86e 100644
> > --- a/kernel/time/sched_clock.c
> > +++ b/kernel/time/sched_clock.c
> > @@ -119,9 +119,6 @@ unsigned long long notrace sched_clock(void)
> > */
> > static void update_clock_read_data(struct clock_read_data *rd)
> > {
> > - /* update the backup (odd) copy with the new data */
> > - cd.read_data[1] = *rd;
> > -
> > /* steer readers towards the odd copy */
> > raw_write_seqcount_latch(&cd.seq);
> >
> > @@ -130,6 +127,11 @@ static void update_clock_read_data(struct clock_read_data *rd)
> >
> > /* switch readers back to the even copy */
> > raw_write_seqcount_latch(&cd.seq);
> > +
> > + /* update the backup (odd) copy with the new data */
> > + cd.read_data[1] = *rd;
> > +
> > + raw_write_seqcount_latch_end(&cd.seq);
> > }
> >
> > /*
>
> That looks about right :-)
>
> > ------ >8 ------
> >
> > I also noticed your d16317de9b41 ("seqlock/latch: Provide
> > raw_read_seqcount_latch_retry()") to get rid of explicit instrumentation
> > in noinstr.
> >
> > Not sure how to resolve that. We have that objtool support to erase
> > calls in noinstr code (is_profiling_func), but that's x86 only.
> >
> > I could also make kcsan_atomic_next(0) noinstr compatible by checking if
> > the ret IP is in noinstr, and immediately return if it is.
> >
> > Preferences?
>
> Something like this perhaps?
Looks good.
Let me try to assemble the pieces into a patch. (Your SOB will be
needed - either now or later.)
Thanks,
-- Marco
> ---
> arch/x86/kernel/tsc.c | 5 +++--
> include/linux/rbtree_latch.h | 14 ++++++++------
> include/linux/seqlock.h | 31 ++++++++++++++++++++++++++++++-
> kernel/printk/printk.c | 9 +++++----
> kernel/time/sched_clock.c | 20 ++++++++++++--------
> kernel/time/timekeeping.c | 10 ++++++----
> 6 files changed, 64 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index dfe6847fd99e..67aeaba4ba9c 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -174,10 +174,11 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts
>
> c2n = per_cpu_ptr(&cyc2ns, cpu);
>
> - raw_write_seqcount_latch(&c2n->seq);
> + write_seqcount_latch_begin(&c2n->seq);
> c2n->data[0] = data;
> - raw_write_seqcount_latch(&c2n->seq);
> + write_seqcount_latch(&c2n->seq);
> c2n->data[1] = data;
> + write_seqcount_latch_end(&c2n->seq);
> }
>
> static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
> diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
> index 6a0999c26c7c..bc992c61b7ce 100644
> --- a/include/linux/rbtree_latch.h
> +++ b/include/linux/rbtree_latch.h
> @@ -145,10 +145,11 @@ latch_tree_insert(struct latch_tree_node *node,
> struct latch_tree_root *root,
> const struct latch_tree_ops *ops)
> {
> - raw_write_seqcount_latch(&root->seq);
> + write_seqcount_latch_begin(&root->seq);
> __lt_insert(node, root, 0, ops->less);
> - raw_write_seqcount_latch(&root->seq);
> + write_seqcount_latch(&root->seq);
> __lt_insert(node, root, 1, ops->less);
> + write_seqcount_latch_end(&root->seq);
> }
>
> /**
> @@ -172,10 +173,11 @@ latch_tree_erase(struct latch_tree_node *node,
> struct latch_tree_root *root,
> const struct latch_tree_ops *ops)
> {
> - raw_write_seqcount_latch(&root->seq);
> + write_seqcount_latch_begin(&root->seq);
> __lt_erase(node, root, 0);
> - raw_write_seqcount_latch(&root->seq);
> + write_seqcount_latch(&root->seq);
> __lt_erase(node, root, 1);
> + write_seqcount_latch_end(&root->seq);
> }
>
> /**
> @@ -204,9 +206,9 @@ latch_tree_find(void *key, struct latch_tree_root *root,
> unsigned int seq;
>
> do {
> - seq = raw_read_seqcount_latch(&root->seq);
> + seq = read_seqcount_latch(&root->seq);
> node = __lt_find(key, root, seq & 1, ops->comp);
> - } while (raw_read_seqcount_latch_retry(&root->seq, seq));
> + } while (read_seqcount_latch_retry(&root->seq, seq));
>
> return node;
> }
> diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
> index fffeb754880f..9c2751087185 100644
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -621,6 +621,12 @@ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *
> return READ_ONCE(s->seqcount.sequence);
> }
>
> +static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s)
> +{
> + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
> + return raw_read_seqcount_latch(s);
> +}
> +
> /**
> * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
> * @s: Pointer to seqcount_latch_t
> @@ -635,6 +641,13 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
> return unlikely(READ_ONCE(s->seqcount.sequence) != start);
> }
>
> +static __always_inline int
> +read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
> +{
> + kcsan_atomic_next(0);
> + return raw_read_seqcount_latch_retry(s, start);
> +}
> +
> /**
> * raw_write_seqcount_latch() - redirect latch readers to even/odd copy
> * @s: Pointer to seqcount_latch_t
> @@ -716,13 +729,29 @@ raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
> * When data is a dynamic data structure; one should use regular RCU
> * patterns to manage the lifetimes of the objects within.
> */
> -static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
> +static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s)
> {
> smp_wmb(); /* prior stores before incrementing "sequence" */
> s->seqcount.sequence++;
> smp_wmb(); /* increment "sequence" before following stores */
> }
>
> +static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s)
> +{
> + kcsan_nestable_atomic_begin();
> + raw_write_seqcount_latch(s);
> +}
> +
> +static __always_inline void write_seqcount_latch(seqcount_latch_t *s)
> +{
> + raw_write_seqcount_latch(s);
> +}
> +
> +static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s)
> +{
> + kcsan_nestable_atomic_end();
> +}
> +
> #define __SEQLOCK_UNLOCKED(lockname) \
> { \
> .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index beb808f4c367..19911c8fa7b6 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -560,10 +560,11 @@ bool printk_percpu_data_ready(void)
> /* Must be called under syslog_lock. */
> static void latched_seq_write(struct latched_seq *ls, u64 val)
> {
> - raw_write_seqcount_latch(&ls->latch);
> + write_seqcount_latch_begin(&ls->latch);
> ls->val[0] = val;
> - raw_write_seqcount_latch(&ls->latch);
> + write_seqcount_latch(&ls->latch);
> ls->val[1] = val;
> + write_seqcount_latch_end(&ls->latch);
> }
>
> /* Can be called from any context. */
> @@ -574,10 +575,10 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls)
> u64 val;
>
> do {
> - seq = raw_read_seqcount_latch(&ls->latch);
> + seq = read_seqcount_latch(&ls->latch);
> idx = seq & 0x1;
> val = ls->val[idx];
> - } while (raw_read_seqcount_latch_retry(&ls->latch, seq));
> + } while (read_seqcount_latch_retry(&ls->latch, seq));
>
> return val;
> }
> diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
> index 68d6c1190ac7..4958b40ba6c9 100644
> --- a/kernel/time/sched_clock.c
> +++ b/kernel/time/sched_clock.c
> @@ -71,13 +71,13 @@ static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
>
> notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
> {
> - *seq = raw_read_seqcount_latch(&cd.seq);
> + *seq = read_seqcount_latch(&cd.seq);
> return cd.read_data + (*seq & 1);
> }
>
> notrace int sched_clock_read_retry(unsigned int seq)
> {
> - return raw_read_seqcount_latch_retry(&cd.seq, seq);
> + return read_seqcount_latch_retry(&cd.seq, seq);
> }
>
> unsigned long long noinstr sched_clock_noinstr(void)
> @@ -102,7 +102,9 @@ unsigned long long notrace sched_clock(void)
> {
> unsigned long long ns;
> preempt_disable_notrace();
> + kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
> ns = sched_clock_noinstr();
> + kcsan_atomic_next(0);
> preempt_enable_notrace();
> return ns;
> }
> @@ -119,17 +121,19 @@ unsigned long long notrace sched_clock(void)
> */
> static void update_clock_read_data(struct clock_read_data *rd)
> {
> - /* update the backup (odd) copy with the new data */
> - cd.read_data[1] = *rd;
> -
> /* steer readers towards the odd copy */
> - raw_write_seqcount_latch(&cd.seq);
> + write_seqcount_latch_begin(&cd.seq);
>
> /* now its safe for us to update the normal (even) copy */
> cd.read_data[0] = *rd;
>
> /* switch readers back to the even copy */
> - raw_write_seqcount_latch(&cd.seq);
> + write_seqcount_latch(&cd.seq);
> +
> + /* update the backup (odd) copy with the new data */
> + cd.read_data[1] = *rd;
> +
> + write_seqcount_latch_end(&cd.seq);
> }
>
> /*
> @@ -267,7 +271,7 @@ void __init generic_sched_clock_init(void)
> */
> static u64 notrace suspended_sched_clock_read(void)
> {
> - unsigned int seq = raw_read_seqcount_latch(&cd.seq);
> + unsigned int seq = read_seqcount_latch(&cd.seq);
>
> return cd.read_data[seq & 1].epoch_cyc;
> }
> diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
> index 7e6f409bf311..2ca26bfeb8f3 100644
> --- a/kernel/time/timekeeping.c
> +++ b/kernel/time/timekeeping.c
> @@ -424,16 +424,18 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
> struct tk_read_base *base = tkf->base;
>
> /* Force readers off to base[1] */
> - raw_write_seqcount_latch(&tkf->seq);
> + write_seqcount_latch_begin(&tkf->seq);
>
> /* Update base[0] */
> memcpy(base, tkr, sizeof(*base));
>
> /* Force readers back to base[0] */
> - raw_write_seqcount_latch(&tkf->seq);
> + write_seqcount_latch(&tkf->seq);
>
> /* Update base[1] */
> memcpy(base + 1, base, sizeof(*base));
> +
> + write_seqcount_latch_end(&tkf->seq);
> }
>
> static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
> @@ -443,11 +445,11 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
> u64 now;
>
> do {
> - seq = raw_read_seqcount_latch(&tkf->seq);
> + seq = read_seqcount_latch(&tkf->seq);
> tkr = tkf->base + (seq & 0x01);
> now = ktime_to_ns(tkr->base);
> now += __timekeeping_get_ns(tkr);
> - } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
> + } while (read_seqcount_latch_retry(&tkf->seq, seq));
>
> return now;
> }
On Thu, Oct 31, 2024 at 08:00:00AM +0100, Marco Elver wrote:
> Looks good.
>
> Let me try to assemble the pieces into a patch. (Your SOB will be
> needed - either now or later.)
Feel free to add:
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>