[PATCH] llist: add [READ|WRITE]_ONCE tags for llist_node
Posted by Shakeel Butt 3 months, 1 week ago
Before commit 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe"),
the struct llist_node was expected to be private to the one inserting
the node into the lockless list or the one removing the node from the
lockless list. After the mentioned commit, the llist_node in the rstat
code is per-cpu and shared between the stacked contexts, i.e. process,
softirq, hardirq & nmi.

KCSAN reported the following race:

 Reported by Kernel Concurrency Sanitizer on:
 CPU: 60 UID: 0 PID: 5425 ... 6.16.0-rc3-next-20250626 #1 NONE
 Tainted: [E]=UNSIGNED_MODULE
 Hardware name: ...
 ==================================================================
 ==================================================================
 BUG: KCSAN: data-race in css_rstat_flush / css_rstat_updated
 write to 0xffffe8fffe1c85f0 of 8 bytes by task 1061 on cpu 1:
  css_rstat_flush+0x1b8/0xeb0
  __mem_cgroup_flush_stats+0x184/0x190
  flush_memcg_stats_dwork+0x22/0x50
  process_one_work+0x335/0x630
  worker_thread+0x5f1/0x8a0
  kthread+0x197/0x340
  ret_from_fork+0xd3/0x110
  ret_from_fork_asm+0x11/0x20
 read to 0xffffe8fffe1c85f0 of 8 bytes by task 3551 on cpu 15:
  css_rstat_updated+0x81/0x180
  mod_memcg_lruvec_state+0x113/0x2d0
  __mod_lruvec_state+0x3d/0x50
  lru_add+0x21e/0x3f0
  folio_batch_move_lru+0x80/0x1b0
  __folio_batch_add_and_move+0xd7/0x160
  folio_add_lru_vma+0x42/0x50
  do_anonymous_page+0x892/0xe90
  __handle_mm_fault+0xfaa/0x1520
  handle_mm_fault+0xdc/0x350
  do_user_addr_fault+0x1dc/0x650
  exc_page_fault+0x5c/0x110
  asm_exc_page_fault+0x22/0x30
 value changed: 0xffffe8fffe18e0d0 -> 0xffffe8fffe1c85f0

$ ./scripts/faddr2line vmlinux css_rstat_flush+0x1b8/0xeb0
css_rstat_flush+0x1b8/0xeb0:
init_llist_node at include/linux/llist.h:86
(inlined by) llist_del_first_init at include/linux/llist.h:308
(inlined by) css_process_update_tree at kernel/cgroup/rstat.c:148
(inlined by) css_rstat_updated_list at kernel/cgroup/rstat.c:258
(inlined by) css_rstat_flush at kernel/cgroup/rstat.c:389

$ ./scripts/faddr2line vmlinux css_rstat_updated+0x81/0x180
css_rstat_updated+0x81/0x180:
css_rstat_updated at kernel/cgroup/rstat.c:90 (discriminator 1)

These are expected races and simple READ_ONCE/WRITE_ONCE annotations
resolve these reports.

Fixes: 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe")
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 include/linux/llist.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/llist.h b/include/linux/llist.h
index 27b17f64bcee..607b2360c938 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -83,7 +83,7 @@ static inline void init_llist_head(struct llist_head *list)
  */
 static inline void init_llist_node(struct llist_node *node)
 {
-	node->next = node;
+	WRITE_ONCE(node->next, node);
 }
 
 /**
@@ -97,7 +97,7 @@ static inline void init_llist_node(struct llist_node *node)
  */
 static inline bool llist_on_list(const struct llist_node *node)
 {
-	return node->next != node;
+	return READ_ONCE(node->next) != node;
 }
 
 /**
@@ -220,7 +220,7 @@ static inline bool llist_empty(const struct llist_head *head)
 
 static inline struct llist_node *llist_next(struct llist_node *node)
 {
-	return node->next;
+	return READ_ONCE(node->next);
 }
 
 /**
-- 
2.47.1
Re: [PATCH] llist: add [READ|WRITE]_ONCE tags for llist_node
Posted by Shakeel Butt 3 months, 1 week ago
On Thu, Jun 26, 2025 at 12:05:50PM -0700, Shakeel Butt wrote:
> Before the commit 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi
> safe"), the struct llist_node is expected to be private to the one
> inserting the node to the lockless list or the one removing the node
> from the lockless list. After the mentioned commit, the llist_node in
> the rstat code is per-cpu shared between the stacked contexts i.e.
> process, softirq, hardirq & nmi.
> 
> KCSAN reported the following race:
> 
>  Reported by Kernel Concurrency Sanitizer on:
>  CPU: 60 UID: 0 PID: 5425 ... 6.16.0-rc3-next-20250626 #1 NONE
>  Tainted: [E]=UNSIGNED_MODULE
>  Hardware name: ...
>  ==================================================================
>  ==================================================================
>  BUG: KCSAN: data-race in css_rstat_flush / css_rstat_updated
>  write to 0xffffe8fffe1c85f0 of 8 bytes by task 1061 on cpu 1:
>   css_rstat_flush+0x1b8/0xeb0
>   __mem_cgroup_flush_stats+0x184/0x190
>   flush_memcg_stats_dwork+0x22/0x50
>   process_one_work+0x335/0x630
>   worker_thread+0x5f1/0x8a0
>   kthread+0x197/0x340
>   ret_from_fork+0xd3/0x110
>   ret_from_fork_asm+0x11/0x20
>  read to 0xffffe8fffe1c85f0 of 8 bytes by task 3551 on cpu 15:
>   css_rstat_updated+0x81/0x180
>   mod_memcg_lruvec_state+0x113/0x2d0
>   __mod_lruvec_state+0x3d/0x50
>   lru_add+0x21e/0x3f0
>   folio_batch_move_lru+0x80/0x1b0
>   __folio_batch_add_and_move+0xd7/0x160
>   folio_add_lru_vma+0x42/0x50
>   do_anonymous_page+0x892/0xe90
>   __handle_mm_fault+0xfaa/0x1520
>   handle_mm_fault+0xdc/0x350
>   do_user_addr_fault+0x1dc/0x650
>   exc_page_fault+0x5c/0x110
>   asm_exc_page_fault+0x22/0x30
>  value changed: 0xffffe8fffe18e0d0 -> 0xffffe8fffe1c85f0
> 
> $ ./scripts/faddr2line vmlinux css_rstat_flush+0x1b8/0xeb0
> css_rstat_flush+0x1b8/0xeb0:
> init_llist_node at include/linux/llist.h:86
> (inlined by) llist_del_first_init at include/linux/llist.h:308
> (inlined by) css_process_update_tree at kernel/cgroup/rstat.c:148
> (inlined by) css_rstat_updated_list at kernel/cgroup/rstat.c:258
> (inlined by) css_rstat_flush at kernel/cgroup/rstat.c:389
> 
> $ ./scripts/faddr2line vmlinux css_rstat_updated+0x81/0x180
> css_rstat_updated+0x81/0x180:
> css_rstat_updated at kernel/cgroup/rstat.c:90 (discriminator 1)
> 
> These are expected race and a simple READ_ONCE/WRITE_ONCE resolves these
> reports.

Tejun privately communicated that though the race is benign, we should
document it better instead of just silencing kcsan.

More specifically, the llist_on_list() check on the update side and the
init_llist_node() reset on the flush side need to coordinate to
guarantee that either the updater gets false from llist_on_list() and
adds the node to the lockless list, or the flush side gets the updated
stats from the concurrent updaters.

To guarantee that, on the update side we need a barrier between the
stats update and the llist_on_list() check, and on the flush side a
barrier between init_llist_node() and the actual stats flush.
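
For illustration, a rough sketch of where such barriers would sit (not
a proposed patch; the stats update and flush themselves are elided and
the placement only shows the intended pairing):

	/* update side, e.g. css_rstat_updated() */
	/* ... the per-cpu stats have just been updated ... */
	smp_mb();	/* order the stats update before the on-list check */
	if (llist_on_list(&rstatc->lnode))
		return;	/* a concurrent flusher must see the update above */

	/* flush side, e.g. css_process_update_tree() */
	while ((lnode = llist_del_first_init(lhead))) {
		smp_mb();	/* order the lnode reset before reading the stats */
		/* ... read and flush the per-cpu stats for this css ... */
	}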

However, do we really need such a guarantee, and can we be fine with
the race? In particular, the update side is a very performance-critical
path and adding a barrier might be very costly. I think this race is
benign and we can just document it and then ignore it.

Tejun, is something like the following acceptable? I know you mentioned
adding the barrier on the flush side, but I am wondering: if we are not
adding a barrier on the update side, what's the point of adding it on
the flush side? Let me know if you still prefer to add the barrier on
the flush side.

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c8a48cf83878..02258b43abb3 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -86,8 +86,12 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 		return;
 
 	rstatc = css_rstat_cpu(css, cpu);
-	/* If already on list return. */
-	if (llist_on_list(&rstatc->lnode))
+	/*
+	 * If already on list return.
+	 *
+	 * TODO: add detailed comment on the potential race.
+	 */
+	if (data_race(llist_on_list(&rstatc->lnode)))
 		return;
 
 	/*
@@ -145,9 +149,13 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
 	struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
 	struct llist_node *lnode;
 
-	while ((lnode = llist_del_first_init(lhead))) {
+	while ((lnode = data_race(llist_del_first_init(lhead)))) {
 		struct css_rstat_cpu *rstatc;
 
+		/*
+		 * TODO: add detailed comment and maybe smp_mb().
+		 */
+
 		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
 		__css_process_update_tree(rstatc->owner, cpu);
 	}
Re: [PATCH] llist: add [READ|WRITE]_ONCE tags for llist_node
Posted by Tejun Heo 3 months, 1 week ago
Hello,

On Thu, Jun 26, 2025 at 04:38:18PM -0700, Shakeel Butt wrote:
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index c8a48cf83878..02258b43abb3 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -86,8 +86,12 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
>  		return;
>  
>  	rstatc = css_rstat_cpu(css, cpu);
> -	/* If already on list return. */
> -	if (llist_on_list(&rstatc->lnode))
> +	/*
> +	 * If already on list return.
> +	 *
> +	 * TODO: add detailed comment on the potential race.
> +	 */
> +	if (data_race(llist_on_list(&rstatc->lnode)))
>  		return;

Yeah, also in the function comment, explain that if avoiding such races is
important enough to justify the overhead, the caller can put smp_mb() before
calling css_rstat_updated().
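
For example, a caller that needs that ordering might do something like
the following (illustrative only; the stats update is a placeholder for
whatever the caller does before reporting the update):

	/* ... update the per-cpu stats ... */
	smp_mb();	/* make the update visible before the on-list check */
	css_rstat_updated(css, cpu);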

Thanks.

-- 
tejun