[PATCH 3/4] kcsan, debugfs: fix atomic sleep by converting spinlock_t to rcu lock

Posted by Ran Xiaokai 2 months ago
From: Ran Xiaokai <ran.xiaokai@zte.com.cn>

On a PREEMPT_RT kernel, most irq handlers are run in threaded mode,
except those that have the IRQF_NO_THREAD flag set; the hrtimer irq
is one such example. A KCSAN report can therefore be triggered from
hard-irq context, where taking the sleeping report_filterlist_lock
triggers a "sleeping function called from invalid context" bug:

[    C1] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
[    C1] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/1
[    C1] preempt_count: 10002, expected: 0
[    C1] RCU nest depth: 0, expected: 0
[    C1] no locks held by swapper/1/0.
[    C1] irq event stamp: 156674
[    C1] hardirqs last  enabled at (156673): [<ffffffff81130bd9>] do_idle+0x1f9/0x240
[    C1] hardirqs last disabled at (156674): [<ffffffff82254f84>] sysvec_apic_timer_interrupt+0x14/0xc0
[    C1] softirqs last  enabled at (0): [<ffffffff81099f47>] copy_process+0xfc7/0x4b60
[    C1] softirqs last disabled at (0): [<0000000000000000>] 0x0
[    C1] Preemption disabled at:
[    C1] [<ffffffff814a3e2a>] paint_ptr+0x2a/0x90
[    C1] CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Not tainted 6.11.0+ #3
[    C1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014
[    C1] Call Trace:
[    C1]  <IRQ>
[    C1]  dump_stack_lvl+0x7e/0xc0
[    C1]  dump_stack+0x1d/0x30
[    C1]  __might_resched+0x1a2/0x270
[    C1]  rt_spin_lock+0x68/0x170
[    C1]  ? kcsan_skip_report_debugfs+0x43/0xe0
[    C1]  kcsan_skip_report_debugfs+0x43/0xe0
[    C1]  ? hrtimer_next_event_without+0x110/0x110
[    C1]  print_report+0xb5/0x590
[    C1]  kcsan_report_known_origin+0x1b1/0x1d0
[    C1]  kcsan_setup_watchpoint+0x348/0x650
[    C1]  __tsan_unaligned_write1+0x16d/0x1d0
[    C1]  hrtimer_interrupt+0x3d6/0x430
[    C1]  __sysvec_apic_timer_interrupt+0xe8/0x3a0
[    C1]  sysvec_apic_timer_interrupt+0x97/0xc0
[    C1]  </IRQ>

This cannot be fixed by simply converting report_filterlist_lock to
a raw_spinlock_t, because of the allocation in the
insert_report_filterlist() path:

raw_spin_lock_irqsave(&report_filterlist_lock, flags);
  krealloc
    __do_kmalloc_node
      slab_alloc_node
        __slab_alloc
          local_lock_irqsave(&s->cpu_slab->lock, flags)

On a PREEMPT_RT kernel, local_lock_t is implemented as a (sleeping)
spinlock_t, so kmalloc() and similar functions cannot be called with
a raw_spinlock_t held.
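
For illustration, the raw_spinlock_t variant would look like this
sketch (not part of this patch), which is invalid on PREEMPT_RT even
with GFP_ATOMIC, since the allocator may sleep internally:

  raw_spin_lock_irqsave(&report_filterlist_lock, flags);
  /* invalid: the allocator may take a sleeping local_lock_t */
  new_addrs = krealloc(report_filterlist.addrs,
                       new_size * sizeof(unsigned long), GFP_ATOMIC);
  raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);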

Instead, convert the filter list to RCU to fix this. Also introduce
a mutex to serialize user-space write operations.

Signed-off-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
---
 kernel/kcsan/debugfs.c | 172 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 113 insertions(+), 59 deletions(-)

diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 30547507f497..d5e624c37125 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -40,20 +40,17 @@ static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT);
  * Addresses for filtering functions from reporting. This list can be used as a
  * whitelist or blacklist.
  */
-static struct {
+struct report_filterlist {
 	unsigned long	*addrs;		/* array of addresses */
 	size_t		size;		/* current size */
 	int		used;		/* number of elements used */
 	bool		sorted;		/* if elements are sorted */
 	bool		whitelist;	/* if list is a blacklist or whitelist */
-} report_filterlist = {
-	.addrs		= NULL,
-	.size		= 8,		/* small initial size */
-	.used		= 0,
-	.sorted		= false,
-	.whitelist	= false,	/* default is blacklist */
+	struct rcu_head rcu;
 };
-static DEFINE_SPINLOCK(report_filterlist_lock);
+
+static DEFINE_MUTEX(rp_flist_mutex);
+static struct report_filterlist __rcu *rp_flist;
 
 /*
  * The microbenchmark allows benchmarking KCSAN core runtime only. To run
@@ -103,98 +100,141 @@ static int cmp_filterlist_addrs(const void *rhs, const void *lhs)
 bool kcsan_skip_report_debugfs(unsigned long func_addr)
 {
 	unsigned long symbolsize, offset;
-	unsigned long flags;
 	bool ret = false;
+	struct report_filterlist *list;
 
 	if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset))
 		return false;
 	func_addr -= offset; /* Get function start */
 
-	spin_lock_irqsave(&report_filterlist_lock, flags);
-	if (report_filterlist.used == 0)
+	rcu_read_lock();
+	list = rcu_dereference(rp_flist);
+
+	if (!list)
+		goto out;
+
+	if (list->used == 0)
 		goto out;
 
 	/* Sort array if it is unsorted, and then do a binary search. */
-	if (!report_filterlist.sorted) {
-		sort(report_filterlist.addrs, report_filterlist.used,
+	if (!list->sorted) {
+		sort(list->addrs, list->used,
 		     sizeof(unsigned long), cmp_filterlist_addrs, NULL);
-		report_filterlist.sorted = true;
+		list->sorted = true;
 	}
-	ret = !!bsearch(&func_addr, report_filterlist.addrs,
-			report_filterlist.used, sizeof(unsigned long),
+	ret = !!bsearch(&func_addr, list->addrs,
+			list->used, sizeof(unsigned long),
 			cmp_filterlist_addrs);
-	if (report_filterlist.whitelist)
+	if (list->whitelist)
 		ret = !ret;
 
 out:
-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
+	rcu_read_unlock();
 	return ret;
 }
 
 static ssize_t set_report_filterlist_whitelist(bool whitelist)
 {
-	unsigned long flags;
+	struct report_filterlist *new_list, *old_list;
+	ssize_t ret = 0;
 
-	spin_lock_irqsave(&report_filterlist_lock, flags);
-	report_filterlist.whitelist = whitelist;
-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
-	return 0;
+	mutex_lock(&rp_flist_mutex);
+	old_list = rcu_dereference_protected(rp_flist,
+					   lockdep_is_held(&rp_flist_mutex));
+
+	new_list = kzalloc(sizeof(*new_list), GFP_KERNEL);
+	if (!new_list) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	memcpy(new_list, old_list, sizeof(struct report_filterlist));
+	new_list->whitelist = whitelist;
+
+	rcu_assign_pointer(rp_flist, new_list);
+	synchronize_rcu();
+	kfree(old_list);
+
+out:
+	mutex_unlock(&rp_flist_mutex);
+	return ret;
 }
 
 /* Returns 0 on success, error-code otherwise. */
 static ssize_t insert_report_filterlist(const char *func)
 {
-	unsigned long flags;
 	unsigned long addr = kallsyms_lookup_name(func);
 	ssize_t ret = 0;
+	struct report_filterlist *new_list, *old_list;
+	unsigned long *new_addrs;
 
 	if (!addr) {
 		pr_err("could not find function: '%s'\n", func);
 		return -ENOENT;
 	}
 
-	spin_lock_irqsave(&report_filterlist_lock, flags);
+	new_list = kzalloc(sizeof(*new_list), GFP_KERNEL);
+	if (!new_list)
+		return -ENOMEM;
+
+	mutex_lock(&rp_flist_mutex);
+	old_list = rcu_dereference_protected(rp_flist,
+					   lockdep_is_held(&rp_flist_mutex));
+	memcpy(new_list, old_list, sizeof(struct report_filterlist));
 
-	if (report_filterlist.addrs == NULL) {
+	if (new_list->addrs == NULL) {
 		/* initial allocation */
-		report_filterlist.addrs =
-			kmalloc_array(report_filterlist.size,
-				      sizeof(unsigned long), GFP_ATOMIC);
-		if (report_filterlist.addrs == NULL) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	} else if (report_filterlist.used == report_filterlist.size) {
+		new_list->addrs =
+			kmalloc_array(new_list->size,
+					  sizeof(unsigned long), GFP_KERNEL);
+		if (new_list->addrs == NULL)
+			goto out_free;
+	} else if (new_list->used == new_list->size) {
 		/* resize filterlist */
-		size_t new_size = report_filterlist.size * 2;
-		unsigned long *new_addrs =
-			krealloc(report_filterlist.addrs,
-				 new_size * sizeof(unsigned long), GFP_ATOMIC);
-
-		if (new_addrs == NULL) {
-			/* leave filterlist itself untouched */
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		report_filterlist.size = new_size;
-		report_filterlist.addrs = new_addrs;
+		size_t new_size = new_list->size * 2;
+
+		new_addrs = kmalloc_array(new_size,
+					  sizeof(unsigned long), GFP_KERNEL);
+		if (new_addrs == NULL)
+			goto out_free;
+
+		memcpy(new_addrs, old_list->addrs,
+				old_list->size * sizeof(unsigned long));
+		new_list->size = new_size;
+		new_list->addrs = new_addrs;
+	} else {
+		new_addrs = kmalloc_array(new_list->size,
+					  sizeof(unsigned long), GFP_KERNEL);
+		if (new_addrs == NULL)
+			goto out_free;
+
+		memcpy(new_addrs, old_list->addrs,
+				old_list->size * sizeof(unsigned long));
+		new_list->addrs = new_addrs;
 	}
 
 	/* Note: deduplicating should be done in userspace. */
-	report_filterlist.addrs[report_filterlist.used++] = addr;
-	report_filterlist.sorted = false;
-
+	new_list->addrs[new_list->used++] = addr;
+	new_list->sorted = false;
+
+	rcu_assign_pointer(rp_flist, new_list);
+	synchronize_rcu();
+	kfree(old_list->addrs);
+	kfree(old_list);
+	goto out;
+
+out_free:
+	ret = -ENOMEM;
+	kfree(new_list);
 out:
-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
-
+	mutex_unlock(&rp_flist_mutex);
 	return ret;
 }
 
 static int show_info(struct seq_file *file, void *v)
 {
 	int i;
-	unsigned long flags;
+	struct report_filterlist *list;
 
 	/* show stats */
 	seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled));
@@ -203,14 +243,15 @@ static int show_info(struct seq_file *file, void *v)
 			   atomic_long_read(&kcsan_counters[i]));
 	}
 
+	rcu_read_lock();
+	list = rcu_dereference(rp_flist);
 	/* show filter functions, and filter type */
-	spin_lock_irqsave(&report_filterlist_lock, flags);
 	seq_printf(file, "\n%s functions: %s\n",
-		   report_filterlist.whitelist ? "whitelisted" : "blacklisted",
-		   report_filterlist.used == 0 ? "none" : "");
-	for (i = 0; i < report_filterlist.used; ++i)
-		seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]);
-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
+		   list->whitelist ? "whitelisted" : "blacklisted",
+		   list->used == 0 ? "none" : "");
+	for (i = 0; i < list->used; ++i)
+		seq_printf(file, " %ps\n", (void *)list->addrs[i]);
+	rcu_read_unlock();
 
 	return 0;
 }
@@ -269,6 +310,19 @@ static const struct file_operations debugfs_ops =
 
 static int __init kcsan_debugfs_init(void)
 {
+	struct report_filterlist *list;
+
+	list = kzalloc(sizeof(*list), GFP_KERNEL);
+	if (!list)
+		return -ENOMEM;
+
+	list->addrs		= NULL;
+	list->size		= 8;		/* small initial size */
+	list->used		= 0;
+	list->sorted	= false;
+	list->whitelist	= false;	/* default is blacklist */
+	rcu_assign_pointer(rp_flist, list);
+
 	debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops);
 	return 0;
 }
-- 
2.15.2
Re: [PATCH 3/4] kcsan, debugfs: fix atomic sleep by converting spinlock_t to rcu lock
Posted by Marco Elver 1 month, 4 weeks ago
On Wed, Sep 25, 2024 at 02:31PM +0000, ran xiaokai wrote:
> From: Ran Xiaokai <ran.xiaokai@zte.com.cn>
> 
> On a PREEMPT_RT kernel, most irq handlers are run in threaded mode,
> except those that have the IRQF_NO_THREAD flag set; the hrtimer irq
> is one such example. A KCSAN report can therefore be triggered from
> hard-irq context, where taking the sleeping report_filterlist_lock
> triggers a "sleeping function called from invalid context" bug:
> 
> [    C1] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
> [    C1] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/1
> [    C1] preempt_count: 10002, expected: 0
> [    C1] RCU nest depth: 0, expected: 0
> [    C1] no locks held by swapper/1/0.
> [    C1] irq event stamp: 156674
> [    C1] hardirqs last  enabled at (156673): [<ffffffff81130bd9>] do_idle+0x1f9/0x240
> [    C1] hardirqs last disabled at (156674): [<ffffffff82254f84>] sysvec_apic_timer_interrupt+0x14/0xc0
> [    C1] softirqs last  enabled at (0): [<ffffffff81099f47>] copy_process+0xfc7/0x4b60
> [    C1] softirqs last disabled at (0): [<0000000000000000>] 0x0
> [    C1] Preemption disabled at:
> [    C1] [<ffffffff814a3e2a>] paint_ptr+0x2a/0x90
> [    C1] CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Not tainted 6.11.0+ #3
> [    C1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014
> [    C1] Call Trace:
> [    C1]  <IRQ>
> [    C1]  dump_stack_lvl+0x7e/0xc0
> [    C1]  dump_stack+0x1d/0x30
> [    C1]  __might_resched+0x1a2/0x270
> [    C1]  rt_spin_lock+0x68/0x170
> [    C1]  ? kcsan_skip_report_debugfs+0x43/0xe0
> [    C1]  kcsan_skip_report_debugfs+0x43/0xe0
> [    C1]  ? hrtimer_next_event_without+0x110/0x110
> [    C1]  print_report+0xb5/0x590
> [    C1]  kcsan_report_known_origin+0x1b1/0x1d0
> [    C1]  kcsan_setup_watchpoint+0x348/0x650
> [    C1]  __tsan_unaligned_write1+0x16d/0x1d0
> [    C1]  hrtimer_interrupt+0x3d6/0x430
> [    C1]  __sysvec_apic_timer_interrupt+0xe8/0x3a0
> [    C1]  sysvec_apic_timer_interrupt+0x97/0xc0
> [    C1]  </IRQ>
> 
> This cannot be fixed by simply converting report_filterlist_lock to
> a raw_spinlock_t, because of the allocation in the
> insert_report_filterlist() path:
> 
> raw_spin_lock_irqsave(&report_filterlist_lock, flags);
>   krealloc
>     __do_kmalloc_node
>       slab_alloc_node
>         __slab_alloc
>           local_lock_irqsave(&s->cpu_slab->lock, flags)
> 
> On a PREEMPT_RT kernel, local_lock_t is implemented as a (sleeping)
> spinlock_t, so kmalloc() and similar functions cannot be called with
> a raw_spinlock_t held.
> 
> Instead, convert the filter list to RCU to fix this. Also introduce
> a mutex to serialize user-space write operations.
> 
> Signed-off-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
[...]
> -	spin_lock_irqsave(&report_filterlist_lock, flags);
> -	if (report_filterlist.used == 0)
> +	rcu_read_lock();
> +	list = rcu_dereference(rp_flist);
> +
> +	if (!list)
> +		goto out;
> +
> +	if (list->used == 0)
>  		goto out;
>  
>  	/* Sort array if it is unsorted, and then do a binary search. */
> -	if (!report_filterlist.sorted) {
> -		sort(report_filterlist.addrs, report_filterlist.used,
> +	if (!list->sorted) {
> +		sort(list->addrs, list->used,
>  		     sizeof(unsigned long), cmp_filterlist_addrs, NULL);
> -		report_filterlist.sorted = true;
> +		list->sorted = true;
>  	}

This used to be under the report_filterlist_lock, but now there's no
protection against this happening concurrently.

Sure, at the moment, this is not a problem, because this function is
only called under the report_lock which serializes it. Is that intended?
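
If it is intended, it may be worth documenting, e.g. with a lockdep
annotation; a sketch (assuming report_lock from the reporting code
were made visible to this file):

	/* The lazy sort relies on report_lock serializing callers. */
	lockdep_assert_held(&report_lock);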

> -	ret = !!bsearch(&func_addr, report_filterlist.addrs,
> -			report_filterlist.used, sizeof(unsigned long),
> +	ret = !!bsearch(&func_addr, list->addrs,
> +			list->used, sizeof(unsigned long),
>  			cmp_filterlist_addrs);
> -	if (report_filterlist.whitelist)
> +	if (list->whitelist)
>  		ret = !ret;
[...]
> +
> +	memcpy(new_list, old_list, sizeof(struct report_filterlist));
> +	new_list->whitelist = whitelist;
> +
> +	rcu_assign_pointer(rp_flist, new_list);
> +	synchronize_rcu();
> +	kfree(old_list);

Why not kfree_rcu()?
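
E.g., a sketch using the rcu_head this patch already adds to the
struct, which also avoids blocking in synchronize_rcu():

	rcu_assign_pointer(rp_flist, new_list);
	kfree_rcu(old_list, rcu);	/* freed after a grace period */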

> +out:
> +	mutex_unlock(&rp_flist_mutex);
> +	return ret;
>  }
[...]
> +	} else {
> +		new_addrs = kmalloc_array(new_list->size,
> +					  sizeof(unsigned long), GFP_KERNEL);
> +		if (new_addrs == NULL)
> +			goto out_free;
> +
> +		memcpy(new_addrs, old_list->addrs,
> +				old_list->size * sizeof(unsigned long));
> +		new_list->addrs = new_addrs;
>  	}

Wait, for every insertion it ends up copying the list now? That's very
wasteful.

In general, this solution seems overly complex, esp. the part where it
ends up copying the whole list on _every_ insertion.

If the whole point is to avoid kmalloc() under the lock, we can do
something much simpler.

Please test the patch below - it's much simpler, and in the common case
I expect it to rarely throw away the preemptive allocation done outside
the critical section, because concurrent insertions by the user should
be rare.

Thanks,
-- Marco

------ >8 ------

From: Marco Elver <elver@google.com>
Date: Tue, 1 Oct 2024 16:00:45 +0200
Subject: [PATCH] kcsan: turn report_filterlist_lock into a raw_spinlock

<tbd... please test>

Reported-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
Signed-off-by: Marco Elver <elver@google.com>
---
 kernel/kcsan/debugfs.c | 76 +++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 1d1d1b0e4248..5ffb6cc5298b 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -46,14 +46,8 @@ static struct {
 	int		used;		/* number of elements used */
 	bool		sorted;		/* if elements are sorted */
 	bool		whitelist;	/* if list is a blacklist or whitelist */
-} report_filterlist = {
-	.addrs		= NULL,
-	.size		= 8,		/* small initial size */
-	.used		= 0,
-	.sorted		= false,
-	.whitelist	= false,	/* default is blacklist */
-};
-static DEFINE_SPINLOCK(report_filterlist_lock);
+} report_filterlist;
+static DEFINE_RAW_SPINLOCK(report_filterlist_lock);
 
 /*
  * The microbenchmark allows benchmarking KCSAN core runtime only. To run
@@ -110,7 +104,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr)
 		return false;
 	func_addr -= offset; /* Get function start */
 
-	spin_lock_irqsave(&report_filterlist_lock, flags);
+	raw_spin_lock_irqsave(&report_filterlist_lock, flags);
 	if (report_filterlist.used == 0)
 		goto out;
 
@@ -127,7 +121,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr)
 		ret = !ret;
 
 out:
-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
+	raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
 	return ret;
 }
 
@@ -135,9 +129,9 @@ static void set_report_filterlist_whitelist(bool whitelist)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&report_filterlist_lock, flags);
+	raw_spin_lock_irqsave(&report_filterlist_lock, flags);
 	report_filterlist.whitelist = whitelist;
-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
+	raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
 }
 
 /* Returns 0 on success, error-code otherwise. */
@@ -145,6 +139,9 @@ static ssize_t insert_report_filterlist(const char *func)
 {
 	unsigned long flags;
 	unsigned long addr = kallsyms_lookup_name(func);
+	unsigned long *delay_free = NULL;
+	unsigned long *new_addrs = NULL;
+	size_t new_size = 0;
 	ssize_t ret = 0;
 
 	if (!addr) {
@@ -152,32 +149,33 @@ static ssize_t insert_report_filterlist(const char *func)
 		return -ENOENT;
 	}
 
-	spin_lock_irqsave(&report_filterlist_lock, flags);
+retry_alloc:
+	/*
+	 * Check if we need an allocation, and re-validate under the lock. Since
+	 * report_filterlist_lock is a raw spinlock, we cannot allocate under it.
+	 */
+	if (data_race(report_filterlist.used == report_filterlist.size)) {
+		new_size = (report_filterlist.size ?: 4) * 2;
+		delay_free = new_addrs = kmalloc_array(new_size, sizeof(unsigned long), GFP_KERNEL);
+		if (!new_addrs)
+			return -ENOMEM;
+	}
 
-	if (report_filterlist.addrs == NULL) {
-		/* initial allocation */
-		report_filterlist.addrs =
-			kmalloc_array(report_filterlist.size,
-				      sizeof(unsigned long), GFP_ATOMIC);
-		if (report_filterlist.addrs == NULL) {
-			ret = -ENOMEM;
-			goto out;
-		}
-	} else if (report_filterlist.used == report_filterlist.size) {
-		/* resize filterlist */
-		size_t new_size = report_filterlist.size * 2;
-		unsigned long *new_addrs =
-			krealloc(report_filterlist.addrs,
-				 new_size * sizeof(unsigned long), GFP_ATOMIC);
-
-		if (new_addrs == NULL) {
-			/* leave filterlist itself untouched */
-			ret = -ENOMEM;
-			goto out;
+	raw_spin_lock_irqsave(&report_filterlist_lock, flags);
+	if (report_filterlist.used == report_filterlist.size) {
+		/* Check we pre-allocated enough, and retry if not. */
+		if (report_filterlist.used >= new_size) {
+			raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+			kfree(new_addrs); /* kfree(NULL) is safe */
+			delay_free = new_addrs = NULL;
+			goto retry_alloc;
 		}
 
+		if (report_filterlist.used)
+			memcpy(new_addrs, report_filterlist.addrs, report_filterlist.used * sizeof(unsigned long));
+		delay_free = report_filterlist.addrs; /* free the old list */
+		report_filterlist.addrs = new_addrs;  /* switch to the new list */
 		report_filterlist.size = new_size;
-		report_filterlist.addrs = new_addrs;
 	}
 
 	/* Note: deduplicating should be done in userspace. */
@@ -185,8 +183,10 @@ static ssize_t insert_report_filterlist(const char *func)
 		kallsyms_lookup_name(func);
 	report_filterlist.sorted = false;
 
-out:
-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
+	raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
+
+	if (delay_free)
+		kfree(delay_free);
 
 	return ret;
 }
@@ -204,13 +204,13 @@ static int show_info(struct seq_file *file, void *v)
 	}
 
 	/* show filter functions, and filter type */
-	spin_lock_irqsave(&report_filterlist_lock, flags);
+	raw_spin_lock_irqsave(&report_filterlist_lock, flags);
 	seq_printf(file, "\n%s functions: %s\n",
 		   report_filterlist.whitelist ? "whitelisted" : "blacklisted",
 		   report_filterlist.used == 0 ? "none" : "");
 	for (i = 0; i < report_filterlist.used; ++i)
 		seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]);
-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
+	raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
 
 	return 0;
 }
-- 
2.46.1.824.gd892dcdcdd-goog
Re: [PATCH 3/4] kcsan, debugfs: fix atomic sleep by converting spinlock_t to rcu lock
Posted by Ran Xiaokai 1 month, 3 weeks ago
>> -	spin_lock_irqsave(&report_filterlist_lock, flags);
>> -	if (report_filterlist.used == 0)
>> +	rcu_read_lock();
>> +	list = rcu_dereference(rp_flist);
>> +
>> +	if (!list)
>> +		goto out;
>> +
>> +	if (list->used == 0)
>>  		goto out;
>>  
>>  	/* Sort array if it is unsorted, and then do a binary search. */
>> -	if (!report_filterlist.sorted) {
>> -		sort(report_filterlist.addrs, report_filterlist.used,
>> +	if (!list->sorted) {
>> +		sort(list->addrs, list->used,
>>  		     sizeof(unsigned long), cmp_filterlist_addrs, NULL);
>> -		report_filterlist.sorted = true;
>> +		list->sorted = true;
>>  	}
>
>This used to be under the report_filterlist_lock, but now there's no
>protection against this happening concurrently.
>
>Sure, at the moment, this is not a problem, because this function is
>only called under the report_lock which serializes it. Is that intended?
>
>> -	ret = !!bsearch(&func_addr, report_filterlist.addrs,
>> -			report_filterlist.used, sizeof(unsigned long),
>> +	ret = !!bsearch(&func_addr, list->addrs,
>> +			list->used, sizeof(unsigned long),
>>  			cmp_filterlist_addrs);
>> -	if (report_filterlist.whitelist)
>> +	if (list->whitelist)
>>  		ret = !ret;
>[...]
>> +
>> +	memcpy(new_list, old_list, sizeof(struct report_filterlist));
>> +	new_list->whitelist = whitelist;
>> +
>> +	rcu_assign_pointer(rp_flist, new_list);
>> +	synchronize_rcu();
>> +	kfree(old_list);
>
>Why not kfree_rcu()?
>
>> +out:
>> +	mutex_unlock(&rp_flist_mutex);
>> +	return ret;
>>  }
>[...]
>> +	} else {
>> +		new_addrs = kmalloc_array(new_list->size,
>> +					  sizeof(unsigned long), GFP_KERNEL);
>> +		if (new_addrs == NULL)
>> +			goto out_free;
>> +
>> +		memcpy(new_addrs, old_list->addrs,
>> +				old_list->size * sizeof(unsigned long));
>> +		new_list->addrs = new_addrs;
>>  	}
>
>Wait, for every insertion it ends up copying the list now? That's very
>wasteful.
>
>In general, this solution seems overly complex, esp. the part where it
>ends up copying the whole list on _every_ insertion.
>
>If the whole point is to avoid kmalloc() under the lock, we can do
>something much simpler.
>
>Please test the patch below - it's much simpler, and in the common case
>I expect it to rarely throw away the preemptive allocation done outside
>the critical section, because concurrent insertions by the user should
>be rare.

I have tested this, and it works.
Yes, this patch is much simpler.
Another reason I wanted to convert the spinlock to RCU was to reduce
irq latency when kcsan_skip_report_debugfs() is called from hard-irq
context, but as you said, insertions by the user should not be a
frequent operation, so this should not be a problem.

>Thanks,
>-- Marco
>
>------ >8 ------
>
>From: Marco Elver <elver@google.com>
>Date: Tue, 1 Oct 2024 16:00:45 +0200
>Subject: [PATCH] kcsan: turn report_filterlist_lock into a raw_spinlock
>
><tbd... please test>
>
>Reported-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
>Signed-off-by: Marco Elver <elver@google.com>
>---
> kernel/kcsan/debugfs.c | 76 +++++++++++++++++++++---------------------
> 1 file changed, 38 insertions(+), 38 deletions(-)
>
>diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
>index 1d1d1b0e4248..5ffb6cc5298b 100644
>--- a/kernel/kcsan/debugfs.c
>+++ b/kernel/kcsan/debugfs.c
>@@ -46,14 +46,8 @@ static struct {
> 	int		used;		/* number of elements used */
> 	bool		sorted;		/* if elements are sorted */
> 	bool		whitelist;	/* if list is a blacklist or whitelist */
>-} report_filterlist = {
>-	.addrs		= NULL,
>-	.size		= 8,		/* small initial size */
>-	.used		= 0,
>-	.sorted		= false,
>-	.whitelist	= false,	/* default is blacklist */
>-};
>-static DEFINE_SPINLOCK(report_filterlist_lock);
>+} report_filterlist;
>+static DEFINE_RAW_SPINLOCK(report_filterlist_lock);
> 
> /*
>  * The microbenchmark allows benchmarking KCSAN core runtime only. To run
>@@ -110,7 +104,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr)
> 		return false;
> 	func_addr -= offset; /* Get function start */
> 
>-	spin_lock_irqsave(&report_filterlist_lock, flags);
>+	raw_spin_lock_irqsave(&report_filterlist_lock, flags);
> 	if (report_filterlist.used == 0)
> 		goto out;
> 
>@@ -127,7 +121,7 @@ bool kcsan_skip_report_debugfs(unsigned long func_addr)
> 		ret = !ret;
> 
> out:
>-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
>+	raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
> 	return ret;
> }
> 
>@@ -135,9 +129,9 @@ static void set_report_filterlist_whitelist(bool whitelist)
> {
> 	unsigned long flags;
> 
>-	spin_lock_irqsave(&report_filterlist_lock, flags);
>+	raw_spin_lock_irqsave(&report_filterlist_lock, flags);
> 	report_filterlist.whitelist = whitelist;
>-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
>+	raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
> }
> 
> /* Returns 0 on success, error-code otherwise. */
>@@ -145,6 +139,9 @@ static ssize_t insert_report_filterlist(const char *func)
> {
> 	unsigned long flags;
> 	unsigned long addr = kallsyms_lookup_name(func);
>+	unsigned long *delay_free = NULL;
>+	unsigned long *new_addrs = NULL;
>+	size_t new_size = 0;
> 	ssize_t ret = 0;
> 
> 	if (!addr) {
>@@ -152,32 +149,33 @@ static ssize_t insert_report_filterlist(const char *func)
> 		return -ENOENT;
> 	}
> 
>-	spin_lock_irqsave(&report_filterlist_lock, flags);
>+retry_alloc:
>+	/*
>+	 * Check if we need an allocation, and re-validate under the lock. Since
>+	 * report_filterlist_lock is a raw spinlock, we cannot allocate under it.
>+	 */
>+	if (data_race(report_filterlist.used == report_filterlist.size)) {
>+		new_size = (report_filterlist.size ?: 4) * 2;
>+		delay_free = new_addrs = kmalloc_array(new_size, sizeof(unsigned long), GFP_KERNEL);
>+		if (!new_addrs)
>+			return -ENOMEM;
>+	}
> 
>-	if (report_filterlist.addrs == NULL) {
>-		/* initial allocation */
>-		report_filterlist.addrs =
>-			kmalloc_array(report_filterlist.size,
>-				      sizeof(unsigned long), GFP_ATOMIC);
>-		if (report_filterlist.addrs == NULL) {
>-			ret = -ENOMEM;
>-			goto out;
>-		}
>-	} else if (report_filterlist.used == report_filterlist.size) {
>-		/* resize filterlist */
>-		size_t new_size = report_filterlist.size * 2;
>-		unsigned long *new_addrs =
>-			krealloc(report_filterlist.addrs,
>-				 new_size * sizeof(unsigned long), GFP_ATOMIC);
>-
>-		if (new_addrs == NULL) {
>-			/* leave filterlist itself untouched */
>-			ret = -ENOMEM;
>-			goto out;
>+	raw_spin_lock_irqsave(&report_filterlist_lock, flags);
>+	if (report_filterlist.used == report_filterlist.size) {
>+		/* Check we pre-allocated enough, and retry if not. */
>+		if (report_filterlist.used >= new_size) {
>+			raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
>+			kfree(new_addrs); /* kfree(NULL) is safe */
>+			delay_free = new_addrs = NULL;
>+			goto retry_alloc;
> 		}
> 
>+		if (report_filterlist.used)
>+			memcpy(new_addrs, report_filterlist.addrs, report_filterlist.used * sizeof(unsigned long));
>+		delay_free = report_filterlist.addrs; /* free the old list */
>+		report_filterlist.addrs = new_addrs;  /* switch to the new list */
> 		report_filterlist.size = new_size;
>-		report_filterlist.addrs = new_addrs;
> 	}
> 
> 	/* Note: deduplicating should be done in userspace. */
>@@ -185,8 +183,10 @@ static ssize_t insert_report_filterlist(const char *func)
> 		kallsyms_lookup_name(func);
> 	report_filterlist.sorted = false;
> 
>-out:
>-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
>+	raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
>+
>+	if (delay_free)
>+		kfree(delay_free);
> 
> 	return ret;
> }
>@@ -204,13 +204,13 @@ static int show_info(struct seq_file *file, void *v)
> 	}
> 
> 	/* show filter functions, and filter type */
>-	spin_lock_irqsave(&report_filterlist_lock, flags);
>+	raw_spin_lock_irqsave(&report_filterlist_lock, flags);
> 	seq_printf(file, "\n%s functions: %s\n",
> 		   report_filterlist.whitelist ? "whitelisted" : "blacklisted",
> 		   report_filterlist.used == 0 ? "none" : "");
> 	for (i = 0; i < report_filterlist.used; ++i)
> 		seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]);
>-	spin_unlock_irqrestore(&report_filterlist_lock, flags);
>+	raw_spin_unlock_irqrestore(&report_filterlist_lock, flags);
> 
> 	return 0;
> }
>-- 
>2.46.1.824.gd892dcdcdd-goog
>