scftorture: Use workqueue to free scf_check

[PATCH] scftorture: Use workqueue to free scf_check

Posted by Boqun Feng 1 year, 3 months ago

Paul reported an invalid wait context issue in scftorture caught by
lockdep, and the cause of the issue is that scf_handler() may call
kfree() to free the struct scf_check:

	static void scf_handler(void *scfc_in)
        {
        [...]
                } else {
                        kfree(scfcp);
                }
        }

(call chain analysis from Marco Elver)

This is problematic because smp_call_function() uses a non-threaded
interrupt, and kfree() may acquire a local_lock, which is a sleepable lock
on RT.

The general rule is: do not alloc or free memory in non-threaded
interrupt contexts.

A quick fix is to use a workqueue to defer the kfree(). However, this is
OK only because scftorture is test code. In general, users of
interrupts should avoid giving interrupt handlers ownership of
objects; that is, users should handle the lifetime of objects outside
and interrupt handlers should only hold references to objects.

Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
Link: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
---
 kernel/scftorture.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 44e83a646264..ab6dcc7c0116 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -127,6 +127,7 @@ static unsigned long scf_sel_totweight;
 
 // Communicate between caller and handler.
 struct scf_check {
+	struct work_struct work;
 	bool scfc_in;
 	bool scfc_out;
 	int scfc_cpu; // -1 for not _single().
@@ -252,6 +253,13 @@ static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp)
 	return &scf_sel_array[0];
 }
 
+static void kfree_scf_check_work(struct work_struct *w)
+{
+	struct scf_check *scfcp = container_of(w, struct scf_check, work);
+
+	kfree(scfcp);
+}
+
 // Update statistics and occasionally burn up mass quantities of CPU time,
 // if told to do so via scftorture.longwait.  Otherwise, occasionally burn
 // a little bit.
@@ -296,7 +304,10 @@ static void scf_handler(void *scfc_in)
 		if (scfcp->scfc_rpc)
 			complete(&scfcp->scfc_completion);
 	} else {
-		kfree(scfcp);
+		// Cannot call kfree() directly, pass it to workqueue. It's OK
+		// only because this is test code, avoid this in real world
+		// usage.
+		queue_work(system_wq, &scfcp->work);
 	}
 }
 
@@ -335,6 +346,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 			scfcp->scfc_wait = scfsp->scfs_wait;
 			scfcp->scfc_out = false;
 			scfcp->scfc_rpc = false;
+			INIT_WORK(&scfcp->work, kfree_scf_check_work);
 		}
 	}
 	switch (scfsp->scfs_prim) {
-- 
2.45.2

Re: [PATCH] scftorture: Use workqueue to free scf_check

Posted by Paul E. McKenney 1 year, 3 months ago

On Fri, Nov 01, 2024 at 12:54:38PM -0700, Boqun Feng wrote:
> Paul reported an invalid wait context issue in scftorture caught by
> lockdep, and the cause of the issue is because scf_handler() may call
> kfree() to free the struct scf_check:
> 
> 	static void scf_handler(void *scfc_in)
>         {
>         [...]
>                 } else {
>                         kfree(scfcp);
>                 }
>         }
> 
> (call chain analysis from Marco Elver)
> 
> This is problematic because smp_call_function() uses non-threaded
> interrupt and kfree() may acquire a local_lock which is a sleepable lock
> on RT.
> 
> The general rule is: do not alloc or free memory in non-threaded
> interrupt contexts.
> 
> A quick fix is to use workqueue to defer the kfree(). However, this is
> OK only because scftorture is test code. In general the users of
> interrupts should avoid giving interrupt handlers the ownership of
> objects, that is, users should handle the lifetime of objects outside
> and interrupt handlers should only hold references to objects.
> 
> Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
> Link: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
> Signed-off-by: Boqun Feng <boqun.feng@gmail.com>

Thank you!

I was worried that putting each kfree() into a separate workqueue handler
would result in freeing not keeping up with allocation for asynchronous
testing (for example, scftorture.weight_single=1), but it seems to be
doing fine in early testing.

So I have queued this in my -rcu tree for review and further testing.

							Thanx, Paul

> ---
>  kernel/scftorture.c | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/scftorture.c b/kernel/scftorture.c
> index 44e83a646264..ab6dcc7c0116 100644
> --- a/kernel/scftorture.c
> +++ b/kernel/scftorture.c
> @@ -127,6 +127,7 @@ static unsigned long scf_sel_totweight;
>  
>  // Communicate between caller and handler.
>  struct scf_check {
> +	struct work_struct work;
>  	bool scfc_in;
>  	bool scfc_out;
>  	int scfc_cpu; // -1 for not _single().
> @@ -252,6 +253,13 @@ static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp)
>  	return &scf_sel_array[0];
>  }
>  
> +static void kfree_scf_check_work(struct work_struct *w)
> +{
> +	struct scf_check *scfcp = container_of(w, struct scf_check, work);
> +
> +	kfree(scfcp);
> +}
> +
>  // Update statistics and occasionally burn up mass quantities of CPU time,
>  // if told to do so via scftorture.longwait.  Otherwise, occasionally burn
>  // a little bit.
> @@ -296,7 +304,10 @@ static void scf_handler(void *scfc_in)
>  		if (scfcp->scfc_rpc)
>  			complete(&scfcp->scfc_completion);
>  	} else {
> -		kfree(scfcp);
> +		// Cannot call kfree() directly, pass it to workqueue. It's OK
> +		// only because this is test code, avoid this in real world
> +		// usage.
> +		queue_work(system_wq, &scfcp->work);
>  	}
>  }
>  
> @@ -335,6 +346,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
>  			scfcp->scfc_wait = scfsp->scfs_wait;
>  			scfcp->scfc_out = false;
>  			scfcp->scfc_rpc = false;
> +			INIT_WORK(&scfcp->work, kfree_scf_check_work);
>  		}
>  	}
>  	switch (scfsp->scfs_prim) {
> -- 
> 2.45.2
>

Re: [PATCH] scftorture: Use workqueue to free scf_check

Posted by Boqun Feng 1 year, 3 months ago

On Fri, Nov 01, 2024 at 04:35:28PM -0700, Paul E. McKenney wrote:
> On Fri, Nov 01, 2024 at 12:54:38PM -0700, Boqun Feng wrote:
> > Paul reported an invalid wait context issue in scftorture caught by
> > lockdep, and the cause of the issue is because scf_handler() may call
> > kfree() to free the struct scf_check:
> > 
> > 	static void scf_handler(void *scfc_in)
> >         {
> >         [...]
> >                 } else {
> >                         kfree(scfcp);
> >                 }
> >         }
> > 
> > (call chain analysis from Marco Elver)
> > 
> > This is problematic because smp_call_function() uses non-threaded
> > interrupt and kfree() may acquire a local_lock which is a sleepable lock
> > on RT.
> > 
> > The general rule is: do not alloc or free memory in non-threaded
> > interrupt contexts.
> > 
> > A quick fix is to use workqueue to defer the kfree(). However, this is
> > OK only because scftorture is test code. In general the users of
> > interrupts should avoid giving interrupt handlers the ownership of
> > objects, that is, users should handle the lifetime of objects outside
> > and interrupt handlers should only hold references to objects.
> > 
> > Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
> > Link: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
> > Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> 
> Thank you!
> 
> I was worried that putting each kfree() into a separate workqueue handler
> would result in freeing not keeping up with allocation for asynchronous
> testing (for example, scftorture.weight_single=1), but it seems to be
> doing fine in early testing.
> 

I shared the same worry, which is why I added the comment before
queue_work() saying it's only OK because this is test code; it's certainly
not something recommended for general use.

But glad it turns out OK so far for scftorture ;-)

Regards,
Boqun

> So I have queued this in my -rcu tree for review and further testing.
> 
> 							Thanx, Paul
> 
> > ---
> >  kernel/scftorture.c | 14 +++++++++++++-
> >  1 file changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/kernel/scftorture.c b/kernel/scftorture.c
> > index 44e83a646264..ab6dcc7c0116 100644
> > --- a/kernel/scftorture.c
> > +++ b/kernel/scftorture.c
> > @@ -127,6 +127,7 @@ static unsigned long scf_sel_totweight;
> >  
> >  // Communicate between caller and handler.
> >  struct scf_check {
> > +	struct work_struct work;
> >  	bool scfc_in;
> >  	bool scfc_out;
> >  	int scfc_cpu; // -1 for not _single().
> > @@ -252,6 +253,13 @@ static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp)
> >  	return &scf_sel_array[0];
> >  }
> >  
> > +static void kfree_scf_check_work(struct work_struct *w)
> > +{
> > +	struct scf_check *scfcp = container_of(w, struct scf_check, work);
> > +
> > +	kfree(scfcp);
> > +}
> > +
> >  // Update statistics and occasionally burn up mass quantities of CPU time,
> >  // if told to do so via scftorture.longwait.  Otherwise, occasionally burn
> >  // a little bit.
> > @@ -296,7 +304,10 @@ static void scf_handler(void *scfc_in)
> >  		if (scfcp->scfc_rpc)
> >  			complete(&scfcp->scfc_completion);
> >  	} else {
> > -		kfree(scfcp);
> > +		// Cannot call kfree() directly, pass it to workqueue. It's OK
> > +		// only because this is test code, avoid this in real world
> > +		// usage.
> > +		queue_work(system_wq, &scfcp->work);
> >  	}
> >  }
> >  
> > @@ -335,6 +346,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
> >  			scfcp->scfc_wait = scfsp->scfs_wait;
> >  			scfcp->scfc_out = false;
> >  			scfcp->scfc_rpc = false;
> > +			INIT_WORK(&scfcp->work, kfree_scf_check_work);
> >  		}
> >  	}
> >  	switch (scfsp->scfs_prim) {
> > -- 
> > 2.45.2
> >

Re: [PATCH] scftorture: Use workqueue to free scf_check

Posted by Paul E. McKenney 1 year, 3 months ago

On Sat, Nov 02, 2024 at 08:35:36PM -0700, Boqun Feng wrote:
> On Fri, Nov 01, 2024 at 04:35:28PM -0700, Paul E. McKenney wrote:
> > On Fri, Nov 01, 2024 at 12:54:38PM -0700, Boqun Feng wrote:
> > > Paul reported an invalid wait context issue in scftorture caught by
> > > lockdep, and the cause of the issue is because scf_handler() may call
> > > kfree() to free the struct scf_check:
> > > 
> > > 	static void scf_handler(void *scfc_in)
> > >         {
> > >         [...]
> > >                 } else {
> > >                         kfree(scfcp);
> > >                 }
> > >         }
> > > 
> > > (call chain analysis from Marco Elver)
> > > 
> > > This is problematic because smp_call_function() uses non-threaded
> > > interrupt and kfree() may acquire a local_lock which is a sleepable lock
> > > on RT.
> > > 
> > > The general rule is: do not alloc or free memory in non-threaded
> > > interrupt contexts.
> > > 
> > > A quick fix is to use workqueue to defer the kfree(). However, this is
> > > OK only because scftorture is test code. In general the users of
> > > interrupts should avoid giving interrupt handlers the ownership of
> > > objects, that is, users should handle the lifetime of objects outside
> > > and interrupt handlers should only hold references to objects.
> > > 
> > > Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
> > > Link: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
> > > Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
> > 
> > Thank you!
> > 
> > I was worried that putting each kfree() into a separate workqueue handler
> > would result in freeing not keeping up with allocation for asynchronous
> > testing (for example, scftorture.weight_single=1), but it seems to be
> > doing fine in early testing.
> 
> I shared the same worry, so it's why I added the comments before
> queue_work() saying it's only OK because it's test code, it's certainly
> not something recommended for general use.
> 
> But glad it turns out OK so far for scftorture ;-)

That said, I have only tried a couple of memory sizes at 64 CPUs: the
default (512M), which OOMs both with and without this fix, and 7G, which
is selected by torture.sh and avoids OOMing either way.  It would be
interesting to vary the memory provided between those limits and see if
there is any difference in behavior.

It avoids OOM at the default 512M at 16 CPUs.

Ah, and I did not check throughput, which might have changed.  A quick
test on my laptop says that it dropped by almost a factor of two,
from not quite 1M invocations/s to a bit more than 500K invocations/s.
So something more efficient does seem in order.  ;-)

tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --configs PREEMPT --duration 30 --bootargs "scftorture.weight_single=1" --trust-make

							Thanx, Paul

> Regards,
> Boqun
> 
> > So I have queued this in my -rcu tree for review and further testing.
> > 
> > 							Thanx, Paul
> > 
> > > ---
> > >  kernel/scftorture.c | 14 +++++++++++++-
> > >  1 file changed, 13 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/kernel/scftorture.c b/kernel/scftorture.c
> > > index 44e83a646264..ab6dcc7c0116 100644
> > > --- a/kernel/scftorture.c
> > > +++ b/kernel/scftorture.c
> > > @@ -127,6 +127,7 @@ static unsigned long scf_sel_totweight;
> > >  
> > >  // Communicate between caller and handler.
> > >  struct scf_check {
> > > +	struct work_struct work;
> > >  	bool scfc_in;
> > >  	bool scfc_out;
> > >  	int scfc_cpu; // -1 for not _single().
> > > @@ -252,6 +253,13 @@ static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp)
> > >  	return &scf_sel_array[0];
> > >  }
> > >  
> > > +static void kfree_scf_check_work(struct work_struct *w)
> > > +{
> > > +	struct scf_check *scfcp = container_of(w, struct scf_check, work);
> > > +
> > > +	kfree(scfcp);
> > > +}
> > > +
> > >  // Update statistics and occasionally burn up mass quantities of CPU time,
> > >  // if told to do so via scftorture.longwait.  Otherwise, occasionally burn
> > >  // a little bit.
> > > @@ -296,7 +304,10 @@ static void scf_handler(void *scfc_in)
> > >  		if (scfcp->scfc_rpc)
> > >  			complete(&scfcp->scfc_completion);
> > >  	} else {
> > > -		kfree(scfcp);
> > > +		// Cannot call kfree() directly, pass it to workqueue. It's OK
> > > +		// only because this is test code, avoid this in real world
> > > +		// usage.
> > > +		queue_work(system_wq, &scfcp->work);
> > >  	}
> > >  }
> > >  
> > > @@ -335,6 +346,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
> > >  			scfcp->scfc_wait = scfsp->scfs_wait;
> > >  			scfcp->scfc_out = false;
> > >  			scfcp->scfc_rpc = false;
> > > +			INIT_WORK(&scfcp->work, kfree_scf_check_work);
> > >  		}
> > >  	}
> > >  	switch (scfsp->scfs_prim) {
> > > -- 
> > > 2.45.2
> > >

[PATCH 1/2] scftorture: Move memory allocation outside of preempt_disable region.

Posted by Sebastian Andrzej Siewior 1 year, 3 months ago

Memory allocations cannot happen within regions with explicitly disabled
preemption on PREEMPT_RT. The problem is that the locking structures
underneath are sleeping locks.

Move the memory allocation outside of the preempt-disabled section. Keep
the GFP_ATOMIC for the allocation to behave like an "emergency
allocation".

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/scftorture.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 44e83a6462647..e5546fe256329 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -320,10 +320,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 	struct scf_check *scfcp = NULL;
 	struct scf_selector *scfsp = scf_sel_rand(trsp);
 
-	if (use_cpus_read_lock)
-		cpus_read_lock();
-	else
-		preempt_disable();
 	if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) {
 		scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC);
 		if (!scfcp) {
@@ -337,6 +333,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 			scfcp->scfc_rpc = false;
 		}
 	}
+	if (use_cpus_read_lock)
+		cpus_read_lock();
+	else
+		preempt_disable();
 	switch (scfsp->scfs_prim) {
 	case SCF_PRIM_RESCHED:
 		if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) {
-- 
2.45.2

[PATCH 2/2] scftorture: Use a lock-less list to free memory.

Posted by Sebastian Andrzej Siewior 1 year, 3 months ago

scf_handler() is used as an SMP function call. This function is always
invoked in IRQ context, even with forced threading enabled. This function
frees memory, which is not allowed on PREEMPT_RT because the locking
underneath is using sleeping locks.

Add a per-CPU scf_free_pool where each SMP function adds its memory to
be freed. This memory is then freed by scftorture_invoker() on each
iteration. On the majority of invocations the number of items is less
than five. If the thread sleeps or gets delayed, the number exceeded 350 but
did not reach 400 in testing. These were the spikes during testing.
The bulk free of 64 pointers at once should improve the give-back if the
list grows. The list size is ~1.3 items per invocation.

Having one global scf_free_pool with one cleaning thread let the list
grow to over 10,000 items with 32 CPUs (again, spikes, not the average),
especially if the CPU went to sleep. The per-CPU part looks like a good
compromise.

Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
Closes: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/scftorture.c | 47 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index e5546fe256329..ba9f1125821b8 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -97,6 +97,7 @@ struct scf_statistics {
 static struct scf_statistics *scf_stats_p;
 static struct task_struct *scf_torture_stats_task;
 static DEFINE_PER_CPU(long long, scf_invoked_count);
+static DEFINE_PER_CPU(struct llist_head, scf_free_pool);
 
 // Data for random primitive selection
 #define SCF_PRIM_RESCHED	0
@@ -133,6 +134,7 @@ struct scf_check {
 	bool scfc_wait;
 	bool scfc_rpc;
 	struct completion scfc_completion;
+	struct llist_node scf_node;
 };
 
 // Use to wait for all threads to start.
@@ -148,6 +150,40 @@ static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand);
 
 extern void resched_cpu(int cpu); // An alternative IPI vector.
 
+static void scf_add_to_free_list(struct scf_check *scfcp)
+{
+	struct llist_head *pool;
+	unsigned int cpu;
+
+	cpu = raw_smp_processor_id() % nthreads;
+	pool = &per_cpu(scf_free_pool, cpu);
+	llist_add(&scfcp->scf_node, pool);
+}
+
+static void scf_cleanup_free_list(unsigned int cpu)
+{
+	struct llist_head *pool;
+	struct llist_node *node;
+	struct scf_check *scfcp;
+	unsigned int slot = 0;
+	void *free_pool[64];
+
+	pool = &per_cpu(scf_free_pool, cpu);
+	node = llist_del_all(pool);
+	while (node) {
+		scfcp = llist_entry(node, struct scf_check, scf_node);
+		node = node->next;
+		free_pool[slot] = scfcp;
+		slot++;
+		if (slot == ARRAY_SIZE(free_pool)) {
+			kfree_bulk(slot, free_pool);
+			slot = 0;
+		}
+	}
+	if (slot)
+		kfree_bulk(slot, free_pool);
+}
+
 // Print torture statistics.  Caller must ensure serialization.
 static void scf_torture_stats_print(void)
 {
@@ -296,7 +332,7 @@ static void scf_handler(void *scfc_in)
 		if (scfcp->scfc_rpc)
 			complete(&scfcp->scfc_completion);
 	} else {
-		kfree(scfcp);
+		scf_add_to_free_list(scfcp);
 	}
 }
 
@@ -363,7 +399,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 				scfp->n_single_wait_ofl++;
 			else
 				scfp->n_single_ofl++;
-			kfree(scfcp);
+			scf_add_to_free_list(scfcp);
 			scfcp = NULL;
 		}
 		break;
@@ -391,7 +427,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 				preempt_disable();
 		} else {
 			scfp->n_single_rpc_ofl++;
-			kfree(scfcp);
+			scf_add_to_free_list(scfcp);
 			scfcp = NULL;
 		}
 		break;
@@ -428,7 +464,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 			pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
 			atomic_inc(&n_mb_out_errs); // Leak rather than trash!
 		} else {
-			kfree(scfcp);
+			scf_add_to_free_list(scfcp);
 		}
 		barrier(); // Prevent race-reduction compiler optimizations.
 	}
@@ -442,6 +478,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 		schedule_timeout_uninterruptible(1);
 }
 
+
 // SCF test kthread.  Repeatedly does calls to members of the
 // smp_call_function() family of functions.
 static int scftorture_invoker(void *arg)
@@ -479,6 +516,8 @@ static int scftorture_invoker(void *arg)
 	VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu);
 
 	do {
+		scf_cleanup_free_list(scfp->cpu);
+
 		scftorture_invoke_one(scfp, &rand);
 		while (cpu_is_offline(cpu) && !torture_must_stop()) {
 			schedule_timeout_interruptible(HZ / 5);
-- 
2.45.2