scf_handler() is used as an SMP function call. This function is always
invoked in IRQ context, even with forced interrupt threading (threadirqs)
enabled. The function frees memory, which is not allowed on PREEMPT_RT
because the locking underneath kfree() uses sleeping locks.
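For illustration, the deferral is safe because llist_add() is a lock-free
cmpxchg push that may be used from hard-IRQ context even on PREEMPT_RT,
while the actual kfree() happens later in task context. A minimal sketch
of the producer side (defer_pool and defer_kfree are made-up names, not
from the patch):

	static DEFINE_PER_CPU(struct llist_head, defer_pool);

	/* Lock-free push: never sleeps, so it is safe from the IPI handler. */
	static void defer_kfree(struct llist_node *node)
	{
		llist_add(node, this_cpu_ptr(&defer_pool));
	}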
Add a per-CPU scf_free_pool to which each SMP function call adds the
memory to be freed. This memory is then freed by scftorture_invoker() on
each iteration. On the majority of invocations the number of items is
less than five. If the thread slept or got delayed, the number exceeded
350 but did not reach 400 in testing; these were the observed spikes.
The bulk free of 64 pointers at once should improve the give-back if the
list grows. On average the list holds ~1.3 items per invocation.
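The diff below still frees the entries one at a time; the bulk free of 64
pointers mentioned above would presumably be a batched drain along these
lines (a sketch only, needing <linux/llist.h> and <linux/slab.h>:
kfree_bulk() is the existing slab helper, and the batch size of 64 is
taken from the text, not from the quoted diff):

	static void scf_cleanup_free_list(unsigned int cpu)
	{
		struct llist_node *node;
		struct scf_check *scfcp;
		void *batch[64];
		int n = 0;

		node = llist_del_all(&per_cpu(scf_free_pool, cpu));
		while (node) {
			scfcp = llist_entry(node, struct scf_check, scf_node);
			node = node->next;
			batch[n++] = scfcp;
			if (n == ARRAY_SIZE(batch)) {	/* flush a full batch of 64 */
				kfree_bulk(n, batch);
				n = 0;
			}
		}
		if (n)					/* flush the remainder */
			kfree_bulk(n, batch);
	}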
Having one global scf_free_pool with one cleaning thread let the list
grow to over 10,000 items with 32 CPUs (again, spikes, not the average),
especially if a CPU went to sleep. The per-CPU variant looks like a good
compromise.
Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
Closes: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/scftorture.c | 39 +++++++++++++++++++++++++++++++++++----
1 file changed, 35 insertions(+), 4 deletions(-)
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 555b3b10621fe..1268a91af5d88 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -97,6 +97,7 @@ struct scf_statistics {
static struct scf_statistics *scf_stats_p;
static struct task_struct *scf_torture_stats_task;
static DEFINE_PER_CPU(long long, scf_invoked_count);
+static DEFINE_PER_CPU(struct llist_head, scf_free_pool);
// Data for random primitive selection
#define SCF_PRIM_RESCHED 0
@@ -133,6 +134,7 @@ struct scf_check {
bool scfc_wait;
bool scfc_rpc;
struct completion scfc_completion;
+ struct llist_node scf_node;
};
// Use to wait for all threads to start.
@@ -148,6 +150,31 @@ static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand);
extern void resched_cpu(int cpu); // An alternative IPI vector.
+static void scf_add_to_free_list(struct scf_check *scfcp)
+{
+ struct llist_head *pool;
+ unsigned int cpu;
+
+ cpu = raw_smp_processor_id() % nthreads;
+ pool = &per_cpu(scf_free_pool, cpu);
+ llist_add(&scfcp->scf_node, pool);
+}
+
+static void scf_cleanup_free_list(unsigned int cpu)
+{
+ struct llist_head *pool;
+ struct llist_node *node;
+ struct scf_check *scfcp;
+
+ pool = &per_cpu(scf_free_pool, cpu);
+ node = llist_del_all(pool);
+ while (node) {
+ scfcp = llist_entry(node, struct scf_check, scf_node);
+ node = node->next;
+ kfree(scfcp);
+ }
+}
+
// Print torture statistics. Caller must ensure serialization.
static void scf_torture_stats_print(void)
{
@@ -296,7 +323,7 @@ static void scf_handler(void *scfc_in)
if (scfcp->scfc_rpc)
complete(&scfcp->scfc_completion);
} else {
- kfree(scfcp);
+ scf_add_to_free_list(scfcp);
}
}
@@ -363,7 +390,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfp->n_single_wait_ofl++;
else
scfp->n_single_ofl++;
- kfree(scfcp);
+ scf_add_to_free_list(scfcp);
scfcp = NULL;
}
break;
@@ -391,7 +418,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
preempt_disable();
} else {
scfp->n_single_rpc_ofl++;
- kfree(scfcp);
+ scf_add_to_free_list(scfcp);
scfcp = NULL;
}
break;
@@ -428,7 +455,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
atomic_inc(&n_mb_out_errs); // Leak rather than trash!
} else {
- kfree(scfcp);
+ scf_add_to_free_list(scfcp);
}
barrier(); // Prevent race-reduction compiler optimizations.
}
@@ -479,6 +506,8 @@ static int scftorture_invoker(void *arg)
VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu);
do {
+ scf_cleanup_free_list(cpu);
+
scftorture_invoke_one(scfp, &rand);
while (cpu_is_offline(cpu) && !torture_must_stop()) {
schedule_timeout_interruptible(HZ / 5);
@@ -538,6 +567,8 @@ static void scf_torture_cleanup(void)
end:
torture_cleanup_end();
+ for (i = 0; i < nthreads; i++)
+ scf_cleanup_free_list(i);
}
static int __init scf_torture_init(void)
--
2.45.2
On Thu, Nov 07, 2024 at 12:13:08PM +0100, Sebastian Andrzej Siewior wrote:
> scf_handler() is used as an SMP function call. This function is always
> invoked in IRQ context, even with forced interrupt threading enabled.
> [ ... ]
> @@ -538,6 +567,8 @@ static void scf_torture_cleanup(void)
>
>  end:
>  	torture_cleanup_end();
> +	for (i = 0; i < nthreads; i++)
> +		scf_cleanup_free_list(i);

This needs to be:

	for (i = 0; i < nr_cpu_ids; i++)

because nthreads can be larger than nr_cpu_ids, and it'll access an
out-of-bounds percpu section.

Regards,
Boqun
On 2024-11-07 12:45:25 [-0800], Boqun Feng wrote:
> This needs to be:
>
> 	for (i = 0; i < nr_cpu_ids; i++)
>
> because nthreads can be larger than nr_cpu_ids, and it'll access an
> out-of-bounds percpu section.

And I thought I learned my lesson last time. Thank you.

Sebastian
On Thu, Nov 07, 2024 at 12:45:25PM -0800, Boqun Feng wrote:
> On Thu, Nov 07, 2024 at 12:13:08PM +0100, Sebastian Andrzej Siewior wrote:
> [ ... ]
> > +	for (i = 0; i < nthreads; i++)
>
> This needs to be:
>
> 	for (i = 0; i < nr_cpu_ids; i++)
>
> because nthreads can be larger than nr_cpu_ids, and it'll access an
> out-of-bounds percpu section.

I clearly did not test thoroughly enough. Good catch!!!

							Thanx, Paul
On Thu, Nov 07, 2024 at 12:13:08PM +0100, Sebastian Andrzej Siewior wrote:
> scf_handler() is used as an SMP function call. This function is always
> invoked in IRQ context, even with forced interrupt threading enabled.
> [ ... ]
> Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
> Closes: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Nice!!! One nit at the end below.

> @@ -538,6 +567,8 @@ static void scf_torture_cleanup(void)
>
>  end:
>  	torture_cleanup_end();
> +	for (i = 0; i < nthreads; i++)
> +		scf_cleanup_free_list(i);

It would be better for this to precede the call to torture_cleanup_end().
As soon as torture_cleanup_end() is invoked, in theory, another torture
test might start. Yes, in practice this would only matter if the next
module was again scftorture, and you aren't supposed to modprobe a given
module until after the prior rmmod has completed, which would prevent
this scf_cleanup_free_list() from interacting with the incoming instance
of scftorture. But why even allow the possibility?

							Thanx, Paul
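Folding both review comments together, the tail of scf_torture_cleanup()
would presumably end up looking like this (a sketch of the combined fix,
not a posted patch):

	end:
		for (i = 0; i < nr_cpu_ids; i++)	/* Boqun: bound by CPU ids, not nthreads */
			scf_cleanup_free_list(i);
		torture_cleanup_end();			/* Paul: drain before another test can start */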