Documentation/admin-guide/cgroup-v2.rst | 6 ------ include/linux/sched.h | 4 ---- include/linux/vm_event_item.h | 2 -- kernel/sched/core.c | 9 ++------- kernel/sched/debug.c | 4 ---- mm/memcontrol.c | 2 -- mm/vmstat.c | 2 -- 7 files changed, 2 insertions(+), 27 deletions(-)
This reverts commit ad6b26b6a0a79166b53209df2ca1cf8636296382.
This commit introduces per-memcg/task NUMA balance statistics,
but unfortunately it introduced a NULL pointer exception due
to the following race condition: After a swap task candidate
was chosen, its mm_struct pointer was set to NULL due to task
exit. Later, when performing the actual task swapping, the
p->mm caused the problem.
CPU0 CPU1
:
...
task_numa_migrate
task_numa_find_cpu
task_numa_compare
# a normal task p is chosen
env->best_task = p
# p exit:
exit_signals(p);
p->flags |= PF_EXITING
exit_mm
p->mm = NULL;
migrate_swap_stop
__migrate_swap_task((arg->src_task, arg->dst_cpu)
count_memcg_event_mm(p->mm, NUMA_TASK_SWAP)# p->mm is NULL
task_lock() should be held and the PF_EXITING flag needs to
be checked to prevent this from happening. After discussion,
the conclusion was that adding a lock is not worthwhile for
some statistics calculations. Revert the change and rely on
the tracepoint for this purpose.
Fixes: ad6b26b6a0a7 ("sched/numa: add statistics of numa balance task")
Reported-by: Jirka Hladky <jhladky@redhat.com>
Closes: https://lore.kernel.org/all/CAE4VaGBLJxpd=NeRJXpSCuw=REhC5LWJpC29kDy-Zh2ZDyzQZA@mail.gmail.com/
Reported-by: Srikanth Aithal <Srikanth.Aithal@amd.com>
Reported-by: Suneeth D <Suneeth.D@amd.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
Documentation/admin-guide/cgroup-v2.rst | 6 ------
include/linux/sched.h | 4 ----
include/linux/vm_event_item.h | 2 --
kernel/sched/core.c | 9 ++-------
kernel/sched/debug.c | 4 ----
mm/memcontrol.c | 2 --
mm/vmstat.c | 2 --
7 files changed, 2 insertions(+), 27 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 0cc35a14afbe..bd98ea3175ec 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1732,12 +1732,6 @@ The following nested keys are defined.
numa_hint_faults (npn)
Number of NUMA hinting faults.
- numa_task_migrated (npn)
- Number of task migration by NUMA balancing.
-
- numa_task_swapped (npn)
- Number of task swap by NUMA balancing.
-
pgdemote_kswapd
Number of pages demoted by kswapd.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..aa9c5be7a632 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -548,10 +548,6 @@ struct sched_statistics {
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
-#ifdef CONFIG_NUMA_BALANCING
- u64 numa_task_migrated;
- u64 numa_task_swapped;
-#endif
u64 nr_wakeups;
u64 nr_wakeups_sync;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 91a3ce9a2687..9e15a088ba38 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -66,8 +66,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NUMA_HINT_FAULTS,
NUMA_HINT_FAULTS_LOCAL,
NUMA_PAGE_MIGRATE,
- NUMA_TASK_MIGRATE,
- NUMA_TASK_SWAP,
#endif
#ifdef CONFIG_MIGRATION
PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8988d38d46a3..ca0be74e865b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3362,10 +3362,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- __schedstat_inc(p->stats.numa_task_swapped);
- count_vm_numa_event(NUMA_TASK_SWAP);
- count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
-
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
@@ -7934,9 +7930,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
- __schedstat_inc(p->stats.numa_task_migrated);
- count_vm_numa_event(NUMA_TASK_MIGRATE);
- count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE);
+ /* TODO: This is not properly updating schedstats */
+
trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9d71baf08075..557246880a7e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1210,10 +1210,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P_SCHEDSTAT(nr_failed_migrations_running);
P_SCHEDSTAT(nr_failed_migrations_hot);
P_SCHEDSTAT(nr_forced_migrations);
-#ifdef CONFIG_NUMA_BALANCING
- P_SCHEDSTAT(numa_task_migrated);
- P_SCHEDSTAT(numa_task_swapped);
-#endif
P_SCHEDSTAT(nr_wakeups);
P_SCHEDSTAT(nr_wakeups_sync);
P_SCHEDSTAT(nr_wakeups_migrate);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 902da8a9c643..70fdeda1120b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -474,8 +474,6 @@ static const unsigned int memcg_vm_event_stat[] = {
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
- NUMA_TASK_MIGRATE,
- NUMA_TASK_SWAP,
#endif
};
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 429ae5339bfe..a78d70ddeacd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1346,8 +1346,6 @@ const char * const vmstat_text[] = {
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
- "numa_task_migrated",
- "numa_task_swapped",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
--
2.25.1
On Fri 04-07-25 21:56:20, Chen Yu wrote: > This reverts commit ad6b26b6a0a79166b53209df2ca1cf8636296382. > > This commit introduces per-memcg/task NUMA balance statistics, > but unfortunately it introduced a NULL pointer exception due > to the following race condition: After a swap task candidate > was chosen, its mm_struct pointer was set to NULL due to task > exit. Later, when performing the actual task swapping, the > p->mm caused the problem. > > CPU0 CPU1 > : > ... > task_numa_migrate > task_numa_find_cpu > task_numa_compare > # a normal task p is chosen > env->best_task = p > > # p exit: > exit_signals(p); > p->flags |= PF_EXITING > exit_mm > p->mm = NULL; > > migrate_swap_stop > __migrate_swap_task((arg->src_task, arg->dst_cpu) > count_memcg_event_mm(p->mm, NUMA_TASK_SWAP)# p->mm is NULL > > task_lock() should be held and the PF_EXITING flag needs to > be checked to prevent this from happening. After discussion, > the conclusion was that adding a lock is not worthwhile for > some statistics calculations. Revert the change and rely on > the tracepoint for this purpose. > > Fixes: ad6b26b6a0a7 ("sched/numa: add statistics of numa balance task") > Reported-by: Jirka Hladky <jhladky@redhat.com> > Closes: https://lore.kernel.org/all/CAE4VaGBLJxpd=NeRJXpSCuw=REhC5LWJpC29kDy-Zh2ZDyzQZA@mail.gmail.com/ > Reported-by: Srikanth Aithal <Srikanth.Aithal@amd.com> > Reported-by: Suneeth D <Suneeth.D@amd.com> > Signed-off-by: Chen Yu <yu.c.chen@intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Thanks! > --- > Documentation/admin-guide/cgroup-v2.rst | 6 ------ > include/linux/sched.h | 4 ---- > include/linux/vm_event_item.h | 2 -- > kernel/sched/core.c | 9 ++------- > kernel/sched/debug.c | 4 ---- > mm/memcontrol.c | 2 -- > mm/vmstat.c | 2 -- > 7 files changed, 2 insertions(+), 27 deletions(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > index 0cc35a14afbe..bd98ea3175ec 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -1732,12 +1732,6 @@ The following nested keys are defined. > numa_hint_faults (npn) > Number of NUMA hinting faults. > > - numa_task_migrated (npn) > - Number of task migration by NUMA balancing. > - > - numa_task_swapped (npn) > - Number of task swap by NUMA balancing. > - > pgdemote_kswapd > Number of pages demoted by kswapd. > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 4f78a64beb52..aa9c5be7a632 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -548,10 +548,6 @@ struct sched_statistics { > u64 nr_failed_migrations_running; > u64 nr_failed_migrations_hot; > u64 nr_forced_migrations; > -#ifdef CONFIG_NUMA_BALANCING > - u64 numa_task_migrated; > - u64 numa_task_swapped; > -#endif > > u64 nr_wakeups; > u64 nr_wakeups_sync; > diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h > index 91a3ce9a2687..9e15a088ba38 100644 > --- a/include/linux/vm_event_item.h > +++ b/include/linux/vm_event_item.h > @@ -66,8 +66,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, > NUMA_HINT_FAULTS, > NUMA_HINT_FAULTS_LOCAL, > NUMA_PAGE_MIGRATE, > - NUMA_TASK_MIGRATE, > - NUMA_TASK_SWAP, > #endif > #ifdef CONFIG_MIGRATION > PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 8988d38d46a3..ca0be74e865b 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -3362,10 +3362,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) > #ifdef CONFIG_NUMA_BALANCING > static void __migrate_swap_task(struct task_struct *p, int cpu) > { > - __schedstat_inc(p->stats.numa_task_swapped); > - count_vm_numa_event(NUMA_TASK_SWAP); > - count_memcg_event_mm(p->mm, NUMA_TASK_SWAP); > - > if (task_on_rq_queued(p)) { > struct rq *src_rq, *dst_rq; > struct rq_flags srf, drf; > @@ -7934,9 +7930,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu) > if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) > return -EINVAL; > > - __schedstat_inc(p->stats.numa_task_migrated); > - count_vm_numa_event(NUMA_TASK_MIGRATE); > - count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE); > + /* TODO: This is not properly updating schedstats */ > + > trace_sched_move_numa(p, curr_cpu, target_cpu); > return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); > } > diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c > index 9d71baf08075..557246880a7e 100644 > --- a/kernel/sched/debug.c > +++ b/kernel/sched/debug.c > @@ -1210,10 +1210,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, > P_SCHEDSTAT(nr_failed_migrations_running); > P_SCHEDSTAT(nr_failed_migrations_hot); > P_SCHEDSTAT(nr_forced_migrations); > -#ifdef CONFIG_NUMA_BALANCING > - P_SCHEDSTAT(numa_task_migrated); > - P_SCHEDSTAT(numa_task_swapped); > -#endif > P_SCHEDSTAT(nr_wakeups); > P_SCHEDSTAT(nr_wakeups_sync); > P_SCHEDSTAT(nr_wakeups_migrate); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 902da8a9c643..70fdeda1120b 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -474,8 +474,6 @@ static const unsigned int memcg_vm_event_stat[] = { > NUMA_PAGE_MIGRATE, > NUMA_PTE_UPDATES, > NUMA_HINT_FAULTS, > - NUMA_TASK_MIGRATE, > - NUMA_TASK_SWAP, > #endif > }; > > diff --git a/mm/vmstat.c b/mm/vmstat.c > index 429ae5339bfe..a78d70ddeacd 100644 > --- a/mm/vmstat.c > +++ b/mm/vmstat.c > @@ -1346,8 +1346,6 @@ const char * const vmstat_text[] = { > "numa_hint_faults", > "numa_hint_faults_local", > "numa_pages_migrated", > - "numa_task_migrated", > - "numa_task_swapped", > #endif > #ifdef CONFIG_MIGRATION > "pgmigrate_success", > -- > 2.25.1 > -- Michal Hocko SUSE Labs
© 2016 - 2025 Red Hat, Inc.