Documentation/admin-guide/cgroup-v2.rst | 6 ++++++ include/linux/sched.h | 4 ++++ include/linux/vm_event_item.h | 2 ++ kernel/sched/core.c | 10 ++++++++-- kernel/sched/debug.c | 4 ++++ mm/memcontrol.c | 2 ++ mm/vmstat.c | 2 ++ 7 files changed, 28 insertions(+), 2 deletions(-)
On systems with NUMA balancing enabled, it is found that tracking
task activities resulting from NUMA balancing is helpful. NUMA balancing
has two mechanisms for task migration: one is to migrate a task to
an idle CPU within its preferred node, and the other is to swap tasks
located on different nodes when they are on each other's preferred node.
The kernel already has NUMA page migration statistics in
/sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched,
but does not have statistics for task migration/swap.
Add the task migration and swap count accordingly.
The following two new fields:
numa_task_migrated
numa_task_swapped
will be displayed in both
/sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched
Introducing both per-task and per-memcg NUMA balancing statistics helps
to quickly evaluate the performance and resource usage of the target
workload. For example, the user can first identify the container which
has high NUMA balancing activity and then narrow down to a specific task
within that group, and tune the memory policy of that task.
In summary, although it is possible to iterate over /proc/$pid/sched to
find the offending task, the introduction of aggregated per-memcg NUMA
balancing activity for tasks can further help users identify the task in
a divide-and-conquer way.
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
v1->v2:
Update the Documentation/admin-guide/cgroup-v2.rst. (Michal)
---
Documentation/admin-guide/cgroup-v2.rst | 6 ++++++
include/linux/sched.h | 4 ++++
include/linux/vm_event_item.h | 2 ++
kernel/sched/core.c | 10 ++++++++--
kernel/sched/debug.c | 4 ++++
mm/memcontrol.c | 2 ++
mm/vmstat.c | 2 ++
7 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index f293a13b42ed..b698be14942c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1652,6 +1652,12 @@ The following nested keys are defined.
numa_hint_faults (npn)
Number of NUMA hinting faults.
+ numa_task_migrated (npn)
+ Number of task migration by NUMA balancing.
+
+ numa_task_swapped (npn)
+ Number of task swap by NUMA balancing.
+
pgdemote_kswapd
Number of pages demoted by kswapd.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 56ddeb37b5cd..2e91326c16ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -549,6 +549,10 @@ struct sched_statistics {
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
+#ifdef CONFIG_NUMA_BALANCING
+ u64 numa_task_migrated;
+ u64 numa_task_swapped;
+#endif
u64 nr_wakeups;
u64 nr_wakeups_sync;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 5a37cb2b6f93..df8a1b30930f 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -64,6 +64,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NUMA_HINT_FAULTS,
NUMA_HINT_FAULTS_LOCAL,
NUMA_PAGE_MIGRATE,
+ NUMA_TASK_MIGRATE,
+ NUMA_TASK_SWAP,
#endif
#ifdef CONFIG_MIGRATION
PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b434c2f7e3c1..54e7d63f7785 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3352,6 +3352,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
+ __schedstat_inc(p->stats.numa_task_swapped);
+
+ if (p->mm)
+ count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1);
+
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
@@ -7955,8 +7960,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
- /* TODO: This is not properly updating schedstats */
-
+ __schedstat_inc(p->stats.numa_task_migrated);
+ if (p->mm)
+ count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1);
trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 56ae54e0ce6a..f971c2af7912 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P_SCHEDSTAT(nr_failed_migrations_running);
P_SCHEDSTAT(nr_failed_migrations_hot);
P_SCHEDSTAT(nr_forced_migrations);
+#ifdef CONFIG_NUMA_BALANCING
+ P_SCHEDSTAT(numa_task_migrated);
+ P_SCHEDSTAT(numa_task_swapped);
+#endif
P_SCHEDSTAT(nr_wakeups);
P_SCHEDSTAT(nr_wakeups_sync);
P_SCHEDSTAT(nr_wakeups_migrate);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 83c2df73e4b6..1ba1fa9ed8cb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -460,6 +460,8 @@ static const unsigned int memcg_vm_event_stat[] = {
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
+ NUMA_TASK_MIGRATE,
+ NUMA_TASK_SWAP,
#endif
};
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 651318765ebf..4abd2ca05d2a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1342,6 +1342,8 @@ const char * const vmstat_text[] = {
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
+ "numa_task_migrated",
+ "numa_task_swapped",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
--
2.25.1
Hi Chen Yu, I think this is quite useful! I hope it can be picked up. I have one comment below On 4/8/25 03:14, Chen Yu wrote: > On systems with NUMA balancing enabled, it is found that tracking > the task activities due to NUMA balancing is helpful. NUMA balancing > has two mechanisms for task migration: one is to migrate the task to > an idle CPU in its preferred node, the other is to swap tasks on > different nodes if they are on each other's preferred node. > > The kernel already has NUMA page migration statistics in > /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched, > but does not have statistics for task migration/swap. > Add the task migration and swap count accordingly. > > The following two new fields: > > numa_task_migrated > numa_task_swapped > > will be displayed in both > /sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched > > Introducing both pertask and permemcg NUMA balancing statistics helps > to quickly evaluate the performance and resource usage of the target > workload. For example, the user can first identify the container which > has high NUMA balance activity and then narrow down to a specific task > within that group, and tune the memory policy of that task. > In summary, it is plausible to iterate the /proc/$pid/sched to find the > offending task, but the introduction of per memcg tasks' Numa balancing > aggregated activity can further help users identify the task in a > divide-and-conquer way. > > Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> > Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> > Signed-off-by: Chen Yu <yu.c.chen@intel.com> > --- > v1->v2: > Update the Documentation/admin-guide/cgroup-v2.rst. 
(Michal) > --- > Documentation/admin-guide/cgroup-v2.rst | 6 ++++++ > include/linux/sched.h | 4 ++++ > include/linux/vm_event_item.h | 2 ++ > kernel/sched/core.c | 10 ++++++++-- > kernel/sched/debug.c | 4 ++++ > mm/memcontrol.c | 2 ++ > mm/vmstat.c | 2 ++ > 7 files changed, 28 insertions(+), 2 deletions(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > index f293a13b42ed..b698be14942c 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -1652,6 +1652,12 @@ The following nested keys are defined. > numa_hint_faults (npn) > Number of NUMA hinting faults. > > + numa_task_migrated (npn) > + Number of task migration by NUMA balancing. > + > + numa_task_swapped (npn) > + Number of task swap by NUMA balancing. > + > pgdemote_kswapd > Number of pages demoted by kswapd. > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 56ddeb37b5cd..2e91326c16ec 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -549,6 +549,10 @@ struct sched_statistics { > u64 nr_failed_migrations_running; > u64 nr_failed_migrations_hot; > u64 nr_forced_migrations; > +#ifdef CONFIG_NUMA_BALANCING > + u64 numa_task_migrated; > + u64 numa_task_swapped; > +#endif > > u64 nr_wakeups; > u64 nr_wakeups_sync; > diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h > index 5a37cb2b6f93..df8a1b30930f 100644 > --- a/include/linux/vm_event_item.h > +++ b/include/linux/vm_event_item.h > @@ -64,6 +64,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, > NUMA_HINT_FAULTS, > NUMA_HINT_FAULTS_LOCAL, > NUMA_PAGE_MIGRATE, > + NUMA_TASK_MIGRATE, > + NUMA_TASK_SWAP, > #endif > #ifdef CONFIG_MIGRATION > PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index b434c2f7e3c1..54e7d63f7785 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -3352,6 +3352,11 @@ void set_task_cpu(struct task_struct 
*p, unsigned int new_cpu) > #ifdef CONFIG_NUMA_BALANCING > static void __migrate_swap_task(struct task_struct *p, int cpu) > { > + __schedstat_inc(p->stats.numa_task_swapped); > + > + if (p->mm) > + count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1); > + Is p->mm check necessary? I am pretty sure a !p->mm task cannot reach to this point, task_tick_numa() will filter out those tasks, no hinting page fault on such ones. We can add a likely() macro here to minimize the overhead if there is a reason to keep that check. Same comment to the other one in migrate_task_to(). Thanks, Libo > if (task_on_rq_queued(p)) { > struct rq *src_rq, *dst_rq; > struct rq_flags srf, drf; > @@ -7955,8 +7960,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) > if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) > return -EINVAL; > > - /* TODO: This is not properly updating schedstats */ > - > + __schedstat_inc(p->stats.numa_task_migrated); > + if (p->mm) > + count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1); > trace_sched_move_numa(p, curr_cpu, target_cpu); > return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); > } > diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c > index 56ae54e0ce6a..f971c2af7912 100644 > --- a/kernel/sched/debug.c > +++ b/kernel/sched/debug.c > @@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, > P_SCHEDSTAT(nr_failed_migrations_running); > P_SCHEDSTAT(nr_failed_migrations_hot); > P_SCHEDSTAT(nr_forced_migrations); > +#ifdef CONFIG_NUMA_BALANCING > + P_SCHEDSTAT(numa_task_migrated); > + P_SCHEDSTAT(numa_task_swapped); > +#endif > P_SCHEDSTAT(nr_wakeups); > P_SCHEDSTAT(nr_wakeups_sync); > P_SCHEDSTAT(nr_wakeups_migrate); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 83c2df73e4b6..1ba1fa9ed8cb 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -460,6 +460,8 @@ static const unsigned int memcg_vm_event_stat[] = { > NUMA_PAGE_MIGRATE, > NUMA_PTE_UPDATES, > NUMA_HINT_FAULTS, > + 
NUMA_TASK_MIGRATE, > + NUMA_TASK_SWAP, > #endif > }; > > diff --git a/mm/vmstat.c b/mm/vmstat.c > index 651318765ebf..4abd2ca05d2a 100644 > --- a/mm/vmstat.c > +++ b/mm/vmstat.c > @@ -1342,6 +1342,8 @@ const char * const vmstat_text[] = { > "numa_hint_faults", > "numa_hint_faults_local", > "numa_pages_migrated", > + "numa_task_migrated", > + "numa_task_swapped", > #endif > #ifdef CONFIG_MIGRATION > "pgmigrate_success",
Hi Libo, On 4/29/2025 7:10 AM, Libo Chen wrote: > Hi Chen Yu, > > I think this is quite useful! I hope it can be picked up. > > I have one comment below > > On 4/8/25 03:14, Chen Yu wrote: >> On systems with NUMA balancing enabled, it is found that tracking >> the task activities due to NUMA balancing is helpful. NUMA balancing >> has two mechanisms for task migration: one is to migrate the task to >> an idle CPU in its preferred node, the other is to swap tasks on >> different nodes if they are on each other's preferred node. >> >> The kernel already has NUMA page migration statistics in >> /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched, >> but does not have statistics for task migration/swap. >> Add the task migration and swap count accordingly. >> >> The following two new fields: >> >> numa_task_migrated >> numa_task_swapped >> >> will be displayed in both >> /sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched >> >> Introducing both pertask and permemcg NUMA balancing statistics helps >> to quickly evaluate the performance and resource usage of the target >> workload. For example, the user can first identify the container which >> has high NUMA balance activity and then narrow down to a specific task >> within that group, and tune the memory policy of that task. >> In summary, it is plausible to iterate the /proc/$pid/sched to find the >> offending task, but the introduction of per memcg tasks' Numa balancing >> aggregated activity can further help users identify the task in a >> divide-and-conquer way. >> >> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> >> Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> >> Signed-off-by: Chen Yu <yu.c.chen@intel.com> >> --- >> v1->v2: >> Update the Documentation/admin-guide/cgroup-v2.rst. 
(Michal) >> --- >> Documentation/admin-guide/cgroup-v2.rst | 6 ++++++ >> include/linux/sched.h | 4 ++++ >> include/linux/vm_event_item.h | 2 ++ >> kernel/sched/core.c | 10 ++++++++-- >> kernel/sched/debug.c | 4 ++++ >> mm/memcontrol.c | 2 ++ >> mm/vmstat.c | 2 ++ >> 7 files changed, 28 insertions(+), 2 deletions(-) >> >> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst >> index f293a13b42ed..b698be14942c 100644 >> --- a/Documentation/admin-guide/cgroup-v2.rst >> +++ b/Documentation/admin-guide/cgroup-v2.rst >> @@ -1652,6 +1652,12 @@ The following nested keys are defined. >> numa_hint_faults (npn) >> Number of NUMA hinting faults. >> >> + numa_task_migrated (npn) >> + Number of task migration by NUMA balancing. >> + >> + numa_task_swapped (npn) >> + Number of task swap by NUMA balancing. >> + >> pgdemote_kswapd >> Number of pages demoted by kswapd. >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index 56ddeb37b5cd..2e91326c16ec 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -549,6 +549,10 @@ struct sched_statistics { >> u64 nr_failed_migrations_running; >> u64 nr_failed_migrations_hot; >> u64 nr_forced_migrations; >> +#ifdef CONFIG_NUMA_BALANCING >> + u64 numa_task_migrated; >> + u64 numa_task_swapped; >> +#endif >> >> u64 nr_wakeups; >> u64 nr_wakeups_sync; >> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h >> index 5a37cb2b6f93..df8a1b30930f 100644 >> --- a/include/linux/vm_event_item.h >> +++ b/include/linux/vm_event_item.h >> @@ -64,6 +64,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, >> NUMA_HINT_FAULTS, >> NUMA_HINT_FAULTS_LOCAL, >> NUMA_PAGE_MIGRATE, >> + NUMA_TASK_MIGRATE, >> + NUMA_TASK_SWAP, >> #endif >> #ifdef CONFIG_MIGRATION >> PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c >> index b434c2f7e3c1..54e7d63f7785 100644 >> --- a/kernel/sched/core.c >> +++ b/kernel/sched/core.c >> 
@@ -3352,6 +3352,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) >> #ifdef CONFIG_NUMA_BALANCING >> static void __migrate_swap_task(struct task_struct *p, int cpu) >> { >> + __schedstat_inc(p->stats.numa_task_swapped); >> + >> + if (p->mm) >> + count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1); >> + > > Is p->mm check necessary? I am pretty sure a !p->mm task cannot reach to this point, > task_tick_numa() will filter out those tasks, no hinting page fault on such ones. Right, kernel thread is not supposed to reach here. The swap task is obtained from env->best_task, which is assigned by the functions task_numa_compare() and task_numa_assign(), iff that task has a valid numa_preferred_nid . In other words, only when the swap task is a non-kernel thread will it possess a valid numa_preferred_nid. Let me remove these checks both in __migrate_swap_task and migrate_task_to(). Thanks, Chenyu > We can add a likely() macro here to minimize the overhead if there is a reason to > keep that check. > > Same comment to the other one in migrate_task_to(). > >
On 4/28/25 16:10, Libo Chen wrote: > Hi Chen Yu, > > I think this is quite useful! I hope it can be picked up. > oops, my bad. It looks like this has been picked up. But I think my comment still applies~ > I have one comment below > > On 4/8/25 03:14, Chen Yu wrote: >> On systems with NUMA balancing enabled, it is found that tracking >> the task activities due to NUMA balancing is helpful. NUMA balancing >> has two mechanisms for task migration: one is to migrate the task to >> an idle CPU in its preferred node, the other is to swap tasks on >> different nodes if they are on each other's preferred node. >> >> The kernel already has NUMA page migration statistics in >> /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched, >> but does not have statistics for task migration/swap. >> Add the task migration and swap count accordingly. >> >> The following two new fields: >> >> numa_task_migrated >> numa_task_swapped >> >> will be displayed in both >> /sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched >> >> Introducing both pertask and permemcg NUMA balancing statistics helps >> to quickly evaluate the performance and resource usage of the target >> workload. For example, the user can first identify the container which >> has high NUMA balance activity and then narrow down to a specific task >> within that group, and tune the memory policy of that task. >> In summary, it is plausible to iterate the /proc/$pid/sched to find the >> offending task, but the introduction of per memcg tasks' Numa balancing >> aggregated activity can further help users identify the task in a >> divide-and-conquer way. >> >> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com> >> Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com> >> Signed-off-by: Chen Yu <yu.c.chen@intel.com> >> --- >> v1->v2: >> Update the Documentation/admin-guide/cgroup-v2.rst. 
(Michal) >> --- >> Documentation/admin-guide/cgroup-v2.rst | 6 ++++++ >> include/linux/sched.h | 4 ++++ >> include/linux/vm_event_item.h | 2 ++ >> kernel/sched/core.c | 10 ++++++++-- >> kernel/sched/debug.c | 4 ++++ >> mm/memcontrol.c | 2 ++ >> mm/vmstat.c | 2 ++ >> 7 files changed, 28 insertions(+), 2 deletions(-) >> >> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst >> index f293a13b42ed..b698be14942c 100644 >> --- a/Documentation/admin-guide/cgroup-v2.rst >> +++ b/Documentation/admin-guide/cgroup-v2.rst >> @@ -1652,6 +1652,12 @@ The following nested keys are defined. >> numa_hint_faults (npn) >> Number of NUMA hinting faults. >> >> + numa_task_migrated (npn) >> + Number of task migration by NUMA balancing. >> + >> + numa_task_swapped (npn) >> + Number of task swap by NUMA balancing. >> + >> pgdemote_kswapd >> Number of pages demoted by kswapd. >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index 56ddeb37b5cd..2e91326c16ec 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -549,6 +549,10 @@ struct sched_statistics { >> u64 nr_failed_migrations_running; >> u64 nr_failed_migrations_hot; >> u64 nr_forced_migrations; >> +#ifdef CONFIG_NUMA_BALANCING >> + u64 numa_task_migrated; >> + u64 numa_task_swapped; >> +#endif >> >> u64 nr_wakeups; >> u64 nr_wakeups_sync; >> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h >> index 5a37cb2b6f93..df8a1b30930f 100644 >> --- a/include/linux/vm_event_item.h >> +++ b/include/linux/vm_event_item.h >> @@ -64,6 +64,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, >> NUMA_HINT_FAULTS, >> NUMA_HINT_FAULTS_LOCAL, >> NUMA_PAGE_MIGRATE, >> + NUMA_TASK_MIGRATE, >> + NUMA_TASK_SWAP, >> #endif >> #ifdef CONFIG_MIGRATION >> PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c >> index b434c2f7e3c1..54e7d63f7785 100644 >> --- a/kernel/sched/core.c >> +++ b/kernel/sched/core.c >> 
@@ -3352,6 +3352,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) >> #ifdef CONFIG_NUMA_BALANCING >> static void __migrate_swap_task(struct task_struct *p, int cpu) >> { >> + __schedstat_inc(p->stats.numa_task_swapped); >> + >> + if (p->mm) >> + count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1); >> + > > Is p->mm check necessary? I am pretty sure a !p->mm task cannot reach to this point, > task_tick_numa() will filter out those tasks, no hinting page fault on such ones. > We can add a likely() macro here to minimize the overhead if there is a reason to > keep that check. > > Same comment to the other one in migrate_task_to(). > > > Thanks, > Libo > >> if (task_on_rq_queued(p)) { >> struct rq *src_rq, *dst_rq; >> struct rq_flags srf, drf; >> @@ -7955,8 +7960,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) >> if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) >> return -EINVAL; >> >> - /* TODO: This is not properly updating schedstats */ >> - >> + __schedstat_inc(p->stats.numa_task_migrated); >> + if (p->mm) >> + count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1); >> trace_sched_move_numa(p, curr_cpu, target_cpu); >> return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); >> } >> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c >> index 56ae54e0ce6a..f971c2af7912 100644 >> --- a/kernel/sched/debug.c >> +++ b/kernel/sched/debug.c >> @@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, >> P_SCHEDSTAT(nr_failed_migrations_running); >> P_SCHEDSTAT(nr_failed_migrations_hot); >> P_SCHEDSTAT(nr_forced_migrations); >> +#ifdef CONFIG_NUMA_BALANCING >> + P_SCHEDSTAT(numa_task_migrated); >> + P_SCHEDSTAT(numa_task_swapped); >> +#endif >> P_SCHEDSTAT(nr_wakeups); >> P_SCHEDSTAT(nr_wakeups_sync); >> P_SCHEDSTAT(nr_wakeups_migrate); >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c >> index 83c2df73e4b6..1ba1fa9ed8cb 100644 >> --- a/mm/memcontrol.c >> +++ b/mm/memcontrol.c >> @@ -460,6 +460,8 @@ 
static const unsigned int memcg_vm_event_stat[] = { >> NUMA_PAGE_MIGRATE, >> NUMA_PTE_UPDATES, >> NUMA_HINT_FAULTS, >> + NUMA_TASK_MIGRATE, >> + NUMA_TASK_SWAP, >> #endif >> }; >> >> diff --git a/mm/vmstat.c b/mm/vmstat.c >> index 651318765ebf..4abd2ca05d2a 100644 >> --- a/mm/vmstat.c >> +++ b/mm/vmstat.c >> @@ -1342,6 +1342,8 @@ const char * const vmstat_text[] = { >> "numa_hint_faults", >> "numa_hint_faults_local", >> "numa_pages_migrated", >> + "numa_task_migrated", >> + "numa_task_swapped", >> #endif >> #ifdef CONFIG_MIGRATION >> "pgmigrate_success", >
© 2016 - 2025 Red Hat, Inc.