From: Jiayuan Chen <jiayuan.chen@shopee.com>
In systems with multiple NUMA nodes, memory imbalance between nodes often
occurs. To address this, we typically tune parameters like scan_size_mb or
scan_period_{min,max}_ms to allow processes to migrate pages between NUMA
nodes.
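For reference, a quick way to inspect and adjust these knobs (paths are
illustrative and vary by kernel version; recent kernels expose them under
debugfs, older ones as numa_balancing_* sysctls in /proc/sys/kernel/):
```bash
# Illustrative only; exact paths depend on kernel version and config.
grep . /sys/kernel/debug/sched/numa_balancing/scan_*    # current settings
echo 512 > /sys/kernel/debug/sched/numa_balancing/scan_size_mb
echo 500 > /sys/kernel/debug/sched/numa_balancing/scan_period_min_ms
```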
Currently, task_numa_work() holds the mmap_lock for the entire pass, which
can significantly impact process performance, especially memory operations.
This patch introduces a pair of tracepoints that bracket each pass and
record the number of scanned and migrated pages, so that the pass duration
and an efficiency figure similar to %vmeff in 'sar -B' can be derived.
These metrics help evaluate whether the adjusted NUMA balancing parameters
are properly tuned.
Here's an example bpftrace script:
```bash
bpftrace -e '
tracepoint:sched:sched_numa_balance_start
{
	@start_time[cpu] = nsecs;
}

tracepoint:sched:sched_numa_balance_end
{
	if (@start_time[cpu] > 0) {
		$cost = nsecs - @start_time[cpu];
		printf("task '%s' migrate cost %lu, scanned %lu, migrated %lu\n",
		       args.comm, $cost, args.scanned, args.migrated);
	}
}
'
```
Sample output:
Attaching 2 probes...
task 'rs:main Q:Reg' migrate cost 5584655, scanned 24516, migrated 22373
task 'systemd-journal' migrate cost 123191, scanned 6308, migrated 0
task 'wrk' migrate cost 894026, scanned 5842, migrated 5841
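To turn the numbers above into a %vmeff-style efficiency figure
(migrated / scanned * 100), a minimal post-processing sketch, assuming the
bpftrace output was saved to a file such as numa_balance.log (the file name
and parsing are illustrative):
```bash
# Compute migrated/scanned per task from the bpftrace output format above.
awk -F', ' '/migrate cost/ {
	split($2, s, " "); split($3, m, " ")   # s[2] = scanned, m[2] = migrated
	label = $1; sub(/ migrate cost.*/, "", label)
	if (s[2] > 0)
		printf "%s: %.1f%% (%d/%d)\n", label, m[2] * 100.0 / s[2], m[2], s[2]
}' numa_balance.log
```
For example, the 'wrk' task above migrated 5841 of 5842 scanned pages,
i.e. ~100% efficiency, while 'systemd-journal' migrated none.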
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
---
include/trace/events/sched.h | 60 ++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 14 +++++++--
2 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..e24bf700a614 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -804,6 +804,66 @@ TRACE_EVENT(sched_skip_cpuset_numa,
__entry->ngid,
MAX_NUMNODES, __entry->mem_allowed)
);
+
+TRACE_EVENT(sched_numa_balance_start,
+
+ TP_PROTO(struct task_struct *tsk),
+
+ TP_ARGS(tsk),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(pid_t, tgid)
+ __field(pid_t, ngid)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = task_pid_nr(tsk);
+ __entry->tgid = task_tgid_nr(tsk);
+ __entry->ngid = task_numa_group_id(tsk);
+ ),
+
+ TP_printk("comm=%s pid=%d tgid=%d ngid=%d",
+ __entry->comm,
+ __entry->pid,
+ __entry->tgid,
+ __entry->ngid)
+);
+
+TRACE_EVENT(sched_numa_balance_end,
+
+ TP_PROTO(struct task_struct *tsk, unsigned long scanned, unsigned long migrated),
+
+ TP_ARGS(tsk, scanned, migrated),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(pid_t, tgid)
+ __field(pid_t, ngid)
+ __field(unsigned long, migrated)
+ __field(unsigned long, scanned)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+ __entry->pid = task_pid_nr(tsk);
+ __entry->tgid = task_tgid_nr(tsk);
+ __entry->ngid = task_numa_group_id(tsk);
+ __entry->migrated = migrated;
+ __entry->scanned = scanned;
+ ),
+
+ TP_printk("comm=%s pid=%d tgid=%d ngid=%d scanned=%lu migrated=%lu",
+ __entry->comm,
+ __entry->pid,
+ __entry->tgid,
+ __entry->ngid,
+ __entry->scanned,
+ __entry->migrated)
+);
#endif /* CONFIG_NUMA_BALANCING */
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25970dbbb279..173c9c8397e2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3294,6 +3294,9 @@ static void task_numa_work(struct callback_head *work)
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
+ unsigned long nr_scanned = 0;
+ unsigned long total_migrated = 0;
+ unsigned long total_scanned = 0;
long pages, virtpages;
struct vma_iterator vmi;
bool vma_pids_skipped;
@@ -3359,6 +3362,7 @@ static void task_numa_work(struct callback_head *work)
if (!mmap_read_trylock(mm))
return;
+ trace_sched_numa_balance_start(p);
/*
* VMAs are skipped if the current PID has not trapped a fault within
* the VMA recently. Allow scanning to be forced if there is no
@@ -3477,6 +3481,10 @@ static void task_numa_work(struct callback_head *work)
end = min(end, vma->vm_end);
nr_pte_updates = change_prot_numa(vma, start, end);
+ nr_scanned = (end - start) >> PAGE_SHIFT;
+ total_migrated += nr_pte_updates;
+ total_scanned += nr_scanned;
+
/*
* Try to scan sysctl_numa_balancing_size worth of
* hpages that have at least one present PTE that
@@ -3486,8 +3494,8 @@ static void task_numa_work(struct callback_head *work)
* areas faster.
*/
if (nr_pte_updates)
- pages -= (end - start) >> PAGE_SHIFT;
- virtpages -= (end - start) >> PAGE_SHIFT;
+ pages -= nr_scanned;
+ virtpages -= nr_scanned;
start = end;
if (pages <= 0 || virtpages <= 0)
@@ -3528,6 +3536,8 @@ static void task_numa_work(struct callback_head *work)
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
+
+ trace_sched_numa_balance_end(p, total_scanned, total_migrated);
mmap_read_unlock(mm);
/*
--
2.43.0
On Wed, 29 Oct 2025 21:22:55 +0800
Jiayuan Chen <jiayuan.chen@linux.dev> wrote:
> From: Jiayuan Chen <jiayuan.chen@shopee.com>
>
> In systems with multiple NUMA nodes, memory imbalance between nodes often
> occurs. To address this, we typically tune parameters like scan_size_mb or
> scan_period_{min,max}_ms to allow processes to migrate pages between NUMA
> nodes.
>
> Currently, the migration task task_numa_work() holds the mmap_lock during
> the entire migration process, which can significantly impact process
> performance, especially for memory operations. This patch introduces a new
> tracepoint that records the migration duration, along with the number of
> scanned pages and migrated pages. These metrics can be used to calculate
> efficiency metrics similar to %vmeff in 'sar -B'.
>
> These metrics help evaluate whether the adjusted NUMA balancing parameters
> are properly tuned.
>
> Here's an example bpftrace script:
> ```bash
>
> bpftrace -e '
> tracepoint:sched:sched_numa_balance_start
> {
> @start_time[cpu] = nsecs;
> }
>
> tracepoint:sched:sched_numa_balance_end {
> if (@start_time[cpu] > 0) {
> $cost = nsecs - @start_time[cpu];
> printf("task '%s' migrate cost %lu, scanned %lu, migrated %lu\n",
> args.comm, $cost, args.scanned, args.migrated);
> }
> }
> '
BTW, you don't need bpf for this either:
# trace-cmd sqlhist -e -n numa_balance SELECT end.comm, TIMESTAMP_DELTA_USECS as cost, \
end.scanned, end.migrated FROM sched_numa_balance_start AS start \
JOIN sched_numa_balance_end AS end ON start.common_pid = end.common_pid
# trace-cmd start -e numa_balance
[ I'd show the output, but my test boxes don't have NUMA ]
You could also make a histogram with it:
# trace-cmd sqlhist -e SELECT start.comm, 'CAST(start.cost AS BUCKETS=50)' FROM numa_balance AS start
And then cat /sys/kernel/tracing/events/synthetic/numa_balance/hist
Just to give you an idea.
> ```
> Sample output:
> Attaching 2 probes...
> task 'rs:main Q:Reg' migrate cost 5584655, scanned 24516, migrated 22373
> task 'systemd-journal' migrate cost 123191, scanned 6308, migrated 0
> task 'wrk' migrate cost 894026, scanned 5842, migrated 5841
>
> Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
> ---
> include/trace/events/sched.h | 60 ++++++++++++++++++++++++++++++++++++
> kernel/sched/fair.c | 14 +++++++--
> 2 files changed, 72 insertions(+), 2 deletions(-)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 7b2645b50e78..e24bf700a614 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -804,6 +804,66 @@ TRACE_EVENT(sched_skip_cpuset_numa,
> __entry->ngid,
> MAX_NUMNODES, __entry->mem_allowed)
> );
> +
> +TRACE_EVENT(sched_numa_balance_start,
> +
> + TP_PROTO(struct task_struct *tsk),
> +
> + TP_ARGS(tsk),
> +
> + TP_STRUCT__entry(
> + __array(char, comm, TASK_COMM_LEN)
Please use __string() and not __array(). I'm trying to get rid of these for
task comm.
> + __field(pid_t, pid)
> + __field(pid_t, tgid)
> + __field(pid_t, ngid)
> + ),
> +
> + TP_fast_assign(
> + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
> + __entry->pid = task_pid_nr(tsk);
> + __entry->tgid = task_tgid_nr(tsk);
> + __entry->ngid = task_numa_group_id(tsk);
> + ),
> +
> + TP_printk("comm=%s pid=%d tgid=%d ngid=%d",
> + __entry->comm,
> + __entry->pid,
> + __entry->tgid,
> + __entry->ngid)
> +);
> +
> +TRACE_EVENT(sched_numa_balance_end,
> +
> + TP_PROTO(struct task_struct *tsk, unsigned long scanned, unsigned long migrated),
> +
> + TP_ARGS(tsk, scanned, migrated),
> +
> + TP_STRUCT__entry(
> + __array(char, comm, TASK_COMM_LEN)
> + __field(pid_t, pid)
> + __field(pid_t, tgid)
> + __field(pid_t, ngid)
> + __field(unsigned long, migrated)
> + __field(unsigned long, scanned)
> + ),
> +
> + TP_fast_assign(
> + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
> + __entry->pid = task_pid_nr(tsk);
> + __entry->tgid = task_tgid_nr(tsk);
> + __entry->ngid = task_numa_group_id(tsk);
> + __entry->migrated = migrated;
> + __entry->scanned = scanned;
> + ),
> +
> + TP_printk("comm=%s pid=%d tgid=%d ngid=%d scanned=%lu migrated=%lu",
> + __entry->comm,
> + __entry->pid,
> + __entry->tgid,
> + __entry->ngid,
> + __entry->scanned,
> + __entry->migrated)
> +);
> #endif /* CONFIG_NUMA_BALANCING */
>
> /*
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 25970dbbb279..173c9c8397e2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3294,6 +3294,9 @@ static void task_numa_work(struct callback_head *work)
> struct vm_area_struct *vma;
> unsigned long start, end;
> unsigned long nr_pte_updates = 0;
> + unsigned long nr_scanned = 0;
> + unsigned long total_migrated = 0;
> + unsigned long total_scanned = 0;
> long pages, virtpages;
> struct vma_iterator vmi;
> bool vma_pids_skipped;
> @@ -3359,6 +3362,7 @@ static void task_numa_work(struct callback_head *work)
> if (!mmap_read_trylock(mm))
> return;
>
> + trace_sched_numa_balance_start(p);
> /*
> * VMAs are skipped if the current PID has not trapped a fault within
> * the VMA recently. Allow scanning to be forced if there is no
> @@ -3477,6 +3481,10 @@ static void task_numa_work(struct callback_head *work)
> end = min(end, vma->vm_end);
> nr_pte_updates = change_prot_numa(vma, start, end);
>
> + nr_scanned = (end - start) >> PAGE_SHIFT;
> + total_migrated += nr_pte_updates;
> + total_scanned += nr_scanned;
> +
This will require the scheduler maintainers to agree on this for acceptance.
Will kprobes not do?
-- Steve
> /*
> * Try to scan sysctl_numa_balancing_size worth of
> * hpages that have at least one present PTE that
> @@ -3486,8 +3494,8 @@ static void task_numa_work(struct callback_head *work)
> * areas faster.
> */
> if (nr_pte_updates)
> - pages -= (end - start) >> PAGE_SHIFT;
> - virtpages -= (end - start) >> PAGE_SHIFT;
> + pages -= nr_scanned;
> + virtpages -= nr_scanned;
>
> start = end;
> if (pages <= 0 || virtpages <= 0)
> @@ -3528,6 +3536,8 @@ static void task_numa_work(struct callback_head *work)
> mm->numa_scan_offset = start;
> else
> reset_ptenuma_scan(p);
> +
> + trace_sched_numa_balance_end(p, total_scanned, total_migrated);
> mmap_read_unlock(mm);
>
> /*