On some large machines with a high number of CPUs running a 64K pagesize
kernel, we found that the 'RES' field reported by the top command is always 0
for some processes, which causes a lot of confusion for users.

PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
875525 root 20 0 12480 0 0 R 0.3 0.0 0:00.08 top
1 root 20 0 172800 0 0 S 0.0 0.0 0:04.52 systemd

The main reason is that the batch size of the percpu counter is quite large
on these machines, so a significant amount of the count stays cached per CPU,
ever since commit f1a7941243c1 ("mm: convert mm's rss stats into
percpu_counter") converted mm's rss stats into percpu_counter. Intuitively,
the batch number should be tuned down, but on some paths performance may take
precedence over statistical accuracy. Therefore, introduce a new interface
that sums in the percpu counts and show that value to users, which removes
the confusion. This read path is not expected to be performance-critical, so
the modification should be acceptable.

In addition, 'mm->rss_stat' is updated via add_mm_counter() and
dec/inc_mm_counter(), which are all wrappers around percpu_counter_add_batch().
In percpu_counter_add_batch(), per-CPU batch caching is used to avoid
'fbc->lock' contention. This patch changes task_mem() and task_statm() to read
the accurate mm counters under 'fbc->lock', but this should not exacerbate
kernel 'mm->rss_stat' lock contention thanks to the percpu batch caching of
the mm counters. The following test also confirms this theoretical analysis.
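
For readers unfamiliar with percpu_counter, the batching idea can be modelled
in a few lines of userspace C. This is only an illustration of the concept,
not the kernel's lib/percpu_counter.c: the function names, the 4-CPU setup and
the batch of 8 are made up for the demo.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4
#define BATCH   8   /* the kernel default is max(32, 2 * nr_cpus) */

static long global_count;          /* plays the role of fbc->count    */
static long cpu_delta[NR_CPUS];    /* plays the role of fbc->counters */

/* like percpu_counter_add_batch(): lock-free until the local delta hits BATCH */
static void counter_add(int cpu, long amount)
{
	cpu_delta[cpu] += amount;
	if (labs(cpu_delta[cpu]) >= BATCH) {
		/* this is the only place fbc->lock would be taken */
		global_count += cpu_delta[cpu];
		cpu_delta[cpu] = 0;
	}
}

/* like percpu_counter_read_positive(): fast, but ignores the cached deltas */
static long counter_read(void)
{
	return global_count > 0 ? global_count : 0;
}

/* like percpu_counter_sum_positive(): adds every CPU's delta under the lock */
static long counter_sum(void)
{
	long sum = global_count;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += cpu_delta[cpu];
	return sum > 0 ? sum : 0;
}

int main(void)
{
	counter_add(0, 5);   /* below BATCH: cached per CPU, global stays 0 */
	counter_add(1, 6);
	printf("read (like get_mm_counter):     %ld\n", counter_read()); /* 0  */
	printf("sum  (like get_mm_counter_sum): %ld\n", counter_sum());  /* 11 */
	return 0;
}

get_mm_counter() corresponds to counter_read() here, which is why a small or
freshly started task can show an RSS of 0 while the whole count still sits in
the per-CPU caches; the new get_mm_counter_sum() corresponds to counter_sum().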

I ran stress-ng stressing anon page faults in 32 threads on my 32-core
machine, while simultaneously running a script that starts 32 threads to
busy-loop pread() each stress-ng thread's /proc/pid/status interface. From
the following data, I did not observe any obvious impact of this patch on the
stress-ng results.

w/o patch:
stress-ng: info: [6848] 4,399,219,085,152 CPU Cycles 67.327 B/sec
stress-ng: info: [6848] 1,616,524,844,832 Instructions 24.740 B/sec (0.367 instr. per cycle)
stress-ng: info: [6848] 39,529,792 Page Faults Total 0.605 M/sec
stress-ng: info: [6848] 39,529,792 Page Faults Minor 0.605 M/sec
w/patch:
stress-ng: info: [2485] 4,462,440,381,856 CPU Cycles 68.382 B/sec
stress-ng: info: [2485] 1,615,101,503,296 Instructions 24.750 B/sec (0.362 instr. per cycle)
stress-ng: info: [2485] 39,439,232 Page Faults Total 0.604 M/sec
stress-ng: info: [2485] 39,439,232 Page Faults Minor 0.604 M/sec
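
For completeness, the reader side of the busy-loop pread script described
above is trivial; a minimal sketch of what each of the 32 reader threads does
(the actual script is not included in this posting, so the buffer size and
argument handling below are assumptions):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[4096];
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s /proc/<pid>/status\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (;;)			/* busy-loop pread of the status file */
		if (pread(fd, buf, sizeof(buf), 0) < 0)
			break;
	return 0;
}
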
Tested-by: Donet Tom <donettom@linux.ibm.com>
Reviewed-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
Changes from v1:
- Update the commit message to add some measurements.
- Add acked tag from Michal. Thanks.
- Drop the Fixes tag.
Changes from RFC:
- Collect reviewed and tested tags. Thanks.
- Add Fixes tag.
---
fs/proc/task_mmu.c | 14 +++++++-------
include/linux/mm.h | 5 +++++
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b9e4fbbdf6e6..f629e6526935 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -36,9 +36,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long text, lib, swap, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
- anon = get_mm_counter(mm, MM_ANONPAGES);
- file = get_mm_counter(mm, MM_FILEPAGES);
- shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+ anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+ file = get_mm_counter_sum(mm, MM_FILEPAGES);
+ shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
text = min(text, mm->exec_vm << PAGE_SHIFT);
lib = (mm->exec_vm << PAGE_SHIFT) - text;
- swap = get_mm_counter(mm, MM_SWAPENTS);
+ swap = get_mm_counter_sum(mm, MM_SWAPENTS);
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -92,12 +92,12 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
+ *shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+ get_mm_counter_sum(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->data_vm + mm->stack_vm;
- *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+ *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
return mm->total_vm;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 185424858f23..15ec5cfe9515 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2568,6 +2568,11 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
return percpu_counter_read_positive(&mm->rss_stat[member]);
}
+static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
+{
+ return percpu_counter_sum_positive(&mm->rss_stat[member]);
+}
+
void mm_trace_rss_stat(struct mm_struct *mm, int member);
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
--
2.43.5
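
A quick way to observe the symptom described in the changelog is a small
program that touches some anonymous memory and then reads its own RSS back
from /proc/self/status. This is a minimal sketch, not taken from the thread:
the 8 MiB size and the parsing of the VmRSS:/RssAnon: lines are illustration
choices. On an affected many-CPU, 64K-pagesize kernel the printed values can
stay 0 without this patch and report roughly the touched size with it.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 8 << 20;	/* 8 MiB of anonymous memory (arbitrary) */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char line[256];
	FILE *f;

	if (p == MAP_FAILED)
		return 1;
	memset(p, 1, len);	/* fault the pages in */

	f = fopen("/proc/self/status", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmRSS:", 6) ||
		    !strncmp(line, "RssAnon:", 8))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
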
Baolin Wang <baolin.wang@linux.alibaba.com> writes:
> On some large machines with a high number of CPUs running a 64K pagesize
> kernel, we found that the 'RES' field is always 0 displayed by the top
> command for some processes, which will cause a lot of confusion for users.
>
> PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
> 875525 root 20 0 12480 0 0 R 0.3 0.0 0:00.08 top
> 1 root 20 0 172800 0 0 S 0.0 0.0 0:04.52 systemd
>
> The main reason is that the batch size of the percpu counter is quite large
> on these machines, caching a significant percpu value, since converting mm's
> rss stats into percpu_counter by commit f1a7941243c1 ("mm: convert mm's rss
> stats into percpu_counter"). Intuitively, the batch number should be optimized,
> but on some paths, performance may take precedence over statistical accuracy.
> Therefore, introducing a new interface to add the percpu statistical count
> and display it to users, which can remove the confusion. In addition, this
> change is not expected to be on a performance-critical path, so the modification
> should be acceptable.
>
> In addition, the 'mm->rss_stat' is updated by using add_mm_counter() and
> dec/inc_mm_counter(), which are all wrappers around percpu_counter_add_batch().
> In percpu_counter_add_batch(), there is percpu batch caching to avoid 'fbc->lock'
> contention. This patch changes task_mem() and task_statm() to get the accurate
> mm counters under the 'fbc->lock', but this should not exacerbate kernel
> 'mm->rss_stat' lock contention due to the percpu batch caching of the mm
> counters. The following test also confirm the theoretical analysis.
>
> I run the stress-ng that stresses anon page faults in 32 threads on my 32 cores
> machine, while simultaneously running a script that starts 32 threads to
> busy-loop pread each stress-ng thread's /proc/pid/status interface. From the
> following data, I did not observe any obvious impact of this patch on the
> stress-ng tests.
>
> w/o patch:
> stress-ng: info: [6848] 4,399,219,085,152 CPU Cycles 67.327 B/sec
> stress-ng: info: [6848] 1,616,524,844,832 Instructions 24.740 B/sec (0.367 instr. per cycle)
> stress-ng: info: [6848] 39,529,792 Page Faults Total 0.605 M/sec
> stress-ng: info: [6848] 39,529,792 Page Faults Minor 0.605 M/sec
>
> w/patch:
> stress-ng: info: [2485] 4,462,440,381,856 CPU Cycles 68.382 B/sec
> stress-ng: info: [2485] 1,615,101,503,296 Instructions 24.750 B/sec (0.362 instr. per cycle)
> stress-ng: info: [2485] 39,439,232 Page Faults Total 0.604 M/sec
> stress-ng: info: [2485] 39,439,232 Page Faults Minor 0.604 M/sec
>
> Tested-by Donet Tom <donettom@linux.ibm.com>
> Reviewed-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> Acked-by: SeongJae Park <sj@kernel.org>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
> Changes from v1:
> - Update the commit message to add some measurements.
> - Add acked tag from Michal. Thanks.
> - Drop the Fixes tag.
Any reason why we dropped the Fixes tag? I see there were a series of
discussion on v1 and it got concluded that the fix was correct, then why
drop the fixes tag?
Background: Recently a few folks internally reported this issue on Power
too, e.g.
$ ps -o rss $$
RSS
0
So it would be nice if we had a Fixes tag so that it gets backported
to all stable releases. Does anybody see any concern with that?
-ritesh
On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>
> > On some large machines with a high number of CPUs running a 64K pagesize
> > kernel, we found that the 'RES' field is always 0 displayed by the top
> > command for some processes, which will cause a lot of confusion for users.
> >
> > PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
> > 875525 root 20 0 12480 0 0 R 0.3 0.0 0:00.08 top
> > 1 root 20 0 172800 0 0 S 0.0 0.0 0:04.52 systemd
> >
> > The main reason is that the batch size of the percpu counter is quite large
> > on these machines, caching a significant percpu value, since converting mm's
> > rss stats into percpu_counter by commit f1a7941243c1 ("mm: convert mm's rss
> > stats into percpu_counter"). Intuitively, the batch number should be optimized,
> > but on some paths, performance may take precedence over statistical accuracy.
> > Therefore, introducing a new interface to add the percpu statistical count
> > and display it to users, which can remove the confusion. In addition, this
> > change is not expected to be on a performance-critical path, so the modification
> > should be acceptable.
> >
> > In addition, the 'mm->rss_stat' is updated by using add_mm_counter() and
> > dec/inc_mm_counter(), which are all wrappers around percpu_counter_add_batch().
> > In percpu_counter_add_batch(), there is percpu batch caching to avoid 'fbc->lock'
> > contention. This patch changes task_mem() and task_statm() to get the accurate
> > mm counters under the 'fbc->lock', but this should not exacerbate kernel
> > 'mm->rss_stat' lock contention due to the percpu batch caching of the mm
> > counters. The following test also confirm the theoretical analysis.
> >
> > I run the stress-ng that stresses anon page faults in 32 threads on my 32 cores
> > machine, while simultaneously running a script that starts 32 threads to
> > busy-loop pread each stress-ng thread's /proc/pid/status interface. From the
> > following data, I did not observe any obvious impact of this patch on the
> > stress-ng tests.
> >
> > w/o patch:
> > stress-ng: info: [6848] 4,399,219,085,152 CPU Cycles 67.327 B/sec
> > stress-ng: info: [6848] 1,616,524,844,832 Instructions 24.740 B/sec (0.367 instr. per cycle)
> > stress-ng: info: [6848] 39,529,792 Page Faults Total 0.605 M/sec
> > stress-ng: info: [6848] 39,529,792 Page Faults Minor 0.605 M/sec
> >
> > w/patch:
> > stress-ng: info: [2485] 4,462,440,381,856 CPU Cycles 68.382 B/sec
> > stress-ng: info: [2485] 1,615,101,503,296 Instructions 24.750 B/sec (0.362 instr. per cycle)
> > stress-ng: info: [2485] 39,439,232 Page Faults Total 0.604 M/sec
> > stress-ng: info: [2485] 39,439,232 Page Faults Minor 0.604 M/sec
> >
> > Tested-by Donet Tom <donettom@linux.ibm.com>
> > Reviewed-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> > Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> > Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> > Acked-by: SeongJae Park <sj@kernel.org>
> > Acked-by: Michal Hocko <mhocko@suse.com>
> > Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > ---
> > Changes from v1:
> > - Update the commit message to add some measurements.
> > - Add acked tag from Michal. Thanks.
> > - Drop the Fixes tag.
>
> Any reason why we dropped the Fixes tag? I see there were a series of
> discussion on v1 and it got concluded that the fix was correct, then why
> drop the fixes tag?
This seems more like an improvement than a bug fix.
--
Michal Hocko
SUSE Labs
On 2025/6/9 15:35, Michal Hocko wrote:
> On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>
>>> On some large machines with a high number of CPUs running a 64K pagesize
>>> kernel, we found that the 'RES' field is always 0 displayed by the top
>>> command for some processes, which will cause a lot of confusion for users.
>>>
>>> PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
>>> 875525 root 20 0 12480 0 0 R 0.3 0.0 0:00.08 top
>>> 1 root 20 0 172800 0 0 S 0.0 0.0 0:04.52 systemd
>>>
>>> The main reason is that the batch size of the percpu counter is quite large
>>> on these machines, caching a significant percpu value, since converting mm's
>>> rss stats into percpu_counter by commit f1a7941243c1 ("mm: convert mm's rss
>>> stats into percpu_counter"). Intuitively, the batch number should be optimized,
>>> but on some paths, performance may take precedence over statistical accuracy.
>>> Therefore, introducing a new interface to add the percpu statistical count
>>> and display it to users, which can remove the confusion. In addition, this
>>> change is not expected to be on a performance-critical path, so the modification
>>> should be acceptable.
>>>
>>> In addition, the 'mm->rss_stat' is updated by using add_mm_counter() and
>>> dec/inc_mm_counter(), which are all wrappers around percpu_counter_add_batch().
>>> In percpu_counter_add_batch(), there is percpu batch caching to avoid 'fbc->lock'
>>> contention. This patch changes task_mem() and task_statm() to get the accurate
>>> mm counters under the 'fbc->lock', but this should not exacerbate kernel
>>> 'mm->rss_stat' lock contention due to the percpu batch caching of the mm
>>> counters. The following test also confirm the theoretical analysis.
>>>
>>> I run the stress-ng that stresses anon page faults in 32 threads on my 32 cores
>>> machine, while simultaneously running a script that starts 32 threads to
>>> busy-loop pread each stress-ng thread's /proc/pid/status interface. From the
>>> following data, I did not observe any obvious impact of this patch on the
>>> stress-ng tests.
>>>
>>> w/o patch:
>>> stress-ng: info: [6848] 4,399,219,085,152 CPU Cycles 67.327 B/sec
>>> stress-ng: info: [6848] 1,616,524,844,832 Instructions 24.740 B/sec (0.367 instr. per cycle)
>>> stress-ng: info: [6848] 39,529,792 Page Faults Total 0.605 M/sec
>>> stress-ng: info: [6848] 39,529,792 Page Faults Minor 0.605 M/sec
>>>
>>> w/patch:
>>> stress-ng: info: [2485] 4,462,440,381,856 CPU Cycles 68.382 B/sec
>>> stress-ng: info: [2485] 1,615,101,503,296 Instructions 24.750 B/sec (0.362 instr. per cycle)
>>> stress-ng: info: [2485] 39,439,232 Page Faults Total 0.604 M/sec
>>> stress-ng: info: [2485] 39,439,232 Page Faults Minor 0.604 M/sec
>>>
>>> Tested-by Donet Tom <donettom@linux.ibm.com>
>>> Reviewed-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
>>> Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
>>> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
>>> Acked-by: SeongJae Park <sj@kernel.org>
>>> Acked-by: Michal Hocko <mhocko@suse.com>
>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> ---
>>> Changes from v1:
>>> - Update the commit message to add some measurements.
>>> - Add acked tag from Michal. Thanks.
>>> - Drop the Fixes tag.
>>
>> Any reason why we dropped the Fixes tag? I see there were a series of
>> discussion on v1 and it got concluded that the fix was correct, then why
>> drop the fixes tag?
>
> This seems more like an improvement than a bug fix.
Yes. I don't have a strong opinion on this, but we (Alibaba) will
backport it manually, because some of user-space monitoring tools depend
on these statistics.
Baolin Wang <baolin.wang@linux.alibaba.com> writes:
> On 2025/6/9 15:35, Michal Hocko wrote:
>> On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
>>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>>
>>>> On some large machines with a high number of CPUs running a 64K pagesize
>>>> kernel, we found that the 'RES' field is always 0 displayed by the top
>>>> command for some processes, which will cause a lot of confusion for users.
>>>>
>>>> PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
>>>> 875525 root 20 0 12480 0 0 R 0.3 0.0 0:00.08 top
>>>> 1 root 20 0 172800 0 0 S 0.0 0.0 0:04.52 systemd
>>>>
>>>> The main reason is that the batch size of the percpu counter is quite large
>>>> on these machines, caching a significant percpu value, since converting mm's
>>>> rss stats into percpu_counter by commit f1a7941243c1 ("mm: convert mm's rss
>>>> stats into percpu_counter"). Intuitively, the batch number should be optimized,
>>>> but on some paths, performance may take precedence over statistical accuracy.
>>>> Therefore, introducing a new interface to add the percpu statistical count
>>>> and display it to users, which can remove the confusion. In addition, this
>>>> change is not expected to be on a performance-critical path, so the modification
>>>> should be acceptable.
>>>>
>>>> In addition, the 'mm->rss_stat' is updated by using add_mm_counter() and
>>>> dec/inc_mm_counter(), which are all wrappers around percpu_counter_add_batch().
>>>> In percpu_counter_add_batch(), there is percpu batch caching to avoid 'fbc->lock'
>>>> contention. This patch changes task_mem() and task_statm() to get the accurate
>>>> mm counters under the 'fbc->lock', but this should not exacerbate kernel
>>>> 'mm->rss_stat' lock contention due to the percpu batch caching of the mm
>>>> counters. The following test also confirm the theoretical analysis.
>>>>
>>>> I run the stress-ng that stresses anon page faults in 32 threads on my 32 cores
>>>> machine, while simultaneously running a script that starts 32 threads to
>>>> busy-loop pread each stress-ng thread's /proc/pid/status interface. From the
>>>> following data, I did not observe any obvious impact of this patch on the
>>>> stress-ng tests.
>>>>
>>>> w/o patch:
>>>> stress-ng: info: [6848] 4,399,219,085,152 CPU Cycles 67.327 B/sec
>>>> stress-ng: info: [6848] 1,616,524,844,832 Instructions 24.740 B/sec (0.367 instr. per cycle)
>>>> stress-ng: info: [6848] 39,529,792 Page Faults Total 0.605 M/sec
>>>> stress-ng: info: [6848] 39,529,792 Page Faults Minor 0.605 M/sec
>>>>
>>>> w/patch:
>>>> stress-ng: info: [2485] 4,462,440,381,856 CPU Cycles 68.382 B/sec
>>>> stress-ng: info: [2485] 1,615,101,503,296 Instructions 24.750 B/sec (0.362 instr. per cycle)
>>>> stress-ng: info: [2485] 39,439,232 Page Faults Total 0.604 M/sec
>>>> stress-ng: info: [2485] 39,439,232 Page Faults Minor 0.604 M/sec
>>>>
>>>> Tested-by Donet Tom <donettom@linux.ibm.com>
>>>> Reviewed-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
>>>> Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
>>>> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
>>>> Acked-by: SeongJae Park <sj@kernel.org>
>>>> Acked-by: Michal Hocko <mhocko@suse.com>
>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>> ---
>>>> Changes from v1:
>>>> - Update the commit message to add some measurements.
>>>> - Add acked tag from Michal. Thanks.
>>>> - Drop the Fixes tag.
>>>
>>> Any reason why we dropped the Fixes tag? I see there were a series of
>>> discussion on v1 and it got concluded that the fix was correct, then why
>>> drop the fixes tag?
>>
>> This seems more like an improvement than a bug fix.
>
> Yes. I don't have a strong opinion on this, but we (Alibaba) will
> backport it manually,
>
> because some of user-space monitoring tools depend
> on these statistics.
That sounds like a regression then, isn't it?
-ritesh
On 6/9/25 10:31 AM, Ritesh Harjani (IBM) wrote:
> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>
>> On 2025/6/9 15:35, Michal Hocko wrote:
>>> On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
>>>>
>>>> Any reason why we dropped the Fixes tag? I see there were a series of
>>>> discussion on v1 and it got concluded that the fix was correct, then why
>>>> drop the fixes tag?
>>>
>>> This seems more like an improvement than a bug fix.
>>
>> Yes. I don't have a strong opinion on this, but we (Alibaba) will
>> backport it manually,
>>
>> because some of user-space monitoring tools depend
>> on these statistics.
>
> That sounds like a regression then, isn't it?

Hm if counters were accurate before f1a7941243c1 and not afterwards, and
this is making them accurate again, and some userspace depends on it,
then Fixes: and stable is probably warranted then. If this was just a
perf improvement, then not. But AFAIU f1a7941243c1 was the perf
improvement...

> -ritesh
On 6/9/25 10:52 AM, Vlastimil Babka wrote:
> On 6/9/25 10:31 AM, Ritesh Harjani (IBM) wrote:
>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>
>>> On 2025/6/9 15:35, Michal Hocko wrote:
>>>> On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
>>>>>
>>>>> Any reason why we dropped the Fixes tag? I see there were a series of
>>>>> discussion on v1 and it got concluded that the fix was correct, then why
>>>>> drop the fixes tag?
>>>>
>>>> This seems more like an improvement than a bug fix.
>>>
>>> Yes. I don't have a strong opinion on this, but we (Alibaba) will
>>> backport it manually,
>>>
>>> because some of user-space monitoring tools depend
>>> on these statistics.
>>
>> That sounds like a regression then, isn't it?
>
> Hm if counters were accurate before f1a7941243c1 and not afterwards, and
> this is making them accurate again, and some userspace depends on it,
> then Fixes: and stable is probably warranted then. If this was just a
> perf improvement, then not. But AFAIU f1a7941243c1 was the perf
> improvement...

Dang, should have re-read the commit log of f1a7941243c1 first. It seems
like the error margin due to batching existed also before f1a7941243c1.

" This patch converts the rss_stats into percpu_counter to convert the
error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2)."

so if on some systems this means worse margin than before, the above
"if" chain of thought might still hold.

>
>> -ritesh
>
On Mon, 9 Jun 2025 10:56:46 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:

> On 6/9/25 10:52 AM, Vlastimil Babka wrote:
> > On 6/9/25 10:31 AM, Ritesh Harjani (IBM) wrote:
> >> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
> >>
> >>> On 2025/6/9 15:35, Michal Hocko wrote:
> >>>> On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
> >>>>>
> >>>>> Any reason why we dropped the Fixes tag? I see there were a series of
> >>>>> discussion on v1 and it got concluded that the fix was correct, then why
> >>>>> drop the fixes tag?
> >>>>
> >>>> This seems more like an improvement than a bug fix.
> >>>
> >>> Yes. I don't have a strong opinion on this, but we (Alibaba) will
> >>> backport it manually,
> >>>
> >>> because some of user-space monitoring tools depend
> >>> on these statistics.
> >>
> >> That sounds like a regression then, isn't it?
> >
> > Hm if counters were accurate before f1a7941243c1 and not afterwards, and
> > this is making them accurate again, and some userspace depends on it,
> > then Fixes: and stable is probably warranted then. If this was just a
> > perf improvement, then not. But AFAIU f1a7941243c1 was the perf
> > improvement...
>
> Dang, should have re-read the commit log of f1a7941243c1 first. It seems
> like the error margin due to batching existed also before f1a7941243c1.
>
> " This patch converts the rss_stats into percpu_counter to convert the
> error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2)."
>
> so if on some systems this means worse margin than before, the above
> "if" chain of thought might still hold.

f1a7941243c1 seems like a good enough place to tell -stable
maintainers where to insert the patch (why does this sound rude).

The patch is simple enough. I'll add fixes:f1a7941243c1 and cc:stable
and, as the problem has been there for years, I'll leave the patch in
mm-unstable so it will eventually get into LTS, in a well tested state.
On 2025-06-09 20:17, Andrew Morton wrote:
> On Mon, 9 Jun 2025 10:56:46 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>
>> On 6/9/25 10:52 AM, Vlastimil Babka wrote:
>>> On 6/9/25 10:31 AM, Ritesh Harjani (IBM) wrote:
>>>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>>>
>>>>> On 2025/6/9 15:35, Michal Hocko wrote:
>>>>>> On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
>>>>>>>
>>>>>>> Any reason why we dropped the Fixes tag? I see there were a series of
>>>>>>> discussion on v1 and it got concluded that the fix was correct, then why
>>>>>>> drop the fixes tag?
>>>>>>
>>>>>> This seems more like an improvement than a bug fix.
>>>>>
>>>>> Yes. I don't have a strong opinion on this, but we (Alibaba) will
>>>>> backport it manually,
>>>>>
>>>>> because some of user-space monitoring tools depend
>>>>> on these statistics.
>>>>
>>>> That sounds like a regression then, isn't it?
>>>
>>> Hm if counters were accurate before f1a7941243c1 and not afterwards, and
>>> this is making them accurate again, and some userspace depends on it,
>>> then Fixes: and stable is probably warranted then. If this was just a
>>> perf improvement, then not. But AFAIU f1a7941243c1 was the perf
>>> improvement...
>>
>> Dang, should have re-read the commit log of f1a7941243c1 first. It seems
>> like the error margin due to batching existed also before f1a7941243c1.
>>
>> " This patch converts the rss_stats into percpu_counter to convert the
>> error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2)."
>>
>> so if on some systems this means worse margin than before, the above
>> "if" chain of thought might still hold.
>
> f1a7941243c1 seems like a good enough place to tell -stable
> maintainers where to insert the patch (why does this sound rude).
>
> The patch is simple enough. I'll add fixes:f1a7941243c1 and cc:stable
> and, as the problem has been there for years, I'll leave the patch in
> mm-unstable so it will eventually get into LTS, in a well tested state.

Andrew, are you considering submitting this patch for 6.16? I think
we should, it does look like a regression for larger systems built
with 64k base page size.

On comparing a very simple app which just allocates & touches some
memory against v6.1 (which doesn't have f1a7941243c1) and latest
Linus tree (4c06e63b9203) I can see that on latest Linus tree the
values for VmRSS, RssAnon and RssFile from /proc/self/status are
all zeroes while they do report values on v6.1 and a Linus tree
with this patch.

My test setup is a arm64 VM with 80 CPUs running a kernel with 64k
pagesize. The kernel only reports the RSS values starting at 10MB
(which makes sense since the Per-CPU counters will cache up to two
times the number of CPUs and the kernel accounts pages). The
situation will be worse on larger systems, of course.
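
The 10MB threshold Luiz reports matches the default percpu_counter batch,
assuming it is still computed as max(32, 2 * num_online_cpus()) by
compute_batch_value() in lib/percpu_counter.c; a back-of-the-envelope check
(the numbers below are just his 80-CPU, 64K-pagesize setup plugged in):

#include <stdio.h>

int main(void)
{
	long nr_cpus = 80;		/* Luiz's arm64 VM above */
	long page_kib = 64;		/* 64K base pages */
	long batch = 2 * nr_cpus > 32 ? 2 * nr_cpus : 32;

	/* One CPU's cached delta is folded into the global count only once
	 * it reaches 'batch' pages, so a mostly-single-CPU task reports
	 * RSS 0 until it has touched about this much memory: */
	printf("per-CPU flush threshold: %ld MiB\n",
	       batch * page_kib / 1024);		/* 10 MiB */

	/* Worst case, every CPU caches close to 'batch' pages, so the
	 * aggregate error approaches ~2 * nr_cpus^2 pages: */
	printf("max aggregate error:     %ld MiB\n",
	       nr_cpus * batch * page_kib / 1024);	/* 800 MiB */
	return 0;
}

The worst-case figure also lines up with the "approximately (nr_cpus ^ 2)"
error margin quoted from f1a7941243c1 earlier in the thread.
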
On Fri, 4 Jul 2025 14:22:11 -0400 Luiz Capitulino <luizcap@redhat.com> wrote:

> > The patch is simple enough. I'll add fixes:f1a7941243c1 and cc:stable
> > and, as the problem has been there for years, I'll leave the patch in
> > mm-unstable so it will eventually get into LTS, in a well tested state.
>
> Andrew, are you considering submitting this patch for 6.16? I think
> we should, it does look like a regression for larger systems built
> with 64k base page size.

I wasn't planning on 6.16-rcX because it's been there for years but
sure, I moved it into the mm-hotfixes pile so it'll go Linuswards next
week.

> On comparing a very simple app which just allocates & touches some
> memory against v6.1 (which doesn't have f1a7941243c1) and latest
> Linus tree (4c06e63b9203) I can see that on latest Linus tree the
> values for VmRSS, RssAnon and RssFile from /proc/self/status are
> all zeroes while they do report values on v6.1 and a Linus tree
> with this patch.

Cool, I'll paste this para into the changelog to help people link this
patch with wrong behavior which they are observing.
On 2025-07-04 16:11, Andrew Morton wrote:
> On Fri, 4 Jul 2025 14:22:11 -0400 Luiz Capitulino <luizcap@redhat.com> wrote:
>
>>> The patch is simple enough. I'll add fixes:f1a7941243c1 and cc:stable
>>> and, as the problem has been there for years, I'll leave the patch in
>>> mm-unstable so it will eventually get into LTS, in a well tested state.
>>
>> Andrew, are you considering submitting this patch for 6.16? I think
>> we should, it does look like a regression for larger systems built
>> with 64k base page size.
>
> I wasn't planning on 6.16-rcX because it's been there for years but
> sure, I moved it into the mm-hotfixes pile so it'll go Linuswards next
> week.

Wonderful, thank you!

>
>> On comparing a very simple app which just allocates & touches some
>> memory against v6.1 (which doesn't have f1a7941243c1) and latest
>> Linus tree (4c06e63b9203) I can see that on latest Linus tree the
>> values for VmRSS, RssAnon and RssFile from /proc/self/status are
>> all zeroes while they do report values on v6.1 and a Linus tree
>> with this patch.
>
> Cool, I'll paste this para into the changelog to help people link this
> patch with wrong behavior which they are observing.

OK.
On Mon, Jun 09, 2025 at 05:17:58PM -0700, Andrew Morton wrote:
> On Mon, 9 Jun 2025 10:56:46 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>
> > On 6/9/25 10:52 AM, Vlastimil Babka wrote:
> > > On 6/9/25 10:31 AM, Ritesh Harjani (IBM) wrote:
> > >> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
> > >>
> > >>> On 2025/6/9 15:35, Michal Hocko wrote:
> > >>>> On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
> > >>>>>
> > >>>>> Any reason why we dropped the Fixes tag? I see there were a series of
> > >>>>> discussion on v1 and it got concluded that the fix was correct, then why
> > >>>>> drop the fixes tag?
> > >>>>
> > >>>> This seems more like an improvement than a bug fix.
> > >>>
> > >>> Yes. I don't have a strong opinion on this, but we (Alibaba) will
> > >>> backport it manually,
> > >>>
> > >>> because some of user-space monitoring tools depend
> > >>> on these statistics.
> > >>
> > >> That sounds like a regression then, isn't it?
> > >
> > > Hm if counters were accurate before f1a7941243c1 and not afterwards, and
> > > this is making them accurate again, and some userspace depends on it,
> > > then Fixes: and stable is probably warranted then. If this was just a
> > > perf improvement, then not. But AFAIU f1a7941243c1 was the perf
> > > improvement...
> >
> > Dang, should have re-read the commit log of f1a7941243c1 first. It seems
> > like the error margin due to batching existed also before f1a7941243c1.
> >
> > " This patch converts the rss_stats into percpu_counter to convert the
> > error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2)."
> >
> > so if on some systems this means worse margin than before, the above
> > "if" chain of thought might still hold.
>
> f1a7941243c1 seems like a good enough place to tell -stable
> maintainers where to insert the patch (why does this sound rude).
>
> The patch is simple enough. I'll add fixes:f1a7941243c1 and cc:stable
> and, as the problem has been there for years, I'll leave the patch in
> mm-unstable so it will eventually get into LTS, in a well tested state.

One thing f1a7941243c1 noted was that the percpu counter conversion
enabled us to get more accurate stats with some cpu cost and in this
patch Baolin has shown that the cpu cost of accurate stats is
reasonable, so seems safe for stable backport. Also it seems like
multiple users are impacted by this issue, so I am fine with stable
backport.
On Mon 09-06-25 17:45:05, Shakeel Butt wrote:
> On Mon, Jun 09, 2025 at 05:17:58PM -0700, Andrew Morton wrote:
> > On Mon, 9 Jun 2025 10:56:46 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > > On 6/9/25 10:52 AM, Vlastimil Babka wrote:
> > > > On 6/9/25 10:31 AM, Ritesh Harjani (IBM) wrote:
> > > >> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
> > > >>
> > > >>> On 2025/6/9 15:35, Michal Hocko wrote:
> > > >>>> On Mon 09-06-25 10:57:41, Ritesh Harjani wrote:
> > > >>>>>
> > > >>>>> Any reason why we dropped the Fixes tag? I see there were a series of
> > > >>>>> discussion on v1 and it got concluded that the fix was correct, then why
> > > >>>>> drop the fixes tag?
> > > >>>>
> > > >>>> This seems more like an improvement than a bug fix.
> > > >>>
> > > >>> Yes. I don't have a strong opinion on this, but we (Alibaba) will
> > > >>> backport it manually,
> > > >>>
> > > >>> because some of user-space monitoring tools depend
> > > >>> on these statistics.
> > > >>
> > > >> That sounds like a regression then, isn't it?
> > > >
> > > > Hm if counters were accurate before f1a7941243c1 and not afterwards, and
> > > > this is making them accurate again, and some userspace depends on it,
> > > > then Fixes: and stable is probably warranted then. If this was just a
> > > > perf improvement, then not. But AFAIU f1a7941243c1 was the perf
> > > > improvement...
> > >
> > > Dang, should have re-read the commit log of f1a7941243c1 first. It seems
> > > like the error margin due to batching existed also before f1a7941243c1.
> > >
> > > " This patch converts the rss_stats into percpu_counter to convert the
> > > error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2)."
> > >
> > > so if on some systems this means worse margin than before, the above
> > > "if" chain of thought might still hold.
> >
> > f1a7941243c1 seems like a good enough place to tell -stable
> > maintainers where to insert the patch (why does this sound rude).
> >
> > The patch is simple enough. I'll add fixes:f1a7941243c1 and cc:stable
> > and, as the problem has been there for years, I'll leave the patch in
> > mm-unstable so it will eventually get into LTS, in a well tested state.
>
> One thing f1a7941243c1 noted was that the percpu counter conversion
> enabled us to get more accurate stats with some cpu cost and in this
> patch Baolin has shown that the cpu cost of accurate stats is
> reasonable, so seems safe for stable backport. Also it seems like
> multiple users are impacted by this issue, so I am fine with stable
> backport.

Fair point.
--
Michal Hocko
SUSE Labs
On 6/5/25 14:58, Baolin Wang wrote:
> On some large machines with a high number of CPUs running a 64K pagesize
> kernel, we found that the 'RES' field is always 0 displayed by the top
> command for some processes, which will cause a lot of confusion for users.
>
> PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
> 875525 root 20 0 12480 0 0 R 0.3 0.0 0:00.08 top
> 1 root 20 0 172800 0 0 S 0.0 0.0 0:04.52 systemd
>
> The main reason is that the batch size of the percpu counter is quite large
> on these machines, caching a significant percpu value, since converting mm's
> rss stats into percpu_counter by commit f1a7941243c1 ("mm: convert mm's rss
> stats into percpu_counter"). Intuitively, the batch number should be optimized,
> but on some paths, performance may take precedence over statistical accuracy.
> Therefore, introducing a new interface to add the percpu statistical count
> and display it to users, which can remove the confusion. In addition, this
> change is not expected to be on a performance-critical path, so the modification
> should be acceptable.
>
> In addition, the 'mm->rss_stat' is updated by using add_mm_counter() and
> dec/inc_mm_counter(), which are all wrappers around percpu_counter_add_batch().
> In percpu_counter_add_batch(), there is percpu batch caching to avoid 'fbc->lock'
> contention. This patch changes task_mem() and task_statm() to get the accurate
> mm counters under the 'fbc->lock', but this should not exacerbate kernel
> 'mm->rss_stat' lock contention due to the percpu batch caching of the mm
> counters. The following test also confirm the theoretical analysis.
>
> I run the stress-ng that stresses anon page faults in 32 threads on my 32 cores
> machine, while simultaneously running a script that starts 32 threads to
> busy-loop pread each stress-ng thread's /proc/pid/status interface. From the
> following data, I did not observe any obvious impact of this patch on the
> stress-ng tests.
>
> w/o patch:
> stress-ng: info: [6848] 4,399,219,085,152 CPU Cycles 67.327 B/sec
> stress-ng: info: [6848] 1,616,524,844,832 Instructions 24.740 B/sec (0.367 instr. per cycle)
> stress-ng: info: [6848] 39,529,792 Page Faults Total 0.605 M/sec
> stress-ng: info: [6848] 39,529,792 Page Faults Minor 0.605 M/sec
>
> w/patch:
> stress-ng: info: [2485] 4,462,440,381,856 CPU Cycles 68.382 B/sec
> stress-ng: info: [2485] 1,615,101,503,296 Instructions 24.750 B/sec (0.362 instr. per cycle)
> stress-ng: info: [2485] 39,439,232 Page Faults Total 0.604 M/sec
> stress-ng: info: [2485] 39,439,232 Page Faults Minor 0.604 M/sec
>
> Tested-by Donet Tom <donettom@linux.ibm.com>
> Reviewed-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> Acked-by: SeongJae Park <sj@kernel.org>
> Acked-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Thanks!