From: Qi Zheng <zhengqi.arch@bytedance.com>
The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
reparent non-hierarchical stats, the values passed to them might exceed
the upper limit of the type int, so correct the val parameter type of them
to long.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
include/trace/events/memcg.h | 10 +++++-----
mm/memcontrol.c | 8 ++++----
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
index dfe2f51019b4c..51b62c5931fc2 100644
--- a/include/trace/events/memcg.h
+++ b/include/trace/events/memcg.h
@@ -11,14 +11,14 @@
DECLARE_EVENT_CLASS(memcg_rstat_stats,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val),
TP_STRUCT__entry(
__field(u64, id)
__field(int, item)
- __field(int, val)
+ __field(long, val)
),
TP_fast_assign(
@@ -27,20 +27,20 @@ DECLARE_EVENT_CLASS(memcg_rstat_stats,
__entry->val = val;
),
- TP_printk("memcg_id=%llu item=%d val=%d",
+ TP_printk("memcg_id=%llu item=%d val=%ld",
__entry->id, __entry->item, __entry->val)
);
DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val)
);
DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state,
- TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+ TP_PROTO(struct mem_cgroup *memcg, int item, long val),
TP_ARGS(memcg, item, val)
);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7fb9cbc10dfbb..4a78550f6174e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
#ifdef CONFIG_MEMCG_V1
static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
- enum node_stat_item idx, int val);
+ enum node_stat_item idx, long val);
void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
struct mem_cgroup *parent, int idx)
@@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
* Normalize the value passed into memcg_rstat_updated() to be in pages. Round
* up non-zero sub-page updates to 1 page as zero page updates are ignored.
*/
-static int memcg_state_val_in_pages(int idx, int val)
+static long memcg_state_val_in_pages(int idx, long val)
{
int unit = memcg_page_state_unit(idx);
@@ -831,7 +831,7 @@ static inline void get_non_dying_memcg_end(void)
#endif
static void __mod_memcg_state(struct mem_cgroup *memcg,
- enum memcg_stat_item idx, int val)
+ enum memcg_stat_item idx, long val)
{
int i = memcg_stats_index(idx);
int cpu;
@@ -896,7 +896,7 @@ void reparent_memcg_state_local(struct mem_cgroup *memcg,
#endif
static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
- enum node_stat_item idx, int val)
+ enum node_stat_item idx, long val)
{
struct mem_cgroup *memcg = pn->memcg;
int i = memcg_stats_index(idx);
--
2.20.1
On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
> reparent non-hierarchical stats, the values passed to them might exceed
> the upper limit of the type int, so correct the val parameter type of them
> to long.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> include/trace/events/memcg.h | 10 +++++-----
> mm/memcontrol.c | 8 ++++----
> 2 files changed, 9 insertions(+), 9 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7fb9cbc10dfbb..4a78550f6174e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>
> #ifdef CONFIG_MEMCG_V1
> static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> - enum node_stat_item idx, int val);
> + enum node_stat_item idx, long val);
>
> void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> struct mem_cgroup *parent, int idx)
> @@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
> * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
> * up non-zero sub-page updates to 1 page as zero page updates are ignored.
> */
> -static int memcg_state_val_in_pages(int idx, int val)
> +static long memcg_state_val_in_pages(int idx, long val)
> {
> int unit = memcg_page_state_unit(idx);
Sashiko AI made an interesting argument [1] that this could lead to
incorrectly returning a very large positive number. Let me verify that.
[1] https://sashiko.dev/#/patchset/cover.1774342371.git.zhengqi.arch%40bytedance.com
Sashiko wrote:
> Does this change inadvertently break the handling of negative byte-sized
> updates?
> Looking at the rest of the function:
> if (!val || unit == PAGE_SIZE)
> return val;
> else
> return max(val * unit / PAGE_SIZE, 1UL);
> PAGE_SIZE is defined as an unsigned long.
Right, it's defined as 1UL << PAGE_SHIFT.
> When val is negative, such as during uncharging of byte-sized stats like
> MEMCG_ZSWAP_B, the expression val * unit is a negative long.
Right.
> Dividing a signed long by an unsigned long causes the signed long to be
> promoted to unsigned before division,
Right.
> resulting in a massive positive
> number instead of a small negative one.
Let's look at an example (assuming unit is 1).
val = val * unit = -16384 (-16 KiB)
val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
max(0x3FFFFFFFFFFFFF, 1UL) = 0x3FFFFFFFFFFFFF
Yeah, that's a massive positive number.
Hmm but how did it work when it was int?
val = val * unit = -16384 (-16KiB)
val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
max(val * unit / PAGE_SIZE, 1UL) = 0x3FFFFFFFFFFFFF
(int)0x3FFFFFFFFFFFFF = 0xFFFFFFFF = (-1)
That's incorrect. It should have been -4?
> Before this change, the function returned an int, which implicitly truncated
> the massive unsigned 64-bit result to a 32-bit int, accidentally yielding the
> correct negative arithmetic value.
So... "accidentally yielding the correct negative arithmetic value"
is wrong.
Sounds like it's been subtly broken even before this patch and nobody
noticed.
> By changing the return type to long, this implicit truncation is removed,
> and the huge positive value is returned unaltered.
That's true.
> Could this corrupt tracepoint logs when passed to trace_mod_memcg_state?
I'm not sure if that's critical but yeah that's true.
> Also, would passing this huge positive value to memcg_rstat_updated instantly
> exceed the charge batch threshold and trigger endless, expensive global
> cgroup_rstat flushing, severely degrading system performance?
It would lead to more frequent flushes, at least.
--
Cheers,
Harry / Hyeonggon
On Wed, Mar 25, 2026 at 10:43:58AM +0900, Harry Yoo (Oracle) wrote:
> On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
> > @@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
> > * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
> > * up non-zero sub-page updates to 1 page as zero page updates are ignored.
> > */
> > -static int memcg_state_val_in_pages(int idx, int val)
> > +static long memcg_state_val_in_pages(int idx, long val)
> > {
> > int unit = memcg_page_state_unit(idx);
>
> Sashiko AI made an interesting argument [1] that this could lead to
> incorrectly returning a very large positive number. Let me verify that.
>
> [1] https://sashiko.dev/#/patchset/cover.1774342371.git.zhengqi.arch%40bytedance.com
>
> Sashiko wrote:
> > Does this change inadvertently break the handling of negative byte-sized
> > updates?
> > Looking at the rest of the function:
> > if (!val || unit == PAGE_SIZE)
> > return val;
> > else
> > return max(val * unit / PAGE_SIZE, 1UL);
>
> > PAGE_SIZE is defined as an unsigned long.
>
> Right, it's defined as 1UL << PAGE_SHIFT.
>
> > When val is negative, such as during uncharging of byte-sized stats like
> > MEMCG_ZSWAP_B, the expression val * unit is a negative long.
>
> Right.
>
> > Dividing a signed long by an unsigned long causes the signed long to be
> > promoted to unsigned before division,
>
> Right.
>
> > resulting in a massive positive
> > number instead of a small negative one.
>
> Let's look at an example (assuming unit is 1).
>
> val = val * unit = -16384 (-16 KiB)
> val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
> max(0x3FFFFFFFFFFFFF, 1UL) = 0x3FFFFFFFFFF
>
> Yeah, that's a massive positive number.
>
> Hmm but how did it work when it was int?
Oops, I was about to say... "Oh, doesn't patch 4/4 in v2 need to have
Fixes: 7bd5bc3ce963 ("mm: memcg: normalize the value passed into memcg_rstat_updated()")
???"
but then I realized that I made a silly mistake here.
> val = val * unit = -16384 (-16KiB)
> val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
Err, I should have divided it by 0x1000, not 0x4096.
val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / 0x1000 = 0xFFFFFFFFFFFFC
max(val * unit / PAGE_SIZE, 1UL) = 0xFFFFFFFFFFFFC
(int)0xFFFFFFFFFFFFC = -4.
> max(val * unit / PAGE_SIZE, 1UL) = 0x3FFFFFFFFFFFFF
> (int)0x3FFFFFFFFFFFFF = 0xFFFFFFFF = (-1)
>
> That's incorrect. It should have been -4?
So that was correct.
The existing logic produces an accurate number (intended or not) as it
is right-shifted to only PAGE_SHIFT bits and truncated to int.
The existing logic is fine, it'll only be a problem when it's not
truncated to int.
> > Before this change, the function returned an int, which implicitly truncated
> > the massive unsigned 64-bit result to a 32-bit int, accidentally yielding the
> > correct negative arithmetic value.
>
> So... "accidentally yielding the correct negative arithemetic value"
> is wrong.
I was wrong, not sashiko!
> Sounds like it's been subtly broken even before this patch and nobody
> noticed.
No, it's not.
--
Cheers,
Harry / Hyeonggon
On 3/25/26 9:43 AM, Harry Yoo (Oracle) wrote:
> On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
>> reparent non-hierarchical stats, the values passed to them might exceed
>> the upper limit of the type int, so correct the val parameter type of them
>> to long.
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> include/trace/events/memcg.h | 10 +++++-----
>> mm/memcontrol.c | 8 ++++----
>> 2 files changed, 9 insertions(+), 9 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 7fb9cbc10dfbb..4a78550f6174e 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>
>> #ifdef CONFIG_MEMCG_V1
>> static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>> - enum node_stat_item idx, int val);
>> + enum node_stat_item idx, long val);
>>
>> void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>> struct mem_cgroup *parent, int idx)
>> @@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
>> * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
>> * up non-zero sub-page updates to 1 page as zero page updates are ignored.
>> */
>> -static int memcg_state_val_in_pages(int idx, int val)
>> +static long memcg_state_val_in_pages(int idx, long val)
>> {
>> int unit = memcg_page_state_unit(idx);
>
> Sashiko AI made an interesting argument [1] that this could lead to
> incorrectly returning a very large positive number. Let me verify that.
>
> [1] https://sashiko.dev/#/patchset/cover.1774342371.git.zhengqi.arch%40bytedance.com
>
> Sashiko wrote:
>> Does this change inadvertently break the handling of negative byte-sized
>> updates?
>> Looking at the rest of the function:
>> if (!val || unit == PAGE_SIZE)
>> return val;
>> else
>> return max(val * unit / PAGE_SIZE, 1UL);
>
>> PAGE_SIZE is defined as an unsigned long.
>
> Right, it's defined as 1UL << PAGE_SHIFT.
>
>> When val is negative, such as during uncharging of byte-sized stats like
>> MEMCG_ZSWAP_B, the expression val * unit is a negative long.
>
> Right.
>
>> Dividing a signed long by an unsigned long causes the signed long to be
>> promoted to unsigned before division,
>
> Right.
>
>> resulting in a massive positive
>> number instead of a small negative one.
>
> Let's look at an example (assuming unit is 1).
>
> val = val * unit = -16384 (-16 KiB)
> val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
> max(0x3FFFFFFFFFFFFF, 1UL) = 0x3FFFFFFFFFF
>
> Yeah, that's a massive positive number.
>
> Hmm but how did it work when it was int?
>
> val = val * unit = -16384 (-16KiB)
> val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
> max(val * unit / PAGE_SIZE, 1UL) = 0x3FFFFFFFFFFFFF
> (int)0x3FFFFFFFFFFFFF = 0xFFFFFFFF = (-1)
>
> That's incorrect. It should have been -4?
>
>> Before this change, the function returned an int, which implicitly truncated
>> the massive unsigned 64-bit result to a 32-bit int, accidentally yielding the
>> correct negative arithmetic value.
>
> So... "accidentally yielding the correct negative arithemetic value"
> is wrong.
>
> Sounds like it's been subtly broken even before this patch and nobody
> noticed.
Thank you for such a detailed analysis! And I think you are right.
The memcg_state_val_in_pages() is only meant to convert @val into pages,
so perhaps we can avoid the above problem by taking the absolute value
first?
Thanks,
Qi
>
>> By changing the return type to long, this implicit truncation is removed,
>> and the huge positive value is returned unaltered.
>
> That's true.
>
>> Could this corrupt tracepoint logs when passed to trace_mod_memcg_state?
>
> I'm not sure if that's critical but yeah that's true.
>
>> Also, would passing this huge positive value to memcg_rstat_updated instantly
>> exceed the charge batch threshold and trigger endless, expensive global
>> cgroup_rstat flushing, severely degrading system performance?
>
> It would lead to more frequent flushes, at least.
>
On Wed, Mar 25, 2026 at 11:25:06AM +0800, Qi Zheng wrote:
> On 3/25/26 9:43 AM, Harry Yoo (Oracle) wrote:
> > On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
> > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > >
> > > The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
> > > reparent non-hierarchical stats, the values passed to them might exceed
> > > the upper limit of the type int, so correct the val parameter type of them
> > > to long.
> > >
> > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > ---
> > > include/trace/events/memcg.h | 10 +++++-----
> > > mm/memcontrol.c | 8 ++++----
> > > 2 files changed, 9 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 7fb9cbc10dfbb..4a78550f6174e 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > > #ifdef CONFIG_MEMCG_V1
> > > static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> > > - enum node_stat_item idx, int val);
> > > + enum node_stat_item idx, long val);
> > > void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> > > struct mem_cgroup *parent, int idx)
> > > @@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
> > > * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
> > > * up non-zero sub-page updates to 1 page as zero page updates are ignored.
> > > */
> > > -static int memcg_state_val_in_pages(int idx, int val)
> > > +static long memcg_state_val_in_pages(int idx, long val)
> > > {
> > > int unit = memcg_page_state_unit(idx);
> >
> > Sashiko AI made an interesting argument [1] that this could lead to
> > incorrectly returning a very large positive number. Let me verify that.
> >
> > [1] https://sashiko.dev/#/patchset/cover.1774342371.git.zhengqi.arch%40bytedance.com
> >
> > Sashiko wrote:
> > > Does this change inadvertently break the handling of negative byte-sized
> > > updates?
> > > Looking at the rest of the function:
> > > if (!val || unit == PAGE_SIZE)
> > > return val;
> > > else
> > > return max(val * unit / PAGE_SIZE, 1UL);
> >
> > > PAGE_SIZE is defined as an unsigned long.
> >
> > Right, it's defined as 1UL << PAGE_SHIFT.
> >
> > > When val is negative, such as during uncharging of byte-sized stats like
> > > MEMCG_ZSWAP_B, the expression val * unit is a negative long.
> >
> > Right.
> >
> > > Dividing a signed long by an unsigned long causes the signed long to be
> > > promoted to unsigned before division,
> >
> > Right.
> >
> > > resulting in a massive positive
> > > number instead of a small negative one.
> >
> > Let's look at an example (assuming unit is 1).
> >
> > val = val * unit = -16384 (-16 KiB)
> > val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
> > max(0x3FFFFFFFFFFFFF, 1UL) = 0x3FFFFFFFFFF
> >
> > Yeah, that's a massive positive number.
> >
> > Hmm but how did it work when it was int?
> >
> > val = val * unit = -16384 (-16KiB)
> > val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
> > max(val * unit / PAGE_SIZE, 1UL) = 0x3FFFFFFFFFFFFF
> > (int)0x3FFFFFFFFFFFFF = 0xFFFFFFFF = (-1)
> >
> > That's incorrect. It should have been -4?
> >
> > > Before this change, the function returned an int, which implicitly truncated
> > > the massive unsigned 64-bit result to a 32-bit int, accidentally yielding the
> > > correct negative arithmetic value.
> >
> > So... "accidentally yielding the correct negative arithemetic value"
> > is wrong.
> >
> > Sounds like it's been subtly broken even before this patch and nobody
> > noticed.
>
> Thank you for such a detailed analysis! And I think you are right.
No problem ;)
> The memcg_state_val_in_pages() is only to make @val to be in pages, so
> perhaps we can avoid the above problem by taking the absolute value
> first?
That would work for memcg_rstat_updated(),
but not for trace_mod_memcg_state()?
--
Cheers,
Harry / Hyeonggon
On 3/25/26 1:17 PM, Harry Yoo (Oracle) wrote:
> On Wed, Mar 25, 2026 at 11:25:06AM +0800, Qi Zheng wrote:
>> On 3/25/26 9:43 AM, Harry Yoo (Oracle) wrote:
>>> On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>
>>>> The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
>>>> reparent non-hierarchical stats, the values passed to them might exceed
>>>> the upper limit of the type int, so correct the val parameter type of them
>>>> to long.
>>>>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> ---
>>>> include/trace/events/memcg.h | 10 +++++-----
>>>> mm/memcontrol.c | 8 ++++----
>>>> 2 files changed, 9 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>> index 7fb9cbc10dfbb..4a78550f6174e 100644
>>>> --- a/mm/memcontrol.c
>>>> +++ b/mm/memcontrol.c
>>>> @@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>>> #ifdef CONFIG_MEMCG_V1
>>>> static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>>>> - enum node_stat_item idx, int val);
>>>> + enum node_stat_item idx, long val);
>>>> void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>>> struct mem_cgroup *parent, int idx)
>>>> @@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
>>>> * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
>>>> * up non-zero sub-page updates to 1 page as zero page updates are ignored.
>>>> */
>>>> -static int memcg_state_val_in_pages(int idx, int val)
>>>> +static long memcg_state_val_in_pages(int idx, long val)
>>>> {
>>>> int unit = memcg_page_state_unit(idx);
>>>
>>> Sashiko AI made an interesting argument [1] that this could lead to
>>> incorrectly returning a very large positive number. Let me verify that.
>>>
>>> [1] https://sashiko.dev/#/patchset/cover.1774342371.git.zhengqi.arch%40bytedance.com
>>>
>>> Sashiko wrote:
>>>> Does this change inadvertently break the handling of negative byte-sized
>>>> updates?
>>>> Looking at the rest of the function:
>>>> if (!val || unit == PAGE_SIZE)
>>>> return val;
>>>> else
>>>> return max(val * unit / PAGE_SIZE, 1UL);
>>>
>>>> PAGE_SIZE is defined as an unsigned long.
>>>
>>> Right, it's defined as 1UL << PAGE_SHIFT.
>>>
>>>> When val is negative, such as during uncharging of byte-sized stats like
>>>> MEMCG_ZSWAP_B, the expression val * unit is a negative long.
>>>
>>> Right.
>>>
>>>> Dividing a signed long by an unsigned long causes the signed long to be
>>>> promoted to unsigned before division,
>>>
>>> Right.
>>>
>>>> resulting in a massive positive
>>>> number instead of a small negative one.
>>>
>>> Let's look at an example (assuming unit is 1).
>>>
>>> val = val * unit = -16384 (-16 KiB)
>>> val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
>>> max(0x3FFFFFFFFFFFFF, 1UL) = 0x3FFFFFFFFFF
>>>
>>> Yeah, that's a massive positive number.
>>>
>>> Hmm but how did it work when it was int?
>>>
>>> val = val * unit = -16384 (-16KiB)
>>> val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
>>> max(val * unit / PAGE_SIZE, 1UL) = 0x3FFFFFFFFFFFFF
>>> (int)0x3FFFFFFFFFFFFF = 0xFFFFFFFF = (-1)
>>>
>>> That's incorrect. It should have been -4?
>>>
>>>> Before this change, the function returned an int, which implicitly truncated
>>>> the massive unsigned 64-bit result to a 32-bit int, accidentally yielding the
>>>> correct negative arithmetic value.
>>>
>>> So... "accidentally yielding the correct negative arithemetic value"
>>> is wrong.
>>>
>>> Sounds like it's been subtly broken even before this patch and nobody
>>> noticed.
>>
>> Thank you for such a detailed analysis! And I think you are right.
>
> No problem ;)
>
>> The memcg_state_val_in_pages() is only to make @val to be in pages, so
>> perhaps we can avoid the above problem by taking the absolute value
>> first?
To make it clearer, here is the diff:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6491ca74b3398..2b34805b01476 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -787,11 +787,17 @@ static int memcg_page_state_unit(int item);
static long memcg_state_val_in_pages(int idx, long val)
{
int unit = memcg_page_state_unit(idx);
+ unsigned long uval;
+ long res;
if (!val || unit == PAGE_SIZE)
return val;
- else
- return max(val * unit / PAGE_SIZE, 1UL);
+
+ uval = val < 0 ? -(unsigned long)val : (unsigned long)val;
+
+ res = max(mult_frac(uval, unit, PAGE_SIZE), 1UL);
+
+ return val < 0 ? -res : res;
}
#ifdef CONFIG_MEMCG_V1
>
> That would work for memcg_rstat_updated(),
> but not for trace_mod_memcg_state()?
Why? The trace_mod_memcg_state() seems to accept 'long' type,
am I missing something?
Thanks,
Qi
>
On Wed, Mar 25, 2026 at 03:26:46PM +0800, Qi Zheng wrote:
>
>
> On 3/25/26 1:17 PM, Harry Yoo (Oracle) wrote:
> > On Wed, Mar 25, 2026 at 11:25:06AM +0800, Qi Zheng wrote:
> > > On 3/25/26 9:43 AM, Harry Yoo (Oracle) wrote:
> > > > On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
> > > > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > > > >
> > > > > The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
> > > > > reparent non-hierarchical stats, the values passed to them might exceed
> > > > > the upper limit of the type int, so correct the val parameter type of them
> > > > > to long.
> > > > >
> > > > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > > > ---
> > > > > include/trace/events/memcg.h | 10 +++++-----
> > > > > mm/memcontrol.c | 8 ++++----
> > > > > 2 files changed, 9 insertions(+), 9 deletions(-)
> > > > >
> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > index 7fb9cbc10dfbb..4a78550f6174e 100644
> > > > > --- a/mm/memcontrol.c
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > > > > #ifdef CONFIG_MEMCG_V1
> > > > > static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> > > > > - enum node_stat_item idx, int val);
> > > > > + enum node_stat_item idx, long val);
> > > > > void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> > > > > struct mem_cgroup *parent, int idx)
> > > > > @@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
> > > > > * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
> > > > > * up non-zero sub-page updates to 1 page as zero page updates are ignored.
> > > > > */
> > > > > -static int memcg_state_val_in_pages(int idx, int val)
> > > > > +static long memcg_state_val_in_pages(int idx, long val)
> > > > > {
> > > > > int unit = memcg_page_state_unit(idx);
> > > >
> > > > Sashiko AI made an interesting argument [1] that this could lead to
> > > > incorrectly returning a very large positive number. Let me verify that.
> > > >
> > > > [1] https://sashiko.dev/#/patchset/cover.1774342371.git.zhengqi.arch%40bytedance.com
> > > >
> > > > Sashiko wrote:
> > > > > Does this change inadvertently break the handling of negative byte-sized
> > > > > updates?
> > > > > Looking at the rest of the function:
> > > > > if (!val || unit == PAGE_SIZE)
> > > > > return val;
> > > > > else
> > > > > return max(val * unit / PAGE_SIZE, 1UL);
> > > >
> > > > > PAGE_SIZE is defined as an unsigned long.
> > > >
> > > > Right, it's defined as 1UL << PAGE_SHIFT.
> > > >
> > > > > When val is negative, such as during uncharging of byte-sized stats like
> > > > > MEMCG_ZSWAP_B, the expression val * unit is a negative long.
> > > >
> > > > Right.
> > > >
> > > > > Dividing a signed long by an unsigned long causes the signed long to be
> > > > > promoted to unsigned before division,
> > > >
> > > > Right.
> > > >
> > > > > resulting in a massive positive
> > > > > number instead of a small negative one.
> > > >
> > > > Let's look at an example (assuming unit is 1).
> > > >
> > > > val = val * unit = -16384 (-16 KiB)
> > > > val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
> > > > max(0x3FFFFFFFFFFFFF, 1UL) = 0x3FFFFFFFFFF
> > > >
> > > > Yeah, that's a massive positive number.
> > > >
> > > > Hmm but how did it work when it was int?
> > > >
> > > > val = val * unit = -16384 (-16KiB)
> > > > val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
> > > > max(val * unit / PAGE_SIZE, 1UL) = 0x3FFFFFFFFFFFFF
> > > > (int)0x3FFFFFFFFFFFFF = 0xFFFFFFFF = (-1)
> > > >
> > > > That's incorrect. It should have been -4?
> > > >
> > > > > Before this change, the function returned an int, which implicitly truncated
> > > > > the massive unsigned 64-bit result to a 32-bit int, accidentally yielding the
> > > > > correct negative arithmetic value.
> > > >
> > > > So... "accidentally yielding the correct negative arithemetic value"
> > > > is wrong.
> > > >
> > > > Sounds like it's been subtly broken even before this patch and nobody
> > > > noticed.
> > >
> > > Thank you for such a detailed analysis! And I think you are right.
> >
> > No problem ;)
> >
> > > The memcg_state_val_in_pages() is only to make @val to be in pages, so
> > > perhaps we can avoid the above problem by taking the absolute value
> > > first?
>
> For more clearly, here is the diff:
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 6491ca74b3398..2b34805b01476 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -787,11 +787,17 @@ static int memcg_page_state_unit(int item);
> static long memcg_state_val_in_pages(int idx, long val)
> {
> int unit = memcg_page_state_unit(idx);
> + unsigned long uval;
> + long res;
>
> if (!val || unit == PAGE_SIZE)
> return val;
> - else
> - return max(val * unit / PAGE_SIZE, 1UL);
> +
> + uval = val < 0 ? -(unsigned long)val : (unsigned long)val;
We could use abs() macro in linux/math.h here :)
val should not be LONG_MIN, but it should be fine
to assume it won't reach LONG_MIN.
> +
> + res = max(mult_frac(uval, unit, PAGE_SIZE), 1UL);
> +
> + return val < 0 ? -res : res;
> }
>
> #ifdef CONFIG_MEMCG_V1
>
> > That would work for memcg_rstat_updated(),
> > but not for trace_mod_memcg_state()?
>
> Why? The trace_mod_memcg_state() seems to accept 'long' type,
> am I missing something?
Nevermind. Without the diff, I misunderstood what you meant.
Thanks!
--
Cheers,
Harry / Hyeonggon
On 3/25/26 3:36 PM, Harry Yoo (Oracle) wrote:
> On Wed, Mar 25, 2026 at 03:26:46PM +0800, Qi Zheng wrote:
>>
>>
>> On 3/25/26 1:17 PM, Harry Yoo (Oracle) wrote:
>>> On Wed, Mar 25, 2026 at 11:25:06AM +0800, Qi Zheng wrote:
>>>> On 3/25/26 9:43 AM, Harry Yoo (Oracle) wrote:
>>>>> On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
>>>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>
>>>>>> The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
>>>>>> reparent non-hierarchical stats, the values passed to them might exceed
>>>>>> the upper limit of the type int, so correct the val parameter type of them
>>>>>> to long.
>>>>>>
>>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>> ---
>>>>>> include/trace/events/memcg.h | 10 +++++-----
>>>>>> mm/memcontrol.c | 8 ++++----
>>>>>> 2 files changed, 9 insertions(+), 9 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>>>> index 7fb9cbc10dfbb..4a78550f6174e 100644
>>>>>> --- a/mm/memcontrol.c
>>>>>> +++ b/mm/memcontrol.c
>>>>>> @@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>>>>>> #ifdef CONFIG_MEMCG_V1
>>>>>> static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
>>>>>> - enum node_stat_item idx, int val);
>>>>>> + enum node_stat_item idx, long val);
>>>>>> void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
>>>>>> struct mem_cgroup *parent, int idx)
>>>>>> @@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
>>>>>> * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
>>>>>> * up non-zero sub-page updates to 1 page as zero page updates are ignored.
>>>>>> */
>>>>>> -static int memcg_state_val_in_pages(int idx, int val)
>>>>>> +static long memcg_state_val_in_pages(int idx, long val)
>>>>>> {
>>>>>> int unit = memcg_page_state_unit(idx);
>>>>>
>>>>> Sashiko AI made an interesting argument [1] that this could lead to
>>>>> incorrectly returning a very large positive number. Let me verify that.
>>>>>
>>>>> [1] https://sashiko.dev/#/patchset/cover.1774342371.git.zhengqi.arch%40bytedance.com
>>>>>
>>>>> Sashiko wrote:
>>>>>> Does this change inadvertently break the handling of negative byte-sized
>>>>>> updates?
>>>>>> Looking at the rest of the function:
>>>>>> if (!val || unit == PAGE_SIZE)
>>>>>> return val;
>>>>>> else
>>>>>> return max(val * unit / PAGE_SIZE, 1UL);
>>>>>
>>>>>> PAGE_SIZE is defined as an unsigned long.
>>>>>
>>>>> Right, it's defined as 1UL << PAGE_SHIFT.
>>>>>
>>>>>> When val is negative, such as during uncharging of byte-sized stats like
>>>>>> MEMCG_ZSWAP_B, the expression val * unit is a negative long.
>>>>>
>>>>> Right.
>>>>>
>>>>>> Dividing a signed long by an unsigned long causes the signed long to be
>>>>>> promoted to unsigned before division,
>>>>>
>>>>> Right.
>>>>>
>>>>>> resulting in a massive positive
>>>>>> number instead of a small negative one.
>>>>>
>>>>> Let's look at an example (assuming unit is 1).
>>>>>
>>>>> val = val * unit = -16384 (-16 KiB)
>>>>> val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
>>>>> max(0x3FFFFFFFFFFFFF, 1UL) = 0x3FFFFFFFFFF
>>>>>
>>>>> Yeah, that's a massive positive number.
>>>>>
>>>>> Hmm but how did it work when it was int?
>>>>>
>>>>> val = val * unit = -16384 (-16KiB)
>>>>> val * unit / PAGE_SIZE = 0xFFFFFFFFFFFFC000 / PAGE_SIZE = 0x3FFFFFFFFFFFFF
>>>>> max(val * unit / PAGE_SIZE, 1UL) = 0x3FFFFFFFFFFFFF
>>>>> (int)0x3FFFFFFFFFFFFF = 0xFFFFFFFF = (-1)
>>>>>
>>>>> That's incorrect. It should have been -4?
>>>>>
>>>>>> Before this change, the function returned an int, which implicitly truncated
>>>>>> the massive unsigned 64-bit result to a 32-bit int, accidentally yielding the
>>>>>> correct negative arithmetic value.
>>>>>
>>>>> So... "accidentally yielding the correct negative arithmetic value"
>>>>> is wrong.
>>>>>
>>>>> Sounds like it's been subtly broken even before this patch and nobody
>>>>> noticed.
>>>>
>>>> Thank you for such a detailed analysis! And I think you are right.
>>>
>>> No problem ;)
>>>
>>>> The memcg_state_val_in_pages() is only meant to convert @val to pages, so
>>>> perhaps we can avoid the above problem by taking the absolute value
>>>> first?
>>
>> To be clearer, here is the diff:
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 6491ca74b3398..2b34805b01476 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -787,11 +787,17 @@ static int memcg_page_state_unit(int item);
>> static long memcg_state_val_in_pages(int idx, long val)
>> {
>> int unit = memcg_page_state_unit(idx);
>> + unsigned long uval;
>> + long res;
>>
>> if (!val || unit == PAGE_SIZE)
>> return val;
>> - else
>> - return max(val * unit / PAGE_SIZE, 1UL);
>> +
>> + uval = val < 0 ? -(unsigned long)val : (unsigned long)val;
>
> We could use abs() macro in linux/math.h here :)
>
> abs() would be a problem if val were LONG_MIN, but it should be fine
> to assume it won't reach LONG_MIN.
OK, will use abs() macro.
>
>> +
>> + res = max(mult_frac(uval, unit, PAGE_SIZE), 1UL);
>> +
>> + return val < 0 ? -res : res;
>> }
>>
>> #ifdef CONFIG_MEMCG_V1
>>
>>> That would work for memcg_rstat_updated(),
>>> but not for trace_mod_memcg_state()?
>>
>> Why? The trace_mod_memcg_state() seems to accept 'long' type,
>> am I missing something?
>
> Nevermind. Without the diff, I misunderstood what you meant.
OK, will apply this diff and send the v2.
Thanks,
Qi
>
> Thanks!
>
On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
>
> The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
> reparent non-hierarchical stats, the values passed to them might exceed
> the upper limit of the type int, so correct the val parameter type of them
Why might they? What precipitated this change?
> to long.
Why is it a signed value?
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> include/trace/events/memcg.h | 10 +++++-----
> mm/memcontrol.c | 8 ++++----
> 2 files changed, 9 insertions(+), 9 deletions(-)
>
> diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
> index dfe2f51019b4c..51b62c5931fc2 100644
> --- a/include/trace/events/memcg.h
> +++ b/include/trace/events/memcg.h
> @@ -11,14 +11,14 @@
>
> DECLARE_EVENT_CLASS(memcg_rstat_stats,
>
> - TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> + TP_PROTO(struct mem_cgroup *memcg, int item, long val),
>
> TP_ARGS(memcg, item, val),
>
> TP_STRUCT__entry(
> __field(u64, id)
> __field(int, item)
> - __field(int, val)
> + __field(long, val)
> ),
>
> TP_fast_assign(
> @@ -27,20 +27,20 @@ DECLARE_EVENT_CLASS(memcg_rstat_stats,
> __entry->val = val;
> ),
>
> - TP_printk("memcg_id=%llu item=%d val=%d",
> + TP_printk("memcg_id=%llu item=%d val=%ld",
> __entry->id, __entry->item, __entry->val)
> );
>
> DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state,
>
> - TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> + TP_PROTO(struct mem_cgroup *memcg, int item, long val),
>
> TP_ARGS(memcg, item, val)
> );
>
> DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state,
>
> - TP_PROTO(struct mem_cgroup *memcg, int item, int val),
> + TP_PROTO(struct mem_cgroup *memcg, int item, long val),
>
> TP_ARGS(memcg, item, val)
> );
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7fb9cbc10dfbb..4a78550f6174e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -527,7 +527,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>
> #ifdef CONFIG_MEMCG_V1
> static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> - enum node_stat_item idx, int val);
> + enum node_stat_item idx, long val);
>
> void reparent_memcg_lruvec_state_local(struct mem_cgroup *memcg,
> struct mem_cgroup *parent, int idx)
> @@ -784,7 +784,7 @@ static int memcg_page_state_unit(int item);
> * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
> * up non-zero sub-page updates to 1 page as zero page updates are ignored.
> */
> -static int memcg_state_val_in_pages(int idx, int val)
> +static long memcg_state_val_in_pages(int idx, long val)
> {
> int unit = memcg_page_state_unit(idx);
>
> @@ -831,7 +831,7 @@ static inline void get_non_dying_memcg_end(void)
> #endif
>
> static void __mod_memcg_state(struct mem_cgroup *memcg,
> - enum memcg_stat_item idx, int val)
> + enum memcg_stat_item idx, long val)
> {
> int i = memcg_stats_index(idx);
> int cpu;
> @@ -896,7 +896,7 @@ void reparent_memcg_state_local(struct mem_cgroup *memcg,
> #endif
>
> static void __mod_memcg_lruvec_state(struct mem_cgroup_per_node *pn,
> - enum node_stat_item idx, int val)
> + enum node_stat_item idx, long val)
> {
> struct mem_cgroup *memcg = pn->memcg;
> int i = memcg_stats_index(idx);
> --
> 2.20.1
>
Cheers, Lorenzo
On Tue, Mar 24, 2026 at 12:21:06PM +0000, Lorenzo Stoakes (Oracle) wrote:
> On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
> > From: Qi Zheng <zhengqi.arch@bytedance.com>
> >
> > The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
> > reparent non-hierarchical stats, the values passed to them might exceed
> > the upper limit of the type int, so correct the val parameter type of them
>
> Why might they? What precipitated this change?
>
> > to long.
>
> Why is it a signed value?

Because 'mod' can both increase and decrease this counter. It must be
signed.
On Tue, Mar 24, 2026 at 02:12:00PM +0000, Matthew Wilcox wrote:
> On Tue, Mar 24, 2026 at 12:21:06PM +0000, Lorenzo Stoakes (Oracle) wrote:
> > On Tue, Mar 24, 2026 at 07:31:28PM +0800, Qi Zheng wrote:
> > > From: Qi Zheng <zhengqi.arch@bytedance.com>
> > >
> > > The __mod_memcg_state() and __mod_memcg_lruvec_state() were used to
> > > reparent non-hierarchical stats, the values passed to them might exceed
> > > the upper limit of the type int, so correct the val parameter type of them
> >
> > Why might they? What precipitated this change?
> >
> > > to long.
> >
> > Why is it a signed value?
>
> Because 'mod' can both increase and decrease this counter. It must be
> signed.

Right yeah noticed that in another patch :)

Cheers, Lorenzo
© 2016 - 2026 Red Hat, Inc.