[RFC 2/5] memcontrol: add boot option to enable memsw account on dfl

Jingxiang Zeng posted 5 patches 9 months ago
[RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Jingxiang Zeng 9 months ago
From: Zeng Jingxiang <linuszeng@tencent.com>

Added cgroup.memsw_account_on_dfl startup parameter, which
is off by default. When enabled in cgroupv2 mode, the memory
accounting mode of swap will be reverted to cgroupv1 mode.

Signed-off-by: Zeng Jingxiang <linuszeng@tencent.com>
---
 include/linux/memcontrol.h |  4 +++-
 mm/memcontrol.c            | 11 +++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dcb087ee6e8d..96f2fad1c351 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -62,10 +62,12 @@ struct mem_cgroup_reclaim_cookie {
 
 #ifdef CONFIG_MEMCG
 
+DECLARE_STATIC_KEY_FALSE(memsw_account_on_dfl);
 /* Whether enable memory+swap account in cgroupv2 */
 static inline bool do_memsw_account_on_dfl(void)
 {
-	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL);
+	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL)
+				|| static_branch_unlikely(&memsw_account_on_dfl);
 }
 
 #define MEM_CGROUP_ID_SHIFT	16
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 768d6b15dbfa..c1171fb2bfd6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5478,3 +5478,14 @@ static int __init mem_cgroup_swap_init(void)
 subsys_initcall(mem_cgroup_swap_init);
 
 #endif /* CONFIG_SWAP */
+
+DEFINE_STATIC_KEY_FALSE(memsw_account_on_dfl);
+static int __init memsw_account_on_dfl_setup(char *s)
+{
+	if (!strcmp(s, "1"))
+		static_branch_enable(&memsw_account_on_dfl);
+	else if (!strcmp(s, "0"))
+		static_branch_disable(&memsw_account_on_dfl);
+	return 1;
+}
+__setup("cgroup.memsw_account_on_dfl=", memsw_account_on_dfl_setup);
+
\ No newline at end of file
-- 
2.41.1
Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Shakeel Butt 9 months ago
On Wed, Mar 19, 2025 at 02:41:45PM +0800, Jingxiang Zeng wrote:
> From: Zeng Jingxiang <linuszeng@tencent.com>
> 
> Added cgroup.memsw_account_on_dfl startup parameter, which
> is off by default. When enabled in cgroupv2 mode, the memory
> accounting mode of swap will be reverted to cgroupv1 mode.
> 
> Signed-off-by: Zeng Jingxiang <linuszeng@tencent.com>
> ---
>  include/linux/memcontrol.h |  4 +++-
>  mm/memcontrol.c            | 11 +++++++++++
>  2 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index dcb087ee6e8d..96f2fad1c351 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -62,10 +62,12 @@ struct mem_cgroup_reclaim_cookie {
>  
>  #ifdef CONFIG_MEMCG
>  
> +DECLARE_STATIC_KEY_FALSE(memsw_account_on_dfl);
>  /* Whether enable memory+swap account in cgroupv2 */
>  static inline bool do_memsw_account_on_dfl(void)
>  {
> -	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL);
> +	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL)
> +				|| static_branch_unlikely(&memsw_account_on_dfl);

Why || in above condition? Shouldn't it be && ?

>  }
>  
>  #define MEM_CGROUP_ID_SHIFT	16
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 768d6b15dbfa..c1171fb2bfd6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5478,3 +5478,14 @@ static int __init mem_cgroup_swap_init(void)
>  subsys_initcall(mem_cgroup_swap_init);
>  
>  #endif /* CONFIG_SWAP */
> +
> +DEFINE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> +static int __init memsw_account_on_dfl_setup(char *s)
> +{
> +	if (!strcmp(s, "1"))
> +		static_branch_enable(&memsw_account_on_dfl);
> +	else if (!strcmp(s, "0"))
> +		static_branch_disable(&memsw_account_on_dfl);
> +	return 1;
> +}
> +__setup("cgroup.memsw_account_on_dfl=", memsw_account_on_dfl_setup);

Please keep the above in memcontrol-v1.c

> +
> \ No newline at end of file
> -- 
> 2.41.1
>
Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by jingxiang zeng 9 months ago
On Thu, 20 Mar 2025 at 03:34, Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Wed, Mar 19, 2025 at 02:41:45PM +0800, Jingxiang Zeng wrote:
> > From: Zeng Jingxiang <linuszeng@tencent.com>
> >
> > Added cgroup.memsw_account_on_dfl startup parameter, which
> > is off by default. When enabled in cgroupv2 mode, the memory
> > accounting mode of swap will be reverted to cgroupv1 mode.
> >
> > Signed-off-by: Zeng Jingxiang <linuszeng@tencent.com>
> > ---
> >  include/linux/memcontrol.h |  4 +++-
> >  mm/memcontrol.c            | 11 +++++++++++
> >  2 files changed, 14 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index dcb087ee6e8d..96f2fad1c351 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -62,10 +62,12 @@ struct mem_cgroup_reclaim_cookie {
> >
> >  #ifdef CONFIG_MEMCG
> >
> > +DECLARE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> >  /* Whether enable memory+swap account in cgroupv2 */
> >  static inline bool do_memsw_account_on_dfl(void)
> >  {
> > -     return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL);
> > +     return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL)
> > +                             || static_branch_unlikely(&memsw_account_on_dfl);
>
> Why || in above condition? Shouldn't it be && ?

It seems that changing it to && is better, Thanks.
>
> >  }
> >
> >  #define MEM_CGROUP_ID_SHIFT  16
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 768d6b15dbfa..c1171fb2bfd6 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -5478,3 +5478,14 @@ static int __init mem_cgroup_swap_init(void)
> >  subsys_initcall(mem_cgroup_swap_init);
> >
> >  #endif /* CONFIG_SWAP */
> > +
> > +DEFINE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> > +static int __init memsw_account_on_dfl_setup(char *s)
> > +{
> > +     if (!strcmp(s, "1"))
> > +             static_branch_enable(&memsw_account_on_dfl);
> > +     else if (!strcmp(s, "0"))
> > +             static_branch_disable(&memsw_account_on_dfl);
> > +     return 1;
> > +}
> > +__setup("cgroup.memsw_account_on_dfl=", memsw_account_on_dfl_setup);
>
> Please keep the above in memcontrol-v1.c
>
> > +
> > \ No newline at end of file
> > --
> > 2.41.1
> >
>
Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Roman Gushchin 9 months ago
Shakeel Butt <shakeel.butt@linux.dev> writes:

> On Wed, Mar 19, 2025 at 02:41:45PM +0800, Jingxiang Zeng wrote:
>> From: Zeng Jingxiang <linuszeng@tencent.com>
>> 
>> Added cgroup.memsw_account_on_dfl startup parameter, which
>> is off by default. When enabled in cgroupv2 mode, the memory
>> accounting mode of swap will be reverted to cgroupv1 mode.
>> 
>> Signed-off-by: Zeng Jingxiang <linuszeng@tencent.com>
>> ---
>>  include/linux/memcontrol.h |  4 +++-
>>  mm/memcontrol.c            | 11 +++++++++++
>>  2 files changed, 14 insertions(+), 1 deletion(-)
>> 
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index dcb087ee6e8d..96f2fad1c351 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -62,10 +62,12 @@ struct mem_cgroup_reclaim_cookie {
>>  
>>  #ifdef CONFIG_MEMCG
>>  
>> +DECLARE_STATIC_KEY_FALSE(memsw_account_on_dfl);
>>  /* Whether enable memory+swap account in cgroupv2 */
>>  static inline bool do_memsw_account_on_dfl(void)
>>  {
>> -	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL);
>> +	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL)
>> +				|| static_branch_unlikely(&memsw_account_on_dfl);
>
> Why || in above condition? Shouldn't it be && ?
>
>>  }
>>  
>>  #define MEM_CGROUP_ID_SHIFT	16
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 768d6b15dbfa..c1171fb2bfd6 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -5478,3 +5478,14 @@ static int __init mem_cgroup_swap_init(void)
>>  subsys_initcall(mem_cgroup_swap_init);
>>  
>>  #endif /* CONFIG_SWAP */
>> +
>> +DEFINE_STATIC_KEY_FALSE(memsw_account_on_dfl);
>> +static int __init memsw_account_on_dfl_setup(char *s)
>> +{
>> +	if (!strcmp(s, "1"))
>> +		static_branch_enable(&memsw_account_on_dfl);
>> +	else if (!strcmp(s, "0"))
>> +		static_branch_disable(&memsw_account_on_dfl);
>> +	return 1;
>> +}
>> +__setup("cgroup.memsw_account_on_dfl=", memsw_account_on_dfl_setup);
>
> Please keep the above in memcontrol-v1.c

Hm, I'm not sure about this. This feature might be actually useful with
cgroup v2, as some companies are dependent on the old cgroup v1
semantics here but otherwise would prefer to move to v2.
In other words, I see it as a cgroup v2 feature, not as a cgroup v1.
So there is no reason to move it into the cgroup v1 code.

I think it deserves a separate config option (if we're really concerned
about the memory overhead in struct mem_cgroup) or IMO better a
boot/mount time option.

Thanks!
Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Johannes Weiner 9 months ago
On Wed, Mar 19, 2025 at 10:30:20PM +0000, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
> 
> > On Wed, Mar 19, 2025 at 02:41:45PM +0800, Jingxiang Zeng wrote:
> >> From: Zeng Jingxiang <linuszeng@tencent.com>
> >> 
> >> Added cgroup.memsw_account_on_dfl startup parameter, which
> >> is off by default. When enabled in cgroupv2 mode, the memory
> >> accounting mode of swap will be reverted to cgroupv1 mode.
> >> 
> >> Signed-off-by: Zeng Jingxiang <linuszeng@tencent.com>
> >> ---
> >>  include/linux/memcontrol.h |  4 +++-
> >>  mm/memcontrol.c            | 11 +++++++++++
> >>  2 files changed, 14 insertions(+), 1 deletion(-)
> >> 
> >> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> >> index dcb087ee6e8d..96f2fad1c351 100644
> >> --- a/include/linux/memcontrol.h
> >> +++ b/include/linux/memcontrol.h
> >> @@ -62,10 +62,12 @@ struct mem_cgroup_reclaim_cookie {
> >>  
> >>  #ifdef CONFIG_MEMCG
> >>  
> >> +DECLARE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> >>  /* Whether enable memory+swap account in cgroupv2 */
> >>  static inline bool do_memsw_account_on_dfl(void)
> >>  {
> >> -	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL);
> >> +	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL)
> >> +				|| static_branch_unlikely(&memsw_account_on_dfl);
> >
> > Why || in above condition? Shouldn't it be && ?
> >
> >>  }
> >>  
> >>  #define MEM_CGROUP_ID_SHIFT	16
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >> index 768d6b15dbfa..c1171fb2bfd6 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >> @@ -5478,3 +5478,14 @@ static int __init mem_cgroup_swap_init(void)
> >>  subsys_initcall(mem_cgroup_swap_init);
> >>  
> >>  #endif /* CONFIG_SWAP */
> >> +
> >> +DEFINE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> >> +static int __init memsw_account_on_dfl_setup(char *s)
> >> +{
> >> +	if (!strcmp(s, "1"))
> >> +		static_branch_enable(&memsw_account_on_dfl);
> >> +	else if (!strcmp(s, "0"))
> >> +		static_branch_disable(&memsw_account_on_dfl);
> >> +	return 1;
> >> +}
> >> +__setup("cgroup.memsw_account_on_dfl=", memsw_account_on_dfl_setup);
> >
> > Please keep the above in memcontrol-v1.c
> 
> Hm, I'm not sure about this. This feature might be actually useful with
> cgroup v2, as some companies are dependent on the old cgroup v1
> semantics here but otherwise would prefer to move to v2.
> In other words, I see it as a cgroup v2 feature, not as a cgroup v1.
> So there is no reason to move it into the cgroup v1 code.

Agreed. Let's think of this proposal as making memsw tracking and
control a full-fledged v2 feature.

> I think it deserves a separate config option (if we're really concerned
> about the memory overhead in struct mem_cgroup) or IMO better a
> boot/mount time option.

Yeah, a config option forces distros to enable it :/

I'm hesitant to agree with making it optional in any manner. If you
consider the functionality that is implemented, the overhead should be
fairly minimal. It isn't right now, because page_counter contains a
ton of stuff that isn't applicable to this new user. That overhead is
still paid for unnecessarily by users who _do_ need to enable it.

It seems like a good opportunity to refactor struct page_counter.
Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Shakeel Butt 9 months ago
On Thu, Mar 20, 2025 at 10:28:46AM -0400, Johannes Weiner wrote:
> On Wed, Mar 19, 2025 at 10:30:20PM +0000, Roman Gushchin wrote:
> > Shakeel Butt <shakeel.butt@linux.dev> writes:
> > 
> > > On Wed, Mar 19, 2025 at 02:41:45PM +0800, Jingxiang Zeng wrote:
> > >> From: Zeng Jingxiang <linuszeng@tencent.com>
> > >> 
> > >> Added cgroup.memsw_account_on_dfl startup parameter, which
> > >> is off by default. When enabled in cgroupv2 mode, the memory
> > >> accounting mode of swap will be reverted to cgroupv1 mode.
> > >> 
> > >> Signed-off-by: Zeng Jingxiang <linuszeng@tencent.com>
> > >> ---
> > >>  include/linux/memcontrol.h |  4 +++-
> > >>  mm/memcontrol.c            | 11 +++++++++++
> > >>  2 files changed, 14 insertions(+), 1 deletion(-)
> > >> 
> > >> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > >> index dcb087ee6e8d..96f2fad1c351 100644
> > >> --- a/include/linux/memcontrol.h
> > >> +++ b/include/linux/memcontrol.h
> > >> @@ -62,10 +62,12 @@ struct mem_cgroup_reclaim_cookie {
> > >>  
> > >>  #ifdef CONFIG_MEMCG
> > >>  
> > >> +DECLARE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> > >>  /* Whether enable memory+swap account in cgroupv2 */
> > >>  static inline bool do_memsw_account_on_dfl(void)
> > >>  {
> > >> -	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL);
> > >> +	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL)
> > >> +				|| static_branch_unlikely(&memsw_account_on_dfl);
> > >
> > > Why || in above condition? Shouldn't it be && ?
> > >
> > >>  }
> > >>  
> > >>  #define MEM_CGROUP_ID_SHIFT	16
> > >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > >> index 768d6b15dbfa..c1171fb2bfd6 100644
> > >> --- a/mm/memcontrol.c
> > >> +++ b/mm/memcontrol.c
> > >> @@ -5478,3 +5478,14 @@ static int __init mem_cgroup_swap_init(void)
> > >>  subsys_initcall(mem_cgroup_swap_init);
> > >>  
> > >>  #endif /* CONFIG_SWAP */
> > >> +
> > >> +DEFINE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> > >> +static int __init memsw_account_on_dfl_setup(char *s)
> > >> +{
> > >> +	if (!strcmp(s, "1"))
> > >> +		static_branch_enable(&memsw_account_on_dfl);
> > >> +	else if (!strcmp(s, "0"))
> > >> +		static_branch_disable(&memsw_account_on_dfl);
> > >> +	return 1;
> > >> +}
> > >> +__setup("cgroup.memsw_account_on_dfl=", memsw_account_on_dfl_setup);
> > >
> > > Please keep the above in memcontrol-v1.c
> > 
> > Hm, I'm not sure about this. This feature might be actually useful with
> > cgroup v2, as some companies are dependent on the old cgroup v1
> > semantics here but otherwise would prefer to move to v2.
> > In other words, I see it as a cgroup v2 feature, not as a cgroup v1.
> > So there is no reason to move it into the cgroup v1 code.
> 
> Agreed. Let's think of this proposal as making memsw tracking and
> control a full-fledged v2 feature.
> 

Sounds good. However I want us to discuss and decide the semantics of
memsw from scratch rather than adopting v1 semantics. Particularly I
don't like the failure of setting memsw limit on v1. Also we should
discuss how memsw and swap limits would interact and what would be the
appropriate default.
Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Michal Koutný 8 months, 2 weeks ago
On Thu, Mar 20, 2025 at 08:33:09AM -0700, Shakeel Butt <shakeel.butt@linux.dev> wrote:
> However I want us to discuss and decide the semantics of
> memsw from scratch rather than adopting v1 semantics.

+1

> Also we should discuss how memsw and swap limits would interact and
> what would be the appropriate default.

Besides more complicated implementation, merged memsw won't represent an
actual resource.

So I'd be interested in use cases (other than "used to it from v1") that
cannot be controlled with separate memory. and swap. limits.


0.02€,
Michal
Re: [External] Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Zhongkun He 8 months, 2 weeks ago
On Wed, Apr 2, 2025 at 9:42 PM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Thu, Mar 20, 2025 at 08:33:09AM -0700, Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > However I want us to discuss and decide the semantics of
> > memsw from scratch rather than adopting v1 semantics.
>
> +1
>
> > Also we should discuss how memsw and swap limits would interact and
> > what would be the appropriate default.
>
> Besides more complicated implementation, merged memsw won't represent an
> actual resource.
>
> So I'd be interested in use cases (other than "used to it from v1") that
> cannot be controlled with separate memory. and swap. limits.
>

Hi Michal

We encountered an issue, which is also a real use case. With memory offloading,
we can move some cold pages to swap. Suppose an application’s peak memory
usage at certain times is 10GB, while at other times, it exists in a
combination of
memory and swap. If we set limits on memory or swap separately, it would lack
flexibility—sometimes it needs 1GB memory + 9GB swap, sometimes 5GB
memory + 5GB swap, or even 10GB memory + 0GB swap. Therefore, we strongly
hope to use the mem+swap charging method in cgroupv2

>
> 0.02€,
> Michal
Re: [External] Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by jingxiang zeng 8 months, 2 weeks ago
On Thu, 3 Apr 2025 at 15:47, Zhongkun He <hezhongkun.hzk@bytedance.com> wrote:
>
> On Wed, Apr 2, 2025 at 9:42 PM Michal Koutný <mkoutny@suse.com> wrote:
> >
> > On Thu, Mar 20, 2025 at 08:33:09AM -0700, Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > However I want us to discuss and decide the semantics of
> > > memsw from scratch rather than adopting v1 semantics.
> >
> > +1
> >
> > > Also we should discuss how memsw and swap limits would interact and
> > > what would be the appropriate default.
> >
> > Besides more complicated implementation, merged memsw won't represent an
> > actual resource.
> >
> > So I'd be interested in use cases (other than "used to it from v1") that
> > cannot be controlled with separate memory. and swap. limits.
> >
>
> Hi Michal
>
> We encountered an issue, which is also a real use case. With memory offloading,
> we can move some cold pages to swap. Suppose an application’s peak memory
> usage at certain times is 10GB, while at other times, it exists in a
> combination of
> memory and swap. If we set limits on memory or swap separately, it would lack
> flexibility—sometimes it needs 1GB memory + 9GB swap, sometimes 5GB
> memory + 5GB swap, or even 10GB memory + 0GB swap. Therefore, we strongly
> hope to use the mem+swap charging method in cgroupv2
>
> >
> > 0.02€,
> > Michal

Yes, in the container scenario, if swap is enabled on the server and
the customer's
container requires 10GB of memory, we only need to set
memory.memsw.limit_in_bytes=10GB, and the kernel can automatically swap out
part of the business container's memory to swap according to the server's memory
pressure, and it can be fully guaranteed that the customer's container
will not use
more memory because swap is enabled on the server.
>
Re: [External] Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Michal Koutný 8 months, 1 week ago
On Thu, Apr 03, 2025 at 05:16:45PM +0800, jingxiang zeng <jingxiangzeng.cas@gmail.com> wrote:
> > We encountered an issue, which is also a real use case. With memory offloading,
> > we can move some cold pages to swap. Suppose an application’s peak memory
> > usage at certain times is 10GB, while at other times, it exists in a
> > combination of
> > memory and swap. If we set limits on memory or swap separately, it would lack
> > flexibility—sometimes it needs 1GB memory + 9GB swap, sometimes 5GB
> > memory + 5GB swap, or even 10GB memory + 0GB swap. Therefore, we strongly
> > hope to use the mem+swap charging method in cgroupv2

App's peak need determines memory.max=10G.
The apparent flexibility is dependency on how much competitors the app
has. It can run 5GB memory + 5GB swap with some competition or 1GB
memory + 9 GB with different competition (more memory demanding).
If you want to prevent faulty app to eating up all of swap for itself
(like it's possible with memsw), you may define some memory.swap.max.
(There's no unique correspondence between this and original memsw value
since the cost of mem<->swap is variable.)


> Yes, in the container scenario, if swap is enabled on the server and
> the customer's container requires 10GB of memory, we only need to set
> memory.memsw.limit_in_bytes=10GB, and the kernel can automatically
> swap out part of the business container's memory to swap according to
> the server's memory pressure, and it can be fully guaranteed that the
> customer's container will not use more memory because swap is enabled
> on the server.

This made me consider various causes of the pressure:

- global pressure
  - it doesn't change memcg's total consuption (memsw.usage=const)
  - memsw limit does nothing
- self-memcg pressure
  - new allocations against own limit and memsw.usage hits memsw.limit
  - memsw.limit prevents new allocations that would extend swap
  - achievable with memory.swap.max=0
- ancestral pressure 
  - when sibling needs to allocate but limit is on ancestor
  - similar to global pressure (memsw.usage=const), self memsw.limit
    does nothing

- or there is no outer pressure but you want to prevent new allocations
  when something has been swapped out already
  - swapped out amount is a debt
    - memsw.limit behavior is suboptimal until the debt needs to be
      repaid
      - repay is when someone else needs the swap space

The above is a free flow of thoughts but I'd condense such conversions:
- memory.max := memory.memsw.limit_in_bytes
- memory.swap.max := anything between 0 and memory.memsw.limit_in_bytes

Did I fail to capture some mode where memsw limits were superior?

Thanks,
Michal
Re: [External] Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by jingxiang zeng 8 months ago
On Sat, 12 Apr 2025 at 00:57, Michal Koutný <mkoutny@suse.com> wrote:
>
> On Thu, Apr 03, 2025 at 05:16:45PM +0800, jingxiang zeng <jingxiangzeng.cas@gmail.com> wrote:
> > > We encountered an issue, which is also a real use case. With memory offloading,
> > > we can move some cold pages to swap. Suppose an application’s peak memory
> > > usage at certain times is 10GB, while at other times, it exists in a
> > > combination of
> > > memory and swap. If we set limits on memory or swap separately, it would lack
> > > flexibility—sometimes it needs 1GB memory + 9GB swap, sometimes 5GB
> > > memory + 5GB swap, or even 10GB memory + 0GB swap. Therefore, we strongly
> > > hope to use the mem+swap charging method in cgroupv2
>
> App's peak need determines memory.max=10G.
> The apparent flexibility is dependency on how much competitors the app
> has. It can run 5GB memory + 5GB swap with some competition or 1GB
> memory + 9 GB with different competition (more memory demanding).
> If you want to prevent faulty app to eating up all of swap for itself
> (like it's possible with memsw), you may define some memory.swap.max.
> (There's no unique correspondence between this and original memsw value
> since the cost of mem<->swap is variable.)
>
>
> > Yes, in the container scenario, if swap is enabled on the server and
> > the customer's container requires 10GB of memory, we only need to set
> > memory.memsw.limit_in_bytes=10GB, and the kernel can automatically
> > swap out part of the business container's memory to swap according to
> > the server's memory pressure, and it can be fully guaranteed that the
> > customer's container will not use more memory because swap is enabled
> > on the server.
>
> This made me consider various causes of the pressure:
>
> - global pressure
>   - it doesn't change memcg's total consuption (memsw.usage=const)
>   - memsw limit does nothing
> - self-memcg pressure
>   - new allocations against own limit and memsw.usage hits memsw.limit
>   - memsw.limit prevents new allocations that would extend swap
>   - achievable with memory.swap.max=0
> - ancestral pressure
>   - when sibling needs to allocate but limit is on ancestor
>   - similar to global pressure (memsw.usage=const), self memsw.limit
>     does nothing
>
> - or there is no outer pressure but you want to prevent new allocations
>   when something has been swapped out already
>   - swapped out amount is a debt
>     - memsw.limit behavior is suboptimal until the debt needs to be
>       repaid
>       - repay is when someone else needs the swap space
>
> The above is a free flow of thoughts but I'd condense such conversions:
> - memory.max := memory.memsw.limit_in_bytes
> - memory.swap.max := anything between 0 and memory.memsw.limit_in_bytes
>
> Did I fail to capture some mode where memsw limits were superior?
>
Hi, Michal

In fact, the memsw counter is mainly effective in proactive memory offload
scenarios.

For example, the current container memory usage is as follows:
memory.limit_in_bytes = 10GB
memory.usage_in_bytes = 9GB

Theoretically, through the memory.reclaim proactive reclaim interface, the
memory usage of [0GB, 9GB] can be reclaimed to the swap, so:
memory.limit_in_bytes = 10GB
memory.usage_in_bytes = 9GB - [0GB, 9GB]

In the case of proactive memory offload, the amount of memory that can be
reclaimed is determined by the container's PSI and other indicators. It is
difficult to set an accurate memory.swap.max value.
memory.swap.current = [0GB, 9GB]
memory.swap.max = ?

The memory space saved by swapping out to swap can continue to load
the operation of system components or more workloads.
memory.limit_in_bytes = 10GB
memory.usage_in_bytes = 9GB - [0GB, 9GB]
memory.swap.current = [0GB, 9GB]

The memory usage of memory.usage_in_bytes is reduced due to proactive
offload to swap, which will cause additional problems, such as:
1. There may be some memory leaks or abnormal imported network traffic
in the container, which may cause OOM to fail to trigger or be triggered late;
2. In the oversold scenario, if the container's memory requirement is 10GB,
the container's memory+swap should only use 10GB.

In the above scenario, the memsw counter is very useful:
memory.limit_in_bytes = 10GB
memory.usage_in_bytes = 9GB - [0GB, 9GB]

memory.memsw.limit_in_bytes = 10GB
memory.memsw.usage_in_bytes = 9GB

Above, thanks.
> Thanks,
> Michal
Re: [External] Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Michal Koutný 7 months, 2 weeks ago
(Excuse my long turnarounds, I assume this needs time.)

On Wed, Apr 16, 2025 at 04:29:13PM +0800, jingxiang zeng <jingxiangzeng.cas@gmail.com> wrote:
> ...
> In fact, the memsw counter is mainly effective in proactive memory offload
> scenarios.

Is that some downstream experience or aspiration? Because there's
no kernel where memsw and proactive reclaim coexist.

> It is difficult to set an accurate memory.swap.max value.
> memory.swap.current = [0GB, 9GB]
> memory.swap.max = ?

I likely don't understand, I'd consider the value of 10GB in this
case...

> The memory space saved by swapping out to swap can continue to load
> the operation of system components or more workloads.
> memory.limit_in_bytes = 10GB
> memory.usage_in_bytes = 9GB - [0GB, 9GB]
> memory.swap.current = [0GB, 9GB]
> 
> The memory usage of memory.usage_in_bytes is reduced due to proactive
> offload to swap, which will cause additional problems, such as:
> 1. There may be some memory leaks or abnormal imported network traffic
> in the container, which may cause OOM to fail to trigger or be triggered late;
> 2. In the oversold scenario, if the container's memory requirement is 10GB,
> the container's memory+swap should only use 10GB.

...which would mean per-container usage doesn't exceed 20GB (leaks
remain bound).
Apparently, you could fit less containers but would they be actually
running (memsw doesn't guarantee that). Or what is problematic with
different treatment of overcommit between memsw vs .max and .swap.max?

Regards,
Michal
Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by Roman Gushchin 9 months ago
Johannes Weiner <hannes@cmpxchg.org> writes:

> On Wed, Mar 19, 2025 at 10:30:20PM +0000, Roman Gushchin wrote:
>> Shakeel Butt <shakeel.butt@linux.dev> writes:
>> 
>> > On Wed, Mar 19, 2025 at 02:41:45PM +0800, Jingxiang Zeng wrote:
>> >> From: Zeng Jingxiang <linuszeng@tencent.com>
>> >> 
>> >> Added cgroup.memsw_account_on_dfl startup parameter, which
>> >> is off by default. When enabled in cgroupv2 mode, the memory
>> >> accounting mode of swap will be reverted to cgroupv1 mode.
>> >> 
>> >> Signed-off-by: Zeng Jingxiang <linuszeng@tencent.com>
>> >> ---
>> >>  include/linux/memcontrol.h |  4 +++-
>> >>  mm/memcontrol.c            | 11 +++++++++++
>> >>  2 files changed, 14 insertions(+), 1 deletion(-)
>> >> 
>> >> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> >> index dcb087ee6e8d..96f2fad1c351 100644
>> >> --- a/include/linux/memcontrol.h
>> >> +++ b/include/linux/memcontrol.h
>> >> @@ -62,10 +62,12 @@ struct mem_cgroup_reclaim_cookie {
>> >>  
>> >>  #ifdef CONFIG_MEMCG
>> >>  
>> >> +DECLARE_STATIC_KEY_FALSE(memsw_account_on_dfl);
>> >>  /* Whether enable memory+swap account in cgroupv2 */
>> >>  static inline bool do_memsw_account_on_dfl(void)
>> >>  {
>> >> -	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL);
>> >> +	return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL)
>> >> +				|| static_branch_unlikely(&memsw_account_on_dfl);
>> >
>> > Why || in above condition? Shouldn't it be && ?
>> >
>> >>  }
>> >>  
>> >>  #define MEM_CGROUP_ID_SHIFT	16
>> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> >> index 768d6b15dbfa..c1171fb2bfd6 100644
>> >> --- a/mm/memcontrol.c
>> >> +++ b/mm/memcontrol.c
>> >> @@ -5478,3 +5478,14 @@ static int __init mem_cgroup_swap_init(void)
>> >>  subsys_initcall(mem_cgroup_swap_init);
>> >>  
>> >>  #endif /* CONFIG_SWAP */
>> >> +
>> >> +DEFINE_STATIC_KEY_FALSE(memsw_account_on_dfl);
>> >> +static int __init memsw_account_on_dfl_setup(char *s)
>> >> +{
>> >> +	if (!strcmp(s, "1"))
>> >> +		static_branch_enable(&memsw_account_on_dfl);
>> >> +	else if (!strcmp(s, "0"))
>> >> +		static_branch_disable(&memsw_account_on_dfl);
>> >> +	return 1;
>> >> +}
>> >> +__setup("cgroup.memsw_account_on_dfl=", memsw_account_on_dfl_setup);
>> >
>> > Please keep the above in memcontrol-v1.c
>> 
>> Hm, I'm not sure about this. This feature might be actually useful with
>> cgroup v2, as some companies are dependent on the old cgroup v1
>> semantics here but otherwise would prefer to move to v2.
>> In other words, I see it as a cgroup v2 feature, not as a cgroup v1.
>> So there is no reason to move it into the cgroup v1 code.
>
> Agreed. Let's think of this proposal as making memsw tracking and
> control a full-fledged v2 feature.
>
>> I think it deserves a separate config option (if we're really concerned
>> about the memory overhead in struct mem_cgroup) or IMO better a
>> boot/mount time option.
>
> Yeah, a config option forces distros to enable it :/
>
> I'm hesitant to agree with making it optional in any manner. If you
> consider the functionality that is implemented, the overhead should be
> fairly minimal. It isn't right now, because page_counter contains a
> ton of stuff that isn't applicable to this new user. That overhead is
> still paid for unnecessarily by users who _do_ need to enable it.

Agree. Memcg is already huge, so another page_counter won't add a lot
percentage-wise.

>
> It seems like a good opportunity to refactor struct page_counter.

I don't think it's a hard dependency here, but otherwise fully agree.

Thanks!
Re: [RFC 2/5] memcontrol: add boot option to enable memsw account on dfl
Posted by jingxiang zeng 9 months ago
On Thu, 20 Mar 2025 at 06:32, Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> Shakeel Butt <shakeel.butt@linux.dev> writes:
>
> > On Wed, Mar 19, 2025 at 02:41:45PM +0800, Jingxiang Zeng wrote:
> >> From: Zeng Jingxiang <linuszeng@tencent.com>
> >>
> >> Added cgroup.memsw_account_on_dfl startup parameter, which
> >> is off by default. When enabled in cgroupv2 mode, the memory
> >> accounting mode of swap will be reverted to cgroupv1 mode.
> >>
> >> Signed-off-by: Zeng Jingxiang <linuszeng@tencent.com>
> >> ---
> >>  include/linux/memcontrol.h |  4 +++-
> >>  mm/memcontrol.c            | 11 +++++++++++
> >>  2 files changed, 14 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> >> index dcb087ee6e8d..96f2fad1c351 100644
> >> --- a/include/linux/memcontrol.h
> >> +++ b/include/linux/memcontrol.h
> >> @@ -62,10 +62,12 @@ struct mem_cgroup_reclaim_cookie {
> >>
> >>  #ifdef CONFIG_MEMCG
> >>
> >> +DECLARE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> >>  /* Whether enable memory+swap account in cgroupv2 */
> >>  static inline bool do_memsw_account_on_dfl(void)
> >>  {
> >> -    return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL);
> >> +    return IS_ENABLED(CONFIG_MEMSW_ACCOUNT_ON_DFL)
> >> +                            || static_branch_unlikely(&memsw_account_on_dfl);
> >
> > Why || in above condition? Shouldn't it be && ?
> >
> >>  }
> >>
> >>  #define MEM_CGROUP_ID_SHIFT 16
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >> index 768d6b15dbfa..c1171fb2bfd6 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >> @@ -5478,3 +5478,14 @@ static int __init mem_cgroup_swap_init(void)
> >>  subsys_initcall(mem_cgroup_swap_init);
> >>
> >>  #endif /* CONFIG_SWAP */
> >> +
> >> +DEFINE_STATIC_KEY_FALSE(memsw_account_on_dfl);
> >> +static int __init memsw_account_on_dfl_setup(char *s)
> >> +{
> >> +    if (!strcmp(s, "1"))
> >> +            static_branch_enable(&memsw_account_on_dfl);
> >> +    else if (!strcmp(s, "0"))
> >> +            static_branch_disable(&memsw_account_on_dfl);
> >> +    return 1;
> >> +}
> >> +__setup("cgroup.memsw_account_on_dfl=", memsw_account_on_dfl_setup);
> >
> > Please keep the above in memcontrol-v1.c
>
> Hm, I'm not sure about this. This feature might be actually useful with
> cgroup v2, as some companies are dependent on the old cgroup v1
> semantics here but otherwise would prefer to move to v2.
> In other words, I see it as a cgroup v2 feature, not as a cgroup v1.
> So there is no reason to move it into the cgroup v1 code.

Yes, this is mainly intended for use with v2.
>
> I think it deserves a separate config option (if we're really concerned
> about the memory overhead in struct mem_cgroup) or IMO better a
> boot/mount time option.
>
> Thanks!
>