When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
sk->sk_memcg based on the current task.
MPTCP subflow socket creation is triggered from userspace or
an in-kernel worker.
In the latter case, sk->sk_memcg is not what we want. So, we fix
it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
under #ifdef CONFIG_SOCK_CGROUP_DATA.
The two configs are orthogonal. If CONFIG_MEMCG is enabled without
CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
correctly.
Let's wrap sock_create_kern() for subflow with set_active_memcg()
using the parent sk->sk_memcg.
Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
Suggested-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
mm/memcontrol.c | 5 ++++-
net/mptcp/subflow.c | 11 +++--------
2 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8dd7fbed5a94..450862e7fd7a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
if (!in_task())
return;
+ memcg = current->active_memcg;
+
rcu_read_lock();
- memcg = mem_cgroup_from_task(current);
+ if (likely(!memcg))
+ memcg = mem_cgroup_from_task(current);
if (mem_cgroup_is_root(memcg))
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 3f1b62a9fe88..a4809054ea6c 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1717,14 +1717,6 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
/* only the additional subflows created by kworkers have to be modified */
if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
cgroup_id(sock_cgroup_ptr(child_skcd))) {
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = parent->sk_memcg;
-
- mem_cgroup_sk_free(child);
- if (memcg && css_tryget(&memcg->css))
- child->sk_memcg = memcg;
-#endif /* CONFIG_MEMCG */
-
cgroup_sk_free(child_skcd);
*child_skcd = *parent_skcd;
cgroup_sk_clone(child_skcd);
@@ -1757,6 +1749,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
{
struct mptcp_subflow_context *subflow;
struct net *net = sock_net(sk);
+ struct mem_cgroup *memcg;
struct socket *sf;
int err;
@@ -1766,7 +1759,9 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
if (unlikely(!sk->sk_socket))
return -EINVAL;
+ memcg = set_active_memcg(sk->sk_memcg);
err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf);
+ set_active_memcg(memcg);
if (err)
return err;
--
2.51.0.rc1.163.g2494970778-goog
On Thu, Aug 14, 2025 at 08:08:33PM +0000, Kuniyuki Iwashima wrote:
> When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
> sk->sk_memcg based on the current task.
>
> MPTCP subflow socket creation is triggered from userspace or
> an in-kernel worker.
>
> In the latter case, sk->sk_memcg is not what we want. So, we fix
> it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
>
> Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
> under #ifdef CONFIG_SOCK_CGROUP_DATA.
>
> The two configs are orthogonal. If CONFIG_MEMCG is enabled without
> CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
> correctly.
>
> Let's wrap sock_create_kern() for subflow with set_active_memcg()
> using the parent sk->sk_memcg.
>
> Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
> Suggested-by: Michal Koutný <mkoutny@suse.com>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> ---
> mm/memcontrol.c | 5 ++++-
> net/mptcp/subflow.c | 11 +++--------
> 2 files changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8dd7fbed5a94..450862e7fd7a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
> if (!in_task())
> return;
>
> + memcg = current->active_memcg;
> +
Use active_memcg() instead of current->active_memcg and do before the
!in_task() check.
Basically something like following:
memcg = active_memcg();
/* Do not associate the sock with unrelated interrupted task's memcg. */
if (!in_task() && !memcg)
return;
> rcu_read_lock();
> - memcg = mem_cgroup_from_task(current);
> + if (likely(!memcg))
> + memcg = mem_cgroup_from_task(current);
> if (mem_cgroup_is_root(memcg))
> goto out;
> if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
> diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
> index 3f1b62a9fe88..a4809054ea6c 100644
> --- a/net/mptcp/subflow.c
> +++ b/net/mptcp/subflow.c
> @@ -1717,14 +1717,6 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
> /* only the additional subflows created by kworkers have to be modified */
> if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
> cgroup_id(sock_cgroup_ptr(child_skcd))) {
> -#ifdef CONFIG_MEMCG
> - struct mem_cgroup *memcg = parent->sk_memcg;
> -
> - mem_cgroup_sk_free(child);
> - if (memcg && css_tryget(&memcg->css))
> - child->sk_memcg = memcg;
> -#endif /* CONFIG_MEMCG */
> -
> cgroup_sk_free(child_skcd);
> *child_skcd = *parent_skcd;
> cgroup_sk_clone(child_skcd);
> @@ -1757,6 +1749,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
> {
> struct mptcp_subflow_context *subflow;
> struct net *net = sock_net(sk);
> + struct mem_cgroup *memcg;
> struct socket *sf;
> int err;
>
> @@ -1766,7 +1759,9 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
> if (unlikely(!sk->sk_socket))
> return -EINVAL;
>
> + memcg = set_active_memcg(sk->sk_memcg);
> err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf);
> + set_active_memcg(memcg);
> if (err)
> return err;
>
> --
> 2.51.0.rc1.163.g2494970778-goog
>
On Thu, Aug 14, 2025 at 2:44 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Aug 14, 2025 at 08:08:33PM +0000, Kuniyuki Iwashima wrote:
> > When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
> > sk->sk_memcg based on the current task.
> >
> > MPTCP subflow socket creation is triggered from userspace or
> > an in-kernel worker.
> >
> > In the latter case, sk->sk_memcg is not what we want. So, we fix
> > it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
> >
> > Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
> > under #ifdef CONFIG_SOCK_CGROUP_DATA.
> >
> > The two configs are orthogonal. If CONFIG_MEMCG is enabled without
> > CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
> > correctly.
> >
> > Let's wrap sock_create_kern() for subflow with set_active_memcg()
> > using the parent sk->sk_memcg.
> >
> > Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
> > Suggested-by: Michal Koutný <mkoutny@suse.com>
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> > ---
> > mm/memcontrol.c | 5 ++++-
> > net/mptcp/subflow.c | 11 +++--------
> > 2 files changed, 7 insertions(+), 9 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 8dd7fbed5a94..450862e7fd7a 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
> > if (!in_task())
> > return;
> >
> > + memcg = current->active_memcg;
> > +
>
> Use active_memcg() instead of current->active_memcg and do before the
> !in_task() check.
Why not reuse the !in_task() check here ?
We never use int_active_memcg for socket and also
know int_active_memcg is always NULL here.
>
> Basically something like following:
>
> memcg = active_memcg();
> /* Do not associate the sock with unrelated interrupted task's memcg. */
> if (!in_task() && !memcg)
> return;
On Thu, Aug 14, 2025 at 04:27:31PM -0700, Kuniyuki Iwashima wrote:
> On Thu, Aug 14, 2025 at 2:44 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Thu, Aug 14, 2025 at 08:08:33PM +0000, Kuniyuki Iwashima wrote:
> > > When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
> > > sk->sk_memcg based on the current task.
> > >
> > > MPTCP subflow socket creation is triggered from userspace or
> > > an in-kernel worker.
> > >
> > > In the latter case, sk->sk_memcg is not what we want. So, we fix
> > > it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
> > >
> > > Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
> > > under #ifdef CONFIG_SOCK_CGROUP_DATA.
> > >
> > > The two configs are orthogonal. If CONFIG_MEMCG is enabled without
> > > CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
> > > correctly.
> > >
> > > Let's wrap sock_create_kern() for subflow with set_active_memcg()
> > > using the parent sk->sk_memcg.
> > >
> > > Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
> > > Suggested-by: Michal Koutný <mkoutny@suse.com>
> > > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> > > ---
> > > mm/memcontrol.c | 5 ++++-
> > > net/mptcp/subflow.c | 11 +++--------
> > > 2 files changed, 7 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 8dd7fbed5a94..450862e7fd7a 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
> > > if (!in_task())
> > > return;
> > >
> > > + memcg = current->active_memcg;
> > > +
> >
> > Use active_memcg() instead of current->active_memcg and do before the
> > !in_task() check.
>
> Why not reuse the !in_task() check here ?
> We never use int_active_memcg for socket and also
> know int_active_memcg is always NULL here.
>
If we are making mem_cgroup_sk_alloc() work with set_active_memcg()
infra then make it work for both in_task() and !in_task() contexts.
On Thu, Aug 14, 2025 at 4:46 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Aug 14, 2025 at 04:27:31PM -0700, Kuniyuki Iwashima wrote:
> > On Thu, Aug 14, 2025 at 2:44 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Thu, Aug 14, 2025 at 08:08:33PM +0000, Kuniyuki Iwashima wrote:
> > > > When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
> > > > sk->sk_memcg based on the current task.
> > > >
> > > > MPTCP subflow socket creation is triggered from userspace or
> > > > an in-kernel worker.
> > > >
> > > > In the latter case, sk->sk_memcg is not what we want. So, we fix
> > > > it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
> > > >
> > > > Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
> > > > under #ifdef CONFIG_SOCK_CGROUP_DATA.
> > > >
> > > > The two configs are orthogonal. If CONFIG_MEMCG is enabled without
> > > > CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
> > > > correctly.
> > > >
> > > > Let's wrap sock_create_kern() for subflow with set_active_memcg()
> > > > using the parent sk->sk_memcg.
> > > >
> > > > Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
> > > > Suggested-by: Michal Koutný <mkoutny@suse.com>
> > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> > > > ---
> > > > mm/memcontrol.c | 5 ++++-
> > > > net/mptcp/subflow.c | 11 +++--------
> > > > 2 files changed, 7 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > index 8dd7fbed5a94..450862e7fd7a 100644
> > > > --- a/mm/memcontrol.c
> > > > +++ b/mm/memcontrol.c
> > > > @@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
> > > > if (!in_task())
> > > > return;
> > > >
> > > > + memcg = current->active_memcg;
> > > > +
> > >
> > > Use active_memcg() instead of current->active_memcg and do before the
> > > !in_task() check.
> >
> > Why not reuse the !in_task() check here ?
> > We never use int_active_memcg for socket and also
> > know int_active_memcg is always NULL here.
> >
>
> If we are making mem_cgroup_sk_alloc() work with set_active_memcg()
> infra then make it work for both in_task() and !in_task() contexts.
Considering e876ecc67db80, then I think we should add
set_active_memcg_in_task() and active_memcg_in_task().
or at least we need WARN_ON() if we want to place active_memcg()
before the in_task() check, but this looks ugly.
memcg = active_memcg();
if (!in_task() && !memcg)
return;
DEBUG_NET_WARN_ON_ONCE(!in_task() && memcg))
On Thu, Aug 14, 2025 at 05:05:56PM -0700, Kuniyuki Iwashima wrote:
> On Thu, Aug 14, 2025 at 4:46 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Thu, Aug 14, 2025 at 04:27:31PM -0700, Kuniyuki Iwashima wrote:
> > > On Thu, Aug 14, 2025 at 2:44 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > >
> > > > On Thu, Aug 14, 2025 at 08:08:33PM +0000, Kuniyuki Iwashima wrote:
> > > > > When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
> > > > > sk->sk_memcg based on the current task.
> > > > >
> > > > > MPTCP subflow socket creation is triggered from userspace or
> > > > > an in-kernel worker.
> > > > >
> > > > > In the latter case, sk->sk_memcg is not what we want. So, we fix
> > > > > it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
> > > > >
> > > > > Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
> > > > > under #ifdef CONFIG_SOCK_CGROUP_DATA.
> > > > >
> > > > > The two configs are orthogonal. If CONFIG_MEMCG is enabled without
> > > > > CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
> > > > > correctly.
> > > > >
> > > > > Let's wrap sock_create_kern() for subflow with set_active_memcg()
> > > > > using the parent sk->sk_memcg.
> > > > >
> > > > > Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
> > > > > Suggested-by: Michal Koutný <mkoutny@suse.com>
> > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> > > > > ---
> > > > > mm/memcontrol.c | 5 ++++-
> > > > > net/mptcp/subflow.c | 11 +++--------
> > > > > 2 files changed, 7 insertions(+), 9 deletions(-)
> > > > >
> > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > index 8dd7fbed5a94..450862e7fd7a 100644
> > > > > --- a/mm/memcontrol.c
> > > > > +++ b/mm/memcontrol.c
> > > > > @@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
> > > > > if (!in_task())
> > > > > return;
> > > > >
> > > > > + memcg = current->active_memcg;
> > > > > +
> > > >
> > > > Use active_memcg() instead of current->active_memcg and do before the
> > > > !in_task() check.
> > >
> > > Why not reuse the !in_task() check here ?
> > > We never use int_active_memcg for socket and also
> > > know int_active_memcg is always NULL here.
> > >
> >
> > If we are making mem_cgroup_sk_alloc() work with set_active_memcg()
> > infra then make it work for both in_task() and !in_task() contexts.
>
> Considering e876ecc67db80, then I think we should add
> set_active_memcg_in_task() and active_memcg_in_task().
>
> or at least we need WARN_ON() if we want to place active_memcg()
> before the in_task() check, but this looks ugly.
>
> memcg = active_memcg();
> if (!in_task() && !memcg)
> return;
> DEBUG_NET_WARN_ON_ONCE(!in_task() && memcg))
You don't have to use the code as is. It is just an example. Basically I
am asking if in future someone does the following:
// in !in_task() context
old_memcg = set_active_memcg(new_memcg);
sk = sk_alloc();
set_active_memcg(old_memcg);
mem_cgroup_sk_alloc() should work and associate the sk with the
new_memcg.
You can manually inline active_memcg() function to avoid multiple
in_task() checks like below:
void mem_cgroup_sk_alloc(struct sock *sk)
{
struct mem_cgroup *memcg;
if (!mem_cgroup_sockets_enabled)
return;
if (!in_task()) {
memcg = this_cpu_read(int_active_memcg);
/*
* Do not associate the sock with unrelated interrupted
* task's memcg.
*/
if (!memcg)
return;
} else {
memcg = current->active_memcg;
}
rcu_read_lock();
if (likely(!memcg))
memcg = mem_cgroup_from_task(current);
if (mem_cgroup_is_root(memcg))
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
goto out;
if (css_tryget(&memcg->css))
sk->sk_memcg = memcg;
out:
rcu_read_unlock();
}
On Thu, Aug 14, 2025 at 6:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Aug 14, 2025 at 05:05:56PM -0700, Kuniyuki Iwashima wrote:
> > On Thu, Aug 14, 2025 at 4:46 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Thu, Aug 14, 2025 at 04:27:31PM -0700, Kuniyuki Iwashima wrote:
> > > > On Thu, Aug 14, 2025 at 2:44 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > > >
> > > > > On Thu, Aug 14, 2025 at 08:08:33PM +0000, Kuniyuki Iwashima wrote:
> > > > > > When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
> > > > > > sk->sk_memcg based on the current task.
> > > > > >
> > > > > > MPTCP subflow socket creation is triggered from userspace or
> > > > > > an in-kernel worker.
> > > > > >
> > > > > > In the latter case, sk->sk_memcg is not what we want. So, we fix
> > > > > > it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
> > > > > >
> > > > > > Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
> > > > > > under #ifdef CONFIG_SOCK_CGROUP_DATA.
> > > > > >
> > > > > > The two configs are orthogonal. If CONFIG_MEMCG is enabled without
> > > > > > CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
> > > > > > correctly.
> > > > > >
> > > > > > Let's wrap sock_create_kern() for subflow with set_active_memcg()
> > > > > > using the parent sk->sk_memcg.
> > > > > >
> > > > > > Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
> > > > > > Suggested-by: Michal Koutný <mkoutny@suse.com>
> > > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> > > > > > ---
> > > > > > mm/memcontrol.c | 5 ++++-
> > > > > > net/mptcp/subflow.c | 11 +++--------
> > > > > > 2 files changed, 7 insertions(+), 9 deletions(-)
> > > > > >
> > > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > > index 8dd7fbed5a94..450862e7fd7a 100644
> > > > > > --- a/mm/memcontrol.c
> > > > > > +++ b/mm/memcontrol.c
> > > > > > @@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
> > > > > > if (!in_task())
> > > > > > return;
> > > > > >
> > > > > > + memcg = current->active_memcg;
> > > > > > +
> > > > >
> > > > > Use active_memcg() instead of current->active_memcg and do before the
> > > > > !in_task() check.
> > > >
> > > > Why not reuse the !in_task() check here ?
> > > > We never use int_active_memcg for socket and also
> > > > know int_active_memcg is always NULL here.
> > > >
> > >
> > > If we are making mem_cgroup_sk_alloc() work with set_active_memcg()
> > > infra then make it work for both in_task() and !in_task() contexts.
> >
> > Considering e876ecc67db80, then I think we should add
> > set_active_memcg_in_task() and active_memcg_in_task().
> >
> > or at least we need WARN_ON() if we want to place active_memcg()
> > before the in_task() check, but this looks ugly.
> >
> > memcg = active_memcg();
> > if (!in_task() && !memcg)
> > return;
> > DEBUG_NET_WARN_ON_ONCE(!in_task() && memcg))
>
> You don't have to use the code as is. It is just an example. Basically I
> am asking if in future someone does the following:
>
> // in !in_task() context
> old_memcg = set_active_memcg(new_memcg);
> sk = sk_alloc();
> set_active_memcg(old_memcg);
>
> mem_cgroup_sk_alloc() should work and associate the sk with the
> new_memcg.
>
> You can manually inline active_memcg() function to avoid multiple
> in_task() checks like below:
Will do so, thanks!
>
> void mem_cgroup_sk_alloc(struct sock *sk)
> {
> struct mem_cgroup *memcg;
>
> if (!mem_cgroup_sockets_enabled)
> return;
>
> if (!in_task()) {
> memcg = this_cpu_read(int_active_memcg);
>
> /*
> * Do not associate the sock with unrelated interrupted
> * task's memcg.
> */
> if (!memcg)
> return;
> } else {
> memcg = current->active_memcg;
> }
>
> rcu_read_lock();
> if (likely(!memcg))
> memcg = mem_cgroup_from_task(current);
> if (mem_cgroup_is_root(memcg))
> goto out;
> if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
> goto out;
> if (css_tryget(&memcg->css))
> sk->sk_memcg = memcg;
> out:
> rcu_read_unlock();
> }
>
On Thu, Aug 14, 2025 at 7:31 PM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>
> On Thu, Aug 14, 2025 at 6:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Thu, Aug 14, 2025 at 05:05:56PM -0700, Kuniyuki Iwashima wrote:
> > > On Thu, Aug 14, 2025 at 4:46 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > >
> > > > On Thu, Aug 14, 2025 at 04:27:31PM -0700, Kuniyuki Iwashima wrote:
> > > > > On Thu, Aug 14, 2025 at 2:44 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > > > >
> > > > > > On Thu, Aug 14, 2025 at 08:08:33PM +0000, Kuniyuki Iwashima wrote:
> > > > > > > When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
> > > > > > > sk->sk_memcg based on the current task.
> > > > > > >
> > > > > > > MPTCP subflow socket creation is triggered from userspace or
> > > > > > > an in-kernel worker.
> > > > > > >
> > > > > > > In the latter case, sk->sk_memcg is not what we want. So, we fix
> > > > > > > it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
> > > > > > >
> > > > > > > Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
> > > > > > > under #ifdef CONFIG_SOCK_CGROUP_DATA.
> > > > > > >
> > > > > > > The two configs are orthogonal. If CONFIG_MEMCG is enabled without
> > > > > > > CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
> > > > > > > correctly.
> > > > > > >
> > > > > > > Let's wrap sock_create_kern() for subflow with set_active_memcg()
> > > > > > > using the parent sk->sk_memcg.
> > > > > > >
> > > > > > > Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
> > > > > > > Suggested-by: Michal Koutný <mkoutny@suse.com>
> > > > > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> > > > > > > ---
> > > > > > > mm/memcontrol.c | 5 ++++-
> > > > > > > net/mptcp/subflow.c | 11 +++--------
> > > > > > > 2 files changed, 7 insertions(+), 9 deletions(-)
> > > > > > >
> > > > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > > > > index 8dd7fbed5a94..450862e7fd7a 100644
> > > > > > > --- a/mm/memcontrol.c
> > > > > > > +++ b/mm/memcontrol.c
> > > > > > > @@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
> > > > > > > if (!in_task())
> > > > > > > return;
> > > > > > >
> > > > > > > + memcg = current->active_memcg;
> > > > > > > +
> > > > > >
> > > > > > Use active_memcg() instead of current->active_memcg and do before the
> > > > > > !in_task() check.
> > > > >
> > > > > Why not reuse the !in_task() check here ?
> > > > > We never use int_active_memcg for socket and also
> > > > > know int_active_memcg is always NULL here.
> > > > >
> > > >
> > > > If we are making mem_cgroup_sk_alloc() work with set_active_memcg()
> > > > infra then make it work for both in_task() and !in_task() contexts.
> > >
> > > Considering e876ecc67db80, then I think we should add
> > > set_active_memcg_in_task() and active_memcg_in_task().
> > >
> > > or at least we need WARN_ON() if we want to place active_memcg()
> > > before the in_task() check, but this looks ugly.
> > >
> > > memcg = active_memcg();
> > > if (!in_task() && !memcg)
> > > return;
> > > DEBUG_NET_WARN_ON_ONCE(!in_task() && memcg))
> >
> > You don't have to use the code as is. It is just an example. Basically I
> > am asking if in future someone does the following:
> >
> > // in !in_task() context
> > old_memcg = set_active_memcg(new_memcg);
> > sk = sk_alloc();
> > set_active_memcg(old_memcg);
> >
> > mem_cgroup_sk_alloc() should work and associate the sk with the
> > new_memcg.
> >
> > You can manually inline active_memcg() function to avoid multiple
> > in_task() checks like below:
>
> Will do so, thanks!
I noticed this won't work with the bpf approach as the
hook is only called for !sk_kern socket (MPTCP subflow
is sk_kern == 1) and we need to manually copy the
memcg anyway.. so I'll use the original patch 1 in the
next version.
> >
> > void mem_cgroup_sk_alloc(struct sock *sk)
> > {
> > struct mem_cgroup *memcg;
> >
> > if (!mem_cgroup_sockets_enabled)
> > return;
> >
> > if (!in_task()) {
> > memcg = this_cpu_read(int_active_memcg);
> >
> > /*
> > * Do not associate the sock with unrelated interrupted
> > * task's memcg.
> > */
> > if (!memcg)
> > return;
> > } else {
> > memcg = current->active_memcg;
> > }
> >
> > rcu_read_lock();
> > if (likely(!memcg))
> > memcg = mem_cgroup_from_task(current);
> > if (mem_cgroup_is_root(memcg))
> > goto out;
> > if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg1_tcpmem_active(memcg))
> > goto out;
> > if (css_tryget(&memcg->css))
> > sk->sk_memcg = memcg;
> > out:
> > rcu_read_unlock();
> > }
> >
On Fri, Aug 15, 2025 at 10:24:12AM -0700, Kuniyuki Iwashima <kuniyu@google.com> wrote: > and we need to manually copy the memcg anyway.. (Is that in a later patch?) But fair enough :-/ Michal
Hi Kuniyuki,
On 15/08/2025 19:24, Kuniyuki Iwashima wrote:
> On Thu, Aug 14, 2025 at 7:31 PM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>>
>> On Thu, Aug 14, 2025 at 6:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>>>
>>> On Thu, Aug 14, 2025 at 05:05:56PM -0700, Kuniyuki Iwashima wrote:
>>>> On Thu, Aug 14, 2025 at 4:46 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>>>>>
>>>>> On Thu, Aug 14, 2025 at 04:27:31PM -0700, Kuniyuki Iwashima wrote:
>>>>>> On Thu, Aug 14, 2025 at 2:44 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>>>>>>>
>>>>>>> On Thu, Aug 14, 2025 at 08:08:33PM +0000, Kuniyuki Iwashima wrote:
>>>>>>>> When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
>>>>>>>> sk->sk_memcg based on the current task.
>>>>>>>>
>>>>>>>> MPTCP subflow socket creation is triggered from userspace or
>>>>>>>> an in-kernel worker.
>>>>>>>>
>>>>>>>> In the latter case, sk->sk_memcg is not what we want. So, we fix
>>>>>>>> it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup().
>>>>>>>>
>>>>>>>> Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
>>>>>>>> under #ifdef CONFIG_SOCK_CGROUP_DATA.
>>>>>>>>
>>>>>>>> The two configs are orthogonal. If CONFIG_MEMCG is enabled without
>>>>>>>> CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged
>>>>>>>> correctly.
>>>>>>>>
>>>>>>>> Let's wrap sock_create_kern() for subflow with set_active_memcg()
>>>>>>>> using the parent sk->sk_memcg.
>>>>>>>>
>>>>>>>> Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
>>>>>>>> Suggested-by: Michal Koutný <mkoutny@suse.com>
>>>>>>>> Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
>>>>>>>> ---
>>>>>>>> mm/memcontrol.c | 5 ++++-
>>>>>>>> net/mptcp/subflow.c | 11 +++--------
>>>>>>>> 2 files changed, 7 insertions(+), 9 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>>>>>>> index 8dd7fbed5a94..450862e7fd7a 100644
>>>>>>>> --- a/mm/memcontrol.c
>>>>>>>> +++ b/mm/memcontrol.c
>>>>>>>> @@ -5006,8 +5006,11 @@ void mem_cgroup_sk_alloc(struct sock *sk)
>>>>>>>> if (!in_task())
>>>>>>>> return;
>>>>>>>>
>>>>>>>> + memcg = current->active_memcg;
>>>>>>>> +
>>>>>>>
>>>>>>> Use active_memcg() instead of current->active_memcg and do before the
>>>>>>> !in_task() check.
>>>>>>
>>>>>> Why not reuse the !in_task() check here ?
>>>>>> We never use int_active_memcg for socket and also
>>>>>> know int_active_memcg is always NULL here.
>>>>>>
>>>>>
>>>>> If we are making mem_cgroup_sk_alloc() work with set_active_memcg()
>>>>> infra then make it work for both in_task() and !in_task() contexts.
>>>>
>>>> Considering e876ecc67db80, then I think we should add
>>>> set_active_memcg_in_task() and active_memcg_in_task().
>>>>
>>>> or at least we need WARN_ON() if we want to place active_memcg()
>>>> before the in_task() check, but this looks ugly.
>>>>
>>>> memcg = active_memcg();
>>>> if (!in_task() && !memcg)
>>>> return;
>>>> DEBUG_NET_WARN_ON_ONCE(!in_task() && memcg))
>>>
>>> You don't have to use the code as is. It is just an example. Basically I
>>> am asking if in future someone does the following:
>>>
>>> // in !in_task() context
>>> old_memcg = set_active_memcg(new_memcg);
>>> sk = sk_alloc();
>>> set_active_memcg(old_memcg);
>>>
>>> mem_cgroup_sk_alloc() should work and associate the sk with the
>>> new_memcg.
>>>
>>> You can manually inline active_memcg() function to avoid multiple
>>> in_task() checks like below:
>>
>> Will do so, thanks!
>
> I noticed this won't work with the bpf approach as the
> hook is only called for !sk_kern socket (MPTCP subflow
> is sk_kern == 1) and we need to manually copy the
> memcg anyway.. so I'll use the original patch 1 in the
> next version.
Thank you for having checked that!
Cheers,
Matt
--
Sponsored by the NGI0 Core fund.
© 2016 - 2026 Red Hat, Inc.