Generally memcg charging is allowed from all the contexts including NMI
where even spinning on spinlock can cause locking issues. However one
call chain was missed during the addition of memcg charging from any
context support. That is try_charge_memcg() -> memcg_memory_event() ->
cgroup_file_notify().
The possible function call tree under cgroup_file_notify() can acquire
many different spin locks in spinning mode. Some of them are
cgroup_file_kn_lock, kernfs_notify_lock, and pool_workqueue's lock. So, let's
just skip cgroup_file_notify() from memcg charging if the context does
not allow spinning.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
include/linux/memcontrol.h | 23 ++++++++++++++++-------
mm/memcontrol.c | 7 ++++---
2 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9dc5b52672a6..054fa34c936a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -993,22 +993,25 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
count_memcg_events_mm(mm, idx, 1);
}
-static inline void memcg_memory_event(struct mem_cgroup *memcg,
- enum memcg_memory_event event)
+static inline void __memcg_memory_event(struct mem_cgroup *memcg,
+ enum memcg_memory_event event,
+ bool allow_spinning)
{
bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
event == MEMCG_SWAP_FAIL;
atomic_long_inc(&memcg->memory_events_local[event]);
- if (!swap_event)
+ if (!swap_event && allow_spinning)
cgroup_file_notify(&memcg->events_local_file);
do {
atomic_long_inc(&memcg->memory_events[event]);
- if (swap_event)
- cgroup_file_notify(&memcg->swap_events_file);
- else
- cgroup_file_notify(&memcg->events_file);
+ if (allow_spinning) {
+ if (swap_event)
+ cgroup_file_notify(&memcg->swap_events_file);
+ else
+ cgroup_file_notify(&memcg->events_file);
+ }
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
break;
@@ -1018,6 +1021,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
!mem_cgroup_is_root(memcg));
}
+static inline void memcg_memory_event(struct mem_cgroup *memcg,
+ enum memcg_memory_event event)
+{
+ __memcg_memory_event(memcg, event, true);
+}
+
static inline void memcg_memory_event_mm(struct mm_struct *mm,
enum memcg_memory_event event)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 257d2c76b730..dd5cd9d352f3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2306,12 +2306,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
+ bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
retry:
if (consume_stock(memcg, nr_pages))
return 0;
- if (!gfpflags_allow_spinning(gfp_mask))
+ if (!allow_spinning)
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
@@ -2347,7 +2348,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
- memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
raised_max_event = true;
psi_memstall_enter(&pflags);
@@ -2414,7 +2415,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* a MEMCG_MAX event.
*/
if (!raised_max_event)
- memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
/*
* The allocation either can't fail or will lead to more memory
--
2.47.3
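For context when reading the diff above: everything keys off gfpflags_allow_spinning().
A rough sketch of that helper follows; the body and comment are an approximation of the
in-tree definition, not a verbatim copy, so treat the details as an assumption:

/*
 * Approximation of gfpflags_allow_spinning() from include/linux/gfp.h:
 * only "any context" allocations (try_alloc_pages()-style) clear both
 * reclaim bits, and those callers must only ever trylock, so a cleared
 * __GFP_RECLAIM mask means spinning on locks is not allowed.
 */
static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
{
	return !!(gfp_flags & __GFP_RECLAIM);
}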
On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> Generally memcg charging is allowed from all the contexts including NMI
> where even spinning on spinlock can cause locking issues. However one
> call chain was missed during the addition of memcg charging from any
> context support. That is try_charge_memcg() -> memcg_memory_event() ->
> cgroup_file_notify().
>
> The possible function call tree under cgroup_file_notify() can acquire
> many different spin locks in spinning mode. Some of them are
> cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> just skip cgroup_file_notify() from memcg charging if the context does
> not allow spinning.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Here I am just pasting the irq_work based prototype, which is only
build-tested for now; I am sharing it early to show how it looks.
Overall I think it adds too much complexity which is not worth it. We
have to add a per-cpu irq_work and then, for each memcg, a per-cpu
lockless node to queue the deferred event update. Also, more reasoning
is needed to make sure the updates are not missed by the deferred work.

Anyway, this is the early prototype. Unless there are comments on how
to make it better, I will ask Andrew to just pick the previous patch I
sent.
From d58d772f306454f0dffa94bfb32195496c450892 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Thu, 18 Sep 2025 19:25:37 -0700
Subject: [PATCH] memcg: add support for deferred max memcg event
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
include/linux/memcontrol.h | 3 ++
mm/memcontrol.c | 85 ++++++++++++++++++++++++++++++++++++--
2 files changed, 84 insertions(+), 4 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 16fe0306e50e..3f803957e05d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -69,6 +69,7 @@ struct mem_cgroup_id {
refcount_t ref;
};
+struct deferred_events_percpu;
struct memcg_vmstats_percpu;
struct memcg1_events_percpu;
struct memcg_vmstats;
@@ -268,6 +269,8 @@ struct mem_cgroup {
struct memcg_vmstats_percpu __percpu *vmstats_percpu;
+ struct deferred_events_percpu __percpu *deferred_events;
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
struct wb_domain cgwb_domain;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e090f29eb03b..a34cb728c5c6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -132,6 +132,63 @@ bool mem_cgroup_kmem_disabled(void)
return cgroup_memory_nokmem;
}
+struct deferred_events_percpu {
+ atomic_t max_events;
+ struct mem_cgroup *memcg_owner;
+ struct llist_node lnode;
+};
+
+struct defer_memcg_events {
+ struct llist_head memcg_llist;
+ struct irq_work work;
+};
+
+static void process_deferred_events(struct irq_work *work)
+{
+ struct defer_memcg_events *events = container_of(work,
+ struct defer_memcg_events, work);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(&events->memcg_llist))) {
+ int i, num;
+ struct deferred_events_percpu *eventsc;
+
+ eventsc = container_of(lnode, struct deferred_events_percpu, lnode);
+
+ if (!atomic_read(&eventsc->max_events))
+ continue;
+ num = atomic_xchg(&eventsc->max_events, 0);
+ if (!num)
+ continue;
+ for (i = 0; i < num; i++)
+ memcg_memory_event(eventsc->memcg_owner, MEMCG_MAX);
+ }
+}
+
+static DEFINE_PER_CPU(struct defer_memcg_events, postpone_events) = {
+ .memcg_llist = LLIST_HEAD_INIT(memcg_llist),
+ .work = IRQ_WORK_INIT(process_deferred_events),
+};
+
+static void memcg_memory_max_event_queue(struct mem_cgroup *memcg)
+{
+ int cpu;
+ struct defer_memcg_events *devents;
+ struct deferred_events_percpu *dmemcg_events;
+
+ cpu = get_cpu();
+ devents = per_cpu_ptr(&postpone_events, cpu);
+ dmemcg_events = per_cpu_ptr(memcg->deferred_events, cpu);
+
+ atomic_inc(&dmemcg_events->max_events);
+ // barrier here to make sure that if following llist_add returns false,
+ // the corresponding llist_del_first_init will see our increment.
+ if (llist_add(&dmemcg_events->lnode, &devents->memcg_llist))
+ irq_work_queue(&devents->work);
+
+ put_cpu();
+}
+
static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
static void obj_cgroup_release(struct percpu_ref *ref)
@@ -2307,12 +2364,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
+ bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
retry:
if (consume_stock(memcg, nr_pages))
return 0;
- if (!gfpflags_allow_spinning(gfp_mask))
+ if (!allow_spinning)
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
@@ -2348,7 +2406,10 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
- memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ if (allow_spinning)
+ memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ else
+ memcg_memory_max_event_queue(mem_over_limit);
raised_max_event = true;
psi_memstall_enter(&pflags);
@@ -2414,8 +2475,12 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* If the allocation has to be enforced, don't forget to raise
* a MEMCG_MAX event.
*/
- if (!raised_max_event)
- memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ if (!raised_max_event) {
+ if (allow_spinning)
+ memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ else
+ memcg_memory_max_event_queue(mem_over_limit);
+ }
/*
* The allocation either can't fail or will lead to more memory
@@ -3689,6 +3754,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
memcg1_free_events(memcg);
kfree(memcg->vmstats);
+ free_percpu(memcg->deferred_events);
free_percpu(memcg->vmstats_percpu);
kfree(memcg);
}
@@ -3704,6 +3770,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
struct memcg_vmstats_percpu *statc;
struct memcg_vmstats_percpu __percpu *pstatc_pcpu;
+ struct deferred_events_percpu *devents;
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
@@ -3729,6 +3796,11 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg->vmstats_percpu)
goto fail;
+ memcg->deferred_events = alloc_percpu_gfp(struct deferred_events_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!memcg->deferred_events)
+ goto fail;
+
if (!memcg1_alloc_events(memcg))
goto fail;
@@ -3738,6 +3810,11 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
statc->parent_pcpu = parent ? pstatc_pcpu : NULL;
statc->vmstats = memcg->vmstats;
+
+ devents = per_cpu_ptr(memcg->deferred_events, cpu);
+ atomic_set(&devents->max_events, 0);
+ devents->memcg_owner = memcg;
+ init_llist_node(&devents->lnode);
}
for_each_node(node)
--
2.47.3
On Thu, Sep 18, 2025 at 7:49 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> >
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>
> Here I am just pasting the irq_work based prototype which is build
> tested only for now and sharing it early to show how it looks. Overall I
> think it is adding too much complexity which is not worth it. We have to
> add per-cpu irq_work and then for each memcg we have to add per-cpu
> lockless node to queue the deferred event update. Also more reasoning is
> needed to make sure the updates are not missed by the deferred work.
>
> Anyways, this is the early prototype. Unless there are comments on how
> to make it better, I will ask Andrew to just pick the previous patch I
> sent.
>
>
> From d58d772f306454f0dffa94bfb32195496c450892 Mon Sep 17 00:00:00 2001
> From: Shakeel Butt <shakeel.butt@linux.dev>
> Date: Thu, 18 Sep 2025 19:25:37 -0700
> Subject: [PATCH] memcg: add support for deferred max memcg event
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
> include/linux/memcontrol.h | 3 ++
> mm/memcontrol.c | 85 ++++++++++++++++++++++++++++++++++++--
> 2 files changed, 84 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 16fe0306e50e..3f803957e05d 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -69,6 +69,7 @@ struct mem_cgroup_id {
> refcount_t ref;
> };
>
> +struct deferred_events_percpu;
> struct memcg_vmstats_percpu;
> struct memcg1_events_percpu;
> struct memcg_vmstats;
> @@ -268,6 +269,8 @@ struct mem_cgroup {
>
> struct memcg_vmstats_percpu __percpu *vmstats_percpu;
>
> + struct deferred_events_percpu __percpu *deferred_events;
> +
> #ifdef CONFIG_CGROUP_WRITEBACK
> struct list_head cgwb_list;
> struct wb_domain cgwb_domain;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e090f29eb03b..a34cb728c5c6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -132,6 +132,63 @@ bool mem_cgroup_kmem_disabled(void)
> return cgroup_memory_nokmem;
> }
>
> +struct deferred_events_percpu {
> + atomic_t max_events;
> + struct mem_cgroup *memcg_owner;
> + struct llist_node lnode;
> +};
> +
> +struct defer_memcg_events {
> + struct llist_head memcg_llist;
> + struct irq_work work;
> +};
> +
> +static void process_deferred_events(struct irq_work *work)
> +{
> + struct defer_memcg_events *events = container_of(work,
> + struct defer_memcg_events, work);
> + struct llist_node *lnode;
> +
> + while (lnode = llist_del_first_init(&events->memcg_llist)) {
> + int i, num;
> + struct deferred_events_percpu *eventsc;
> +
> + eventsc = container_of(lnode, struct deferred_events_percpu, lnode);
> +
> + if (!atomic_read(&eventsc->max_events))
> + continue;
> + num = atomic_xchg(&eventsc->max_events, 0);
> + if (!num)
> + continue;
> + for (i = 0; i < num; i++)
> + memcg_memory_event(eventsc->memcg_owner, MEMCG_MAX);
> + }
> +}
> +
> +static DEFINE_PER_CPU(struct defer_memcg_events, postpone_events) = {
> + .memcg_llist = LLIST_HEAD_INIT(memcg_llist),
> + .work = IRQ_WORK_INIT(process_deferred_events),
> +};
Why all this per-cpu stuff? I don't understand what it helps with.
To have max_events per-cpu?
Just one global llist and irq_work will do just fine,
and a global max_events too.

In some previous thread there was a question about the atomicity
of atomic_long. It's a normal 32-bit atomic on 32-bit archs.
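For illustration only, a sketch of the single global llist + irq_work variant suggested
above. The two struct mem_cgroup fields (deferred_max_events, deferred_max_node) and the
function names are hypothetical, the node is assumed to be initialized with
init_llist_node() at memcg allocation time (like the per-cpu nodes in the prototype), and
the ordering subtleties mentioned in the prototype's cover text are glossed over:

static LLIST_HEAD(deferred_memcg_llist);

static void deferred_memcg_event_fn(struct irq_work *work)
{
	struct llist_node *node, *next;

	llist_for_each_safe(node, next, llist_del_all(&deferred_memcg_llist)) {
		struct mem_cgroup *memcg = llist_entry(node, struct mem_cgroup,
						       deferred_max_node);
		int num;

		init_llist_node(node);	/* mark the node as re-queueable */
		num = atomic_xchg(&memcg->deferred_max_events, 0);
		while (num--)
			memcg_memory_event(memcg, MEMCG_MAX);
	}
}

static DEFINE_IRQ_WORK(deferred_memcg_event_work, deferred_memcg_event_fn);

static void memcg_memory_max_event_queue(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg->deferred_max_events);
	/* Queue the memcg at most once until the irq_work drains it. */
	if (!llist_on_list(&memcg->deferred_max_node) &&
	    llist_add(&memcg->deferred_max_node, &deferred_memcg_llist))
		irq_work_queue(&deferred_memcg_event_work);
}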
On Fri 05-09-25 13:16:06, Shakeel Butt wrote:
> Generally memcg charging is allowed from all the contexts including NMI
> where even spinning on spinlock can cause locking issues. However one
> call chain was missed during the addition of memcg charging from any
> context support. That is try_charge_memcg() -> memcg_memory_event() ->
> cgroup_file_notify().
>
> The possible function call tree under cgroup_file_notify() can acquire
> many different spin locks in spinning mode. Some of them are
> cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> just skip cgroup_file_notify() from memcg charging if the context does
> not allow spinning.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
> ---
> include/linux/memcontrol.h | 23 ++++++++++++++++-------
> mm/memcontrol.c | 7 ++++---
> 2 files changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9dc5b52672a6..054fa34c936a 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -993,22 +993,25 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
> count_memcg_events_mm(mm, idx, 1);
> }
>
> -static inline void memcg_memory_event(struct mem_cgroup *memcg,
> - enum memcg_memory_event event)
> +static inline void __memcg_memory_event(struct mem_cgroup *memcg,
> + enum memcg_memory_event event,
> + bool allow_spinning)
> {
> bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
> event == MEMCG_SWAP_FAIL;
>
> atomic_long_inc(&memcg->memory_events_local[event]);
Doesn't this involve locking on 32b? I guess we do not care all that
much but we might want to bail out early on those arches for
!allow_spinning
> - if (!swap_event)
> + if (!swap_event && allow_spinning)
> cgroup_file_notify(&memcg->events_local_file);
>
> do {
> atomic_long_inc(&memcg->memory_events[event]);
> - if (swap_event)
> - cgroup_file_notify(&memcg->swap_events_file);
> - else
> - cgroup_file_notify(&memcg->events_file);
> + if (allow_spinning) {
> + if (swap_event)
> + cgroup_file_notify(&memcg->swap_events_file);
> + else
> + cgroup_file_notify(&memcg->events_file);
> + }
>
> if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
> break;
> @@ -1018,6 +1021,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
> !mem_cgroup_is_root(memcg));
> }
>
> +static inline void memcg_memory_event(struct mem_cgroup *memcg,
> + enum memcg_memory_event event)
> +{
> + __memcg_memory_event(memcg, event, true);
> +}
> +
> static inline void memcg_memory_event_mm(struct mm_struct *mm,
> enum memcg_memory_event event)
> {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 257d2c76b730..dd5cd9d352f3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2306,12 +2306,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> bool drained = false;
> bool raised_max_event = false;
> unsigned long pflags;
> + bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
>
> retry:
> if (consume_stock(memcg, nr_pages))
> return 0;
>
> - if (!gfpflags_allow_spinning(gfp_mask))
> + if (!allow_spinning)
> /* Avoid the refill and flush of the older stock */
> batch = nr_pages;
>
> @@ -2347,7 +2348,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> if (!gfpflags_allow_blocking(gfp_mask))
> goto nomem;
>
> - memcg_memory_event(mem_over_limit, MEMCG_MAX);
> + __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
> raised_max_event = true;
>
> psi_memstall_enter(&pflags);
> @@ -2414,7 +2415,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> * a MEMCG_MAX event.
> */
> if (!raised_max_event)
> - memcg_memory_event(mem_over_limit, MEMCG_MAX);
> + __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
>
> /*
> * The allocation either can't fail or will lead to more memory
> --
> 2.47.3
>
--
Michal Hocko
SUSE Labs
On Mon, Sep 08, 2025 at 11:28:20AM +0200, Michal Hocko wrote:
> On Fri 05-09-25 13:16:06, Shakeel Butt wrote:
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> >
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>
> Acked-by: Michal Hocko <mhocko@suse.com>
Thanks.
>
> > ---
> > include/linux/memcontrol.h | 23 ++++++++++++++++-------
> > mm/memcontrol.c | 7 ++++---
> > 2 files changed, 20 insertions(+), 10 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 9dc5b52672a6..054fa34c936a 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -993,22 +993,25 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
> > count_memcg_events_mm(mm, idx, 1);
> > }
> >
> > -static inline void memcg_memory_event(struct mem_cgroup *memcg,
> > - enum memcg_memory_event event)
> > +static inline void __memcg_memory_event(struct mem_cgroup *memcg,
> > + enum memcg_memory_event event,
> > + bool allow_spinning)
> > {
> > bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
> > event == MEMCG_SWAP_FAIL;
> >
> > atomic_long_inc(&memcg->memory_events_local[event]);
>
> Doesn't this involve locking on 32b? I guess we do not care all that
> much but we might want to bail out early on those arches for
> !allow_spinning
>
I am prototyping an irq_work based approach and if that looks good then we
might not need to worry about 32b at all.
Shakeel Butt <shakeel.butt@linux.dev> writes:

> Generally memcg charging is allowed from all the contexts including NMI
> where even spinning on spinlock can cause locking issues. However one
> call chain was missed during the addition of memcg charging from any
> context support. That is try_charge_memcg() -> memcg_memory_event() ->
> cgroup_file_notify().
>
> The possible function call tree under cgroup_file_notify() can acquire
> many different spin locks in spinning mode. Some of them are
> cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> just skip cgroup_file_notify() from memcg charging if the context does
> not allow spinning.

Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
like a big deal, but OOM events can be way more important.

Should we instead preserve the event (e.g. as a pending_event_mask) and
raise it on the next occasion / from a different context?

Thanks
On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
>
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> >
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
>
> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> like a bit deal, but OOM events can be way more important.
>
> Should we instead preserve the event (e.g. as a pending_event_mask) and
> raise it on the next occasion / from a different context?
>

Thanks for the review. For now only MAX can happen in non-spinning
context. All others only happen in process context. Maybe with BPF OOM,
OOM might be possible in a different context (is that what you are
thinking?). I think we can add the complexity of preserving the event
when the actual need arises.
Shakeel Butt <shakeel.butt@linux.dev> writes:

> On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
>> Shakeel Butt <shakeel.butt@linux.dev> writes:
>>
>> > Generally memcg charging is allowed from all the contexts including NMI
>> > where even spinning on spinlock can cause locking issues. However one
>> > call chain was missed during the addition of memcg charging from any
>> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
>> > cgroup_file_notify().
>> >
>> > The possible function call tree under cgroup_file_notify() can acquire
>> > many different spin locks in spinning mode. Some of them are
>> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
>> > just skip cgroup_file_notify() from memcg charging if the context does
>> > not allow spinning.
>>
>> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
>> like a bit deal, but OOM events can be way more important.
>>
>> Should we instead preserve the event (e.g. as a pending_event_mask) and
>> raise it on the next occasion / from a different context?
>>
>
> Thanks for the review. For now only MAX can happen in non-spinning
> context. All others only happen in process context. Maybe with BPF OOM,
> OOM might be possible in a different context (is that what you are
> thinking?). I think we can add the complexity of preserving the event
> when the actual need arise.

No, I haven't thought about any particular use case, just a bit
worried about silently dropping some events. It might be not an issue
now, but might be easy to miss a moment when it becomes a problem.

So in my opinion using some delayed delivery mechanism is better
than just dropping these events.
On Fri, Sep 05, 2025 at 02:42:01PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
>
> > On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> >> Shakeel Butt <shakeel.butt@linux.dev> writes:
> >>
> >> > Generally memcg charging is allowed from all the contexts including NMI
> >> > where even spinning on spinlock can cause locking issues. However one
> >> > call chain was missed during the addition of memcg charging from any
> >> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> >> > cgroup_file_notify().
> >> >
> >> > The possible function call tree under cgroup_file_notify() can acquire
> >> > many different spin locks in spinning mode. Some of them are
> >> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> >> > just skip cgroup_file_notify() from memcg charging if the context does
> >> > not allow spinning.
> >>
> >> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> >> like a bit deal, but OOM events can be way more important.
> >>
> >> Should we instead preserve the event (e.g. as a pending_event_mask) and
> >> raise it on the next occasion / from a different context?
> >>
> >
> > Thanks for the review. For now only MAX can happen in non-spinning
> > context. All others only happen in process context. Maybe with BPF OOM,
> > OOM might be possible in a different context (is that what you are
> > thinking?). I think we can add the complexity of preserving the event
> > when the actual need arise.
>
> No, I haven't thought about any particular use case, just a bit
> worried about silently dropping some events. It might be not an issue
> now, but might be easy to miss a moment when it becomes a problem.
>

Only the notification can be dropped and not the event (i.e. we are
still incrementing the counters). Also for MAX only but I got your
point.

> So in my opinion using some delayed delivery mechanism is better
> than just dropping these events.

Let me see how doing this irq_work looks like and will update here.
Shakeel Butt <shakeel.butt@linux.dev> writes:

> On Fri, Sep 05, 2025 at 02:42:01PM -0700, Roman Gushchin wrote:
>> Shakeel Butt <shakeel.butt@linux.dev> writes:
>>
>> > On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
>> >> Shakeel Butt <shakeel.butt@linux.dev> writes:
>> >>
>> >> > Generally memcg charging is allowed from all the contexts including NMI
>> >> > where even spinning on spinlock can cause locking issues. However one
>> >> > call chain was missed during the addition of memcg charging from any
>> >> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
>> >> > cgroup_file_notify().
>> >> >
>> >> > The possible function call tree under cgroup_file_notify() can acquire
>> >> > many different spin locks in spinning mode. Some of them are
>> >> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
>> >> > just skip cgroup_file_notify() from memcg charging if the context does
>> >> > not allow spinning.
>> >>
>> >> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
>> >> like a bit deal, but OOM events can be way more important.
>> >>
>> >> Should we instead preserve the event (e.g. as a pending_event_mask) and
>> >> raise it on the next occasion / from a different context?
>> >>
>> >
>> > Thanks for the review. For now only MAX can happen in non-spinning
>> > context. All others only happen in process context. Maybe with BPF OOM,
>> > OOM might be possible in a different context (is that what you are
>> > thinking?). I think we can add the complexity of preserving the event
>> > when the actual need arise.
>>
>> No, I haven't thought about any particular use case, just a bit
>> worried about silently dropping some events. It might be not an issue
>> now, but might be easy to miss a moment when it becomes a problem.
>>
>
> Only the notification can be dropped and not the event (i.e. we are
> still incrementing the counters). Also for MAX only but I got your
> point.
>
>> So in my opinion using some delayed delivery mechanism is better
>> than just dropping these events.
>
> Let me see how doing this irq_work looks like and will update here.

Thanks!

If it won't work out for some reason, maybe at least explicitly narrow
it down to the MEMCG_MAX events.
On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
>
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> >
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
>
> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> like a bit deal, but OOM events can be way more important.
>
> Should we instead preserve the event (e.g. as a pending_event_mask) and
> raise it on the next occasion / from a different context?

Maybe punt with an irq_work?

Thanks.

-- 
tejun
On Fri, Sep 05, 2025 at 11:25:34AM -1000, Tejun Heo wrote:
> On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> > Shakeel Butt <shakeel.butt@linux.dev> writes:
> >
> > > Generally memcg charging is allowed from all the contexts including NMI
> > > where even spinning on spinlock can cause locking issues. However one
> > > call chain was missed during the addition of memcg charging from any
> > > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > > cgroup_file_notify().
> > >
> > > The possible function call tree under cgroup_file_notify() can acquire
> > > many different spin locks in spinning mode. Some of them are
> > > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > > just skip cgroup_file_notify() from memcg charging if the context does
> > > not allow spinning.
> >
> > Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> > like a bit deal, but OOM events can be way more important.
> >
> > Should we instead preserve the event (e.g. as a pending_event_mask) and
> > raise it on the next occasion / from a different context?
>
> Maybe punt with an irq_work?

Oh good suggestion, I will check irq_work as well. Though I think that
can be a followup work.
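To make the pending_event_mask idea floated above concrete, a purely hypothetical
sketch follows; the pending_event_mask field and both helpers are invented for
illustration and exist neither upstream nor in the posted patches:

/*
 * Hypothetical deferred notification via a pending mask. Only MEMCG_MAX
 * is handled here because, per the discussion above, it is the only
 * event that can currently fire from a non-spinning context.
 */
static inline void memcg_note_pending_event(struct mem_cgroup *memcg,
					    enum memcg_memory_event event)
{
	/* Record the event; safe from any context, no locks taken. */
	set_bit(event, &memcg->pending_event_mask);
}

static inline void memcg_flush_pending_events(struct mem_cgroup *memcg)
{
	/* Called later from a context that is allowed to spin. */
	if (test_and_clear_bit(MEMCG_MAX, &memcg->pending_event_mask))
		cgroup_file_notify(&memcg->events_file);
}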
On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> Generally memcg charging is allowed from all the contexts including NMI
> where even spinning on spinlock can cause locking issues. However one
> call chain was missed during the addition of memcg charging from any
> context support. That is try_charge_memcg() -> memcg_memory_event() ->
> cgroup_file_notify().
>
> The possible function call tree under cgroup_file_notify() can acquire
> many different spin locks in spinning mode. Some of them are
> cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> just skip cgroup_file_notify() from memcg charging if the context does
> not allow spinning.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Tested-by: Peilin Ye <yepeilin@google.com>
The repro described in [1] no longer triggers locking issues after
applying this patch and making __bpf_async_init() use __GFP_HIGH
instead of GFP_ATOMIC:
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
}
/* allocate hrtimer via map_kmalloc to use memcg accounting */
- cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+ cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
[1] https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/#t
Thanks,
Peilin Ye
On Fri 05-09-25 20:48:46, Peilin Ye wrote:
> On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> >
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>
> Tested-by: Peilin Ye <yepeilin@google.com>
>
> The repro described in [1] no longer triggers locking issues after
> applying this patch and making __bpf_async_init() use __GFP_HIGH
> instead of GFP_ATOMIC:
>
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> }
>
> /* allocate hrtimer via map_kmalloc to use memcg accounting */
> - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
used instead here?
> if (!cb) {
> ret = -ENOMEM;
> goto out;
>
> [1] https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/#t
>
> Thanks,
> Peilin Ye
--
Michal Hocko
SUSE Labs
On Mon, Sep 8, 2025 at 2:08 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Fri 05-09-25 20:48:46, Peilin Ye wrote:
> > On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > > Generally memcg charging is allowed from all the contexts including NMI
> > > where even spinning on spinlock can cause locking issues. However one
> > > call chain was missed during the addition of memcg charging from any
> > > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > > cgroup_file_notify().
> > >
> > > The possible function call tree under cgroup_file_notify() can acquire
> > > many different spin locks in spinning mode. Some of them are
> > > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > > just skip cgroup_file_notify() from memcg charging if the context does
> > > not allow spinning.
> > >
> > > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> >
> > Tested-by: Peilin Ye <yepeilin@google.com>
> >
> > The repro described in [1] no longer triggers locking issues after
> > applying this patch and making __bpf_async_init() use __GFP_HIGH
> > instead of GFP_ATOMIC:
> >
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> > }
> >
> > /* allocate hrtimer via map_kmalloc to use memcg accounting */
> > - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> > + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
>
> Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
> used instead here?

Yes. That's a plan. We'll convert most of bpf allocations to kmalloc_nolock()
when it lands.
On Mon 08-09-25 10:11:29, Alexei Starovoitov wrote:
> On Mon, Sep 8, 2025 at 2:08 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Fri 05-09-25 20:48:46, Peilin Ye wrote:
> > > On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > > > Generally memcg charging is allowed from all the contexts including NMI
> > > > where even spinning on spinlock can cause locking issues. However one
> > > > call chain was missed during the addition of memcg charging from any
> > > > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > > > cgroup_file_notify().
> > > >
> > > > The possible function call tree under cgroup_file_notify() can acquire
> > > > many different spin locks in spinning mode. Some of them are
> > > > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > > > just skip cgroup_file_notify() from memcg charging if the context does
> > > > not allow spinning.
> > > >
> > > > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> > >
> > > Tested-by: Peilin Ye <yepeilin@google.com>
> > >
> > > The repro described in [1] no longer triggers locking issues after
> > > applying this patch and making __bpf_async_init() use __GFP_HIGH
> > > instead of GFP_ATOMIC:
> > >
> > > --- a/kernel/bpf/helpers.c
> > > +++ b/kernel/bpf/helpers.c
> > > @@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> > > }
> > >
> > > /* allocate hrtimer via map_kmalloc to use memcg accounting */
> > > - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> > > + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
> >
> > Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
> > used instead here?
>
> Yes. That's a plan. We'll convert most of bpf allocations to kmalloc_nolock()
> when it lands.

OK, I thought this was merged already. I suspect __GFP_HIGH is used here
as a result of manual GFP_ATOMIC & ~GFP_RECLAIM. A TODO/FIXME referring
to kmalloc_nolock would clarify the situation.

-- 
Michal Hocko
SUSE Labs
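For illustration, the TODO Michal asks for could look roughly like this in
__bpf_async_init(); the comment wording is only a suggestion, not an actual posted
patch, and the claim about the reclaim bits assumes gfpflags_allow_spinning() is keyed
off __GFP_RECLAIM as sketched earlier:

	/*
	 * Allocate the hrtimer via map_kmalloc to use memcg accounting.
	 * __GFP_HIGH (rather than GFP_ATOMIC) deliberately omits the reclaim
	 * bits so gfpflags_allow_spinning() treats this as a non-spinning
	 * context. TODO: switch to kmalloc_nolock() once it is available.
	 */
	cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);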
On Fri, Sep 05, 2025 at 08:48:46PM +0000, Peilin Ye wrote:
> On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> >
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>
> Tested-by: Peilin Ye <yepeilin@google.com>
Thanks Peilin. When you post the official patch for __GFP_HIGH in
__bpf_async_init(), please add a comment on why __GFP_HIGH is used
instead of GFP_ATOMIC.
>
> The repro described in [1] no longer triggers locking issues after
> applying this patch and making __bpf_async_init() use __GFP_HIGH
> instead of GFP_ATOMIC:
>
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> }
>
> /* allocate hrtimer via map_kmalloc to use memcg accounting */
> - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
> if (!cb) {
> ret = -ENOMEM;
> goto out;
>
> [1] https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/#t
>
> Thanks,
> Peilin Ye
>
On Fri, Sep 05, 2025 at 02:33:16PM -0700, Shakeel Butt wrote:
> On Fri, Sep 05, 2025 at 08:48:46PM +0000, Peilin Ye wrote:
> > On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > > Generally memcg charging is allowed from all the contexts including NMI
> > > where even spinning on spinlock can cause locking issues. However one
> > > call chain was missed during the addition of memcg charging from any
> > > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > > cgroup_file_notify().
> > >
> > > The possible function call tree under cgroup_file_notify() can acquire
> > > many different spin locks in spinning mode. Some of them are
> > > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > > just skip cgroup_file_notify() from memcg charging if the context does
> > > not allow spinning.
> > >
> > > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> >
> > Tested-by: Peilin Ye <yepeilin@google.com>
>
> Thanks Peilin. When you post the official patch for __GFP_HIGH in
> __bpf_async_init(), please add a comment on why __GFP_HIGH is used
> instead of GFP_ATOMIC.

Got it! I'll schedule to have that done today.

Thanks,
Peilin Ye