[PATCH] memcg: skip cgroup_file_notify if spinning is not allowed

Shakeel Butt posted 1 patch 4 days, 1 hour ago
include/linux/memcontrol.h | 23 ++++++++++++++++-------
mm/memcontrol.c            |  7 ++++---
2 files changed, 20 insertions(+), 10 deletions(-)
[PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Shakeel Butt 4 days, 1 hour ago
Generally memcg charging is allowed from all the contexts including NMI
where even spinning on spinlock can cause locking issues. However one
call chain was missed during the addition of memcg charging from any
context support. That is try_charge_memcg() -> memcg_memory_event() ->
cgroup_file_notify().

The possible function call tree under cgroup_file_notify() can acquire
many different spin locks in spinning mode. Some of them are
cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
just skip cgroup_file_notify() from memcg charging if the context does
not allow spinning.

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 include/linux/memcontrol.h | 23 ++++++++++++++++-------
 mm/memcontrol.c            |  7 ++++---
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9dc5b52672a6..054fa34c936a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -993,22 +993,25 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 	count_memcg_events_mm(mm, idx, 1);
 }
 
-static inline void memcg_memory_event(struct mem_cgroup *memcg,
-				      enum memcg_memory_event event)
+static inline void __memcg_memory_event(struct mem_cgroup *memcg,
+					enum memcg_memory_event event,
+					bool allow_spinning)
 {
 	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
 			  event == MEMCG_SWAP_FAIL;
 
 	atomic_long_inc(&memcg->memory_events_local[event]);
-	if (!swap_event)
+	if (!swap_event && allow_spinning)
 		cgroup_file_notify(&memcg->events_local_file);
 
 	do {
 		atomic_long_inc(&memcg->memory_events[event]);
-		if (swap_event)
-			cgroup_file_notify(&memcg->swap_events_file);
-		else
-			cgroup_file_notify(&memcg->events_file);
+		if (allow_spinning) {
+			if (swap_event)
+				cgroup_file_notify(&memcg->swap_events_file);
+			else
+				cgroup_file_notify(&memcg->events_file);
+		}
 
 		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 			break;
@@ -1018,6 +1021,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
 		 !mem_cgroup_is_root(memcg));
 }
 
+static inline void memcg_memory_event(struct mem_cgroup *memcg,
+				      enum memcg_memory_event event)
+{
+	__memcg_memory_event(memcg, event, true);
+}
+
 static inline void memcg_memory_event_mm(struct mm_struct *mm,
 					 enum memcg_memory_event event)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 257d2c76b730..dd5cd9d352f3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2306,12 +2306,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	bool drained = false;
 	bool raised_max_event = false;
 	unsigned long pflags;
+	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
 
 retry:
 	if (consume_stock(memcg, nr_pages))
 		return 0;
 
-	if (!gfpflags_allow_spinning(gfp_mask))
+	if (!allow_spinning)
 		/* Avoid the refill and flush of the older stock */
 		batch = nr_pages;
 
@@ -2347,7 +2348,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (!gfpflags_allow_blocking(gfp_mask))
 		goto nomem;
 
-	memcg_memory_event(mem_over_limit, MEMCG_MAX);
+	__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
 	raised_max_event = true;
 
 	psi_memstall_enter(&pflags);
@@ -2414,7 +2415,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * a MEMCG_MAX event.
 	 */
 	if (!raised_max_event)
-		memcg_memory_event(mem_over_limit, MEMCG_MAX);
+		__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
 
 	/*
 	 * The allocation either can't fail or will lead to more memory
-- 
2.47.3
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Michal Hocko 1 day, 12 hours ago
On Fri 05-09-25 13:16:06, Shakeel Butt wrote:
> Generally memcg charging is allowed from all the contexts including NMI
> where even spinning on spinlock can cause locking issues. However one
> call chain was missed during the addition of memcg charging from any
> context support. That is try_charge_memcg() -> memcg_memory_event() ->
> cgroup_file_notify().
> 
> The possible function call tree under cgroup_file_notify() can acquire
> many different spin locks in spinning mode. Some of them are
> cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> just skip cgroup_file_notify() from memcg charging if the context does
> not allow spinning.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Acked-by: Michal Hocko <mhocko@suse.com>

> ---
>  include/linux/memcontrol.h | 23 ++++++++++++++++-------
>  mm/memcontrol.c            |  7 ++++---
>  2 files changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9dc5b52672a6..054fa34c936a 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -993,22 +993,25 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
>  	count_memcg_events_mm(mm, idx, 1);
>  }
>  
> -static inline void memcg_memory_event(struct mem_cgroup *memcg,
> -				      enum memcg_memory_event event)
> +static inline void __memcg_memory_event(struct mem_cgroup *memcg,
> +					enum memcg_memory_event event,
> +					bool allow_spinning)
>  {
>  	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
>  			  event == MEMCG_SWAP_FAIL;
>  
>  	atomic_long_inc(&memcg->memory_events_local[event]);

Doesn't this involve locking on 32b? I guess we do not care all that
much but we might want to bail out early on those arches for
!allow_spinning

> -	if (!swap_event)
> +	if (!swap_event && allow_spinning)
>  		cgroup_file_notify(&memcg->events_local_file);
>  
>  	do {
>  		atomic_long_inc(&memcg->memory_events[event]);
> -		if (swap_event)
> -			cgroup_file_notify(&memcg->swap_events_file);
> -		else
> -			cgroup_file_notify(&memcg->events_file);
> +		if (allow_spinning) {
> +			if (swap_event)
> +				cgroup_file_notify(&memcg->swap_events_file);
> +			else
> +				cgroup_file_notify(&memcg->events_file);
> +		}
>  
>  		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
>  			break;
> @@ -1018,6 +1021,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
>  		 !mem_cgroup_is_root(memcg));
>  }
>  
> +static inline void memcg_memory_event(struct mem_cgroup *memcg,
> +				      enum memcg_memory_event event)
> +{
> +	__memcg_memory_event(memcg, event, true);
> +}
> +
>  static inline void memcg_memory_event_mm(struct mm_struct *mm,
>  					 enum memcg_memory_event event)
>  {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 257d2c76b730..dd5cd9d352f3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2306,12 +2306,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	bool drained = false;
>  	bool raised_max_event = false;
>  	unsigned long pflags;
> +	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
>  
>  retry:
>  	if (consume_stock(memcg, nr_pages))
>  		return 0;
>  
> -	if (!gfpflags_allow_spinning(gfp_mask))
> +	if (!allow_spinning)
>  		/* Avoid the refill and flush of the older stock */
>  		batch = nr_pages;
>  
> @@ -2347,7 +2348,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	if (!gfpflags_allow_blocking(gfp_mask))
>  		goto nomem;
>  
> -	memcg_memory_event(mem_over_limit, MEMCG_MAX);
> +	__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
>  	raised_max_event = true;
>  
>  	psi_memstall_enter(&pflags);
> @@ -2414,7 +2415,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	 * a MEMCG_MAX event.
>  	 */
>  	if (!raised_max_event)
> -		memcg_memory_event(mem_over_limit, MEMCG_MAX);
> +		__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
>  
>  	/*
>  	 * The allocation either can't fail or will lead to more memory
> -- 
> 2.47.3
> 

-- 
Michal Hocko
SUSE Labs
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Shakeel Butt 1 day, 3 hours ago
On Mon, Sep 08, 2025 at 11:28:20AM +0200, Michal Hocko wrote:
> On Fri 05-09-25 13:16:06, Shakeel Butt wrote:
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> > 
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> > 
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> 
> Acked-by: Michal Hocko <mhocko@suse.com>

Thanks.

> 
> > ---
> >  include/linux/memcontrol.h | 23 ++++++++++++++++-------
> >  mm/memcontrol.c            |  7 ++++---
> >  2 files changed, 20 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 9dc5b52672a6..054fa34c936a 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -993,22 +993,25 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
> >  	count_memcg_events_mm(mm, idx, 1);
> >  }
> >  
> > -static inline void memcg_memory_event(struct mem_cgroup *memcg,
> > -				      enum memcg_memory_event event)
> > +static inline void __memcg_memory_event(struct mem_cgroup *memcg,
> > +					enum memcg_memory_event event,
> > +					bool allow_spinning)
> >  {
> >  	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
> >  			  event == MEMCG_SWAP_FAIL;
> >  
> >  	atomic_long_inc(&memcg->memory_events_local[event]);
> 
> Doesn't this involve locking on 32b? I guess we do not care all that
> much but we might want to bail out early on those arches for
> !allow_spinning
> 

I am prototyping irq_work based approach and if that looks good then we
might not need to worry about 32b at all.
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Roman Gushchin 4 days ago
Shakeel Butt <shakeel.butt@linux.dev> writes:

> Generally memcg charging is allowed from all the contexts including NMI
> where even spinning on spinlock can cause locking issues. However one
> call chain was missed during the addition of memcg charging from any
> context support. That is try_charge_memcg() -> memcg_memory_event() ->
> cgroup_file_notify().
>
> The possible function call tree under cgroup_file_notify() can acquire
> many different spin locks in spinning mode. Some of them are
> cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> just skip cgroup_file_notify() from memcg charging if the context does
> not allow spinning.

Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
like a bit deal, but OOM events can be way more important.

Should we instead preserve the event (e.g. as a pending_event_mask) and
raise it on the next occasion / from a different context?

Thanks
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Shakeel Butt 3 days, 23 hours ago
On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
> 
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> >
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> 
> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> like a bit deal, but OOM events can be way more important.
> 
> Should we instead preserve the event (e.g. as a pending_event_mask) and
> raise it on the next occasion / from a different context?
>

Thanks for the review. For now only MAX can happen in non-spinning
context. All others only happen in process context. Maybe with BPF OOM,
OOM might be possible in a different context (is that what you are
thinking?). I think we can add the complexity of preserving the event
when the actual need arise.
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Roman Gushchin 3 days, 23 hours ago
Shakeel Butt <shakeel.butt@linux.dev> writes:

> On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
>> Shakeel Butt <shakeel.butt@linux.dev> writes:
>> 
>> > Generally memcg charging is allowed from all the contexts including NMI
>> > where even spinning on spinlock can cause locking issues. However one
>> > call chain was missed during the addition of memcg charging from any
>> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
>> > cgroup_file_notify().
>> >
>> > The possible function call tree under cgroup_file_notify() can acquire
>> > many different spin locks in spinning mode. Some of them are
>> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
>> > just skip cgroup_file_notify() from memcg charging if the context does
>> > not allow spinning.
>> 
>> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
>> like a bit deal, but OOM events can be way more important.
>> 
>> Should we instead preserve the event (e.g. as a pending_event_mask) and
>> raise it on the next occasion / from a different context?
>>
>
> Thanks for the review. For now only MAX can happen in non-spinning
> context. All others only happen in process context. Maybe with BPF OOM,
> OOM might be possible in a different context (is that what you are
> thinking?). I think we can add the complexity of preserving the event
> when the actual need arise.

No, I haven't thought about any particular use case, just a bit
worried about silently dropping some events. It might be not an issue
now, but might be easy to miss a moment when it becomes a problem.

So in my opinion using some delayed delivery mechanism is better
than just dropping these events.
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Shakeel Butt 3 days, 23 hours ago
On Fri, Sep 05, 2025 at 02:42:01PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
> 
> > On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> >> Shakeel Butt <shakeel.butt@linux.dev> writes:
> >> 
> >> > Generally memcg charging is allowed from all the contexts including NMI
> >> > where even spinning on spinlock can cause locking issues. However one
> >> > call chain was missed during the addition of memcg charging from any
> >> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> >> > cgroup_file_notify().
> >> >
> >> > The possible function call tree under cgroup_file_notify() can acquire
> >> > many different spin locks in spinning mode. Some of them are
> >> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> >> > just skip cgroup_file_notify() from memcg charging if the context does
> >> > not allow spinning.
> >> 
> >> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> >> like a bit deal, but OOM events can be way more important.
> >> 
> >> Should we instead preserve the event (e.g. as a pending_event_mask) and
> >> raise it on the next occasion / from a different context?
> >>
> >
> > Thanks for the review. For now only MAX can happen in non-spinning
> > context. All others only happen in process context. Maybe with BPF OOM,
> > OOM might be possible in a different context (is that what you are
> > thinking?). I think we can add the complexity of preserving the event
> > when the actual need arise.
> 
> No, I haven't thought about any particular use case, just a bit
> worried about silently dropping some events. It might be not an issue
> now, but might be easy to miss a moment when it becomes a problem.
> 

Only the notification can be dropped and not the event (i.e. we are
still incrementing the counters). Also for MAX only but I got your
point.

> So in my opinion using some delayed delivery mechanism is better
> than just dropping these events.

Let me see how doing this irq_work looks like and will update here.
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Roman Gushchin 3 days, 22 hours ago
Shakeel Butt <shakeel.butt@linux.dev> writes:

> On Fri, Sep 05, 2025 at 02:42:01PM -0700, Roman Gushchin wrote:
>> Shakeel Butt <shakeel.butt@linux.dev> writes:
>> 
>> > On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
>> >> Shakeel Butt <shakeel.butt@linux.dev> writes:
>> >> 
>> >> > Generally memcg charging is allowed from all the contexts including NMI
>> >> > where even spinning on spinlock can cause locking issues. However one
>> >> > call chain was missed during the addition of memcg charging from any
>> >> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
>> >> > cgroup_file_notify().
>> >> >
>> >> > The possible function call tree under cgroup_file_notify() can acquire
>> >> > many different spin locks in spinning mode. Some of them are
>> >> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
>> >> > just skip cgroup_file_notify() from memcg charging if the context does
>> >> > not allow spinning.
>> >> 
>> >> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
>> >> like a bit deal, but OOM events can be way more important.
>> >> 
>> >> Should we instead preserve the event (e.g. as a pending_event_mask) and
>> >> raise it on the next occasion / from a different context?
>> >>
>> >
>> > Thanks for the review. For now only MAX can happen in non-spinning
>> > context. All others only happen in process context. Maybe with BPF OOM,
>> > OOM might be possible in a different context (is that what you are
>> > thinking?). I think we can add the complexity of preserving the event
>> > when the actual need arise.
>> 
>> No, I haven't thought about any particular use case, just a bit
>> worried about silently dropping some events. It might be not an issue
>> now, but might be easy to miss a moment when it becomes a problem.
>> 
>
> Only the notification can be dropped and not the event (i.e. we are
> still incrementing the counters). Also for MAX only but I got your
> point.
>
>> So in my opinion using some delayed delivery mechanism is better
>> than just dropping these events.
>
> Let me see how doing this irq_work looks like and will update here.

Thanks!

If it won't work out for some reason, maybe at least explicitly
narrow it down to the MEMCG_MAX events.
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Tejun Heo 4 days ago
On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
> 
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> >
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> 
> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> like a bit deal, but OOM events can be way more important.
> 
> Should we instead preserve the event (e.g. as a pending_event_mask) and
> raise it on the next occasion / from a different context?

Maybe punt with an irq_work?

Thanks.

-- 
tejun
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Shakeel Butt 3 days, 23 hours ago
On Fri, Sep 05, 2025 at 11:25:34AM -1000, Tejun Heo wrote:
> On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> > Shakeel Butt <shakeel.butt@linux.dev> writes:
> > 
> > > Generally memcg charging is allowed from all the contexts including NMI
> > > where even spinning on spinlock can cause locking issues. However one
> > > call chain was missed during the addition of memcg charging from any
> > > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > > cgroup_file_notify().
> > >
> > > The possible function call tree under cgroup_file_notify() can acquire
> > > many different spin locks in spinning mode. Some of them are
> > > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > > just skip cgroup_file_notify() from memcg charging if the context does
> > > not allow spinning.
> > 
> > Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> > like a bit deal, but OOM events can be way more important.
> > 
> > Should we instead preserve the event (e.g. as a pending_event_mask) and
> > raise it on the next occasion / from a different context?
> 
> Maybe punt with an irq_work?

Oh good suggestion, I will check irq_work as well. Though I think that
can be a followup work.
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Peilin Ye 4 days ago
On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> Generally memcg charging is allowed from all the contexts including NMI
> where even spinning on spinlock can cause locking issues. However one
> call chain was missed during the addition of memcg charging from any
> context support. That is try_charge_memcg() -> memcg_memory_event() ->
> cgroup_file_notify().
> 
> The possible function call tree under cgroup_file_notify() can acquire
> many different spin locks in spinning mode. Some of them are
> cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> just skip cgroup_file_notify() from memcg charging if the context does
> not allow spinning.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Tested-by: Peilin Ye <yepeilin@google.com>

The repro described in [1] no longer triggers locking issues after
applying this patch and making __bpf_async_init() use __GFP_HIGH
instead of GFP_ATOMIC:

--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
        }

        /* allocate hrtimer via map_kmalloc to use memcg accounting */
-       cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+       cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
        if (!cb) {
                ret = -ENOMEM;
                goto out;

[1] https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/#t

Thanks,
Peilin Ye
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Michal Hocko 1 day, 12 hours ago
On Fri 05-09-25 20:48:46, Peilin Ye wrote:
> On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> > 
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> > 
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> 
> Tested-by: Peilin Ye <yepeilin@google.com>
> 
> The repro described in [1] no longer triggers locking issues after
> applying this patch and making __bpf_async_init() use __GFP_HIGH
> instead of GFP_ATOMIC:
> 
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
>         }
> 
>         /* allocate hrtimer via map_kmalloc to use memcg accounting */
> -       cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> +       cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);

Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
used instead here?

>         if (!cb) {
>                 ret = -ENOMEM;
>                 goto out;
> 
> [1] https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/#t
> 
> Thanks,
> Peilin Ye

-- 
Michal Hocko
SUSE Labs
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Alexei Starovoitov 1 day, 4 hours ago
On Mon, Sep 8, 2025 at 2:08 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Fri 05-09-25 20:48:46, Peilin Ye wrote:
> > On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > > Generally memcg charging is allowed from all the contexts including NMI
> > > where even spinning on spinlock can cause locking issues. However one
> > > call chain was missed during the addition of memcg charging from any
> > > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > > cgroup_file_notify().
> > >
> > > The possible function call tree under cgroup_file_notify() can acquire
> > > many different spin locks in spinning mode. Some of them are
> > > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > > just skip cgroup_file_notify() from memcg charging if the context does
> > > not allow spinning.
> > >
> > > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> >
> > Tested-by: Peilin Ye <yepeilin@google.com>
> >
> > The repro described in [1] no longer triggers locking issues after
> > applying this patch and making __bpf_async_init() use __GFP_HIGH
> > instead of GFP_ATOMIC:
> >
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> >         }
> >
> >         /* allocate hrtimer via map_kmalloc to use memcg accounting */
> > -       cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> > +       cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
>
> Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
> used instead here?

Yes. That's a plan. We'll convert most of bpf allocations to kmalloc_nolock()
when it lands.
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Michal Hocko 15 hours ago
On Mon 08-09-25 10:11:29, Alexei Starovoitov wrote:
> On Mon, Sep 8, 2025 at 2:08 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Fri 05-09-25 20:48:46, Peilin Ye wrote:
> > > On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > > > Generally memcg charging is allowed from all the contexts including NMI
> > > > where even spinning on spinlock can cause locking issues. However one
> > > > call chain was missed during the addition of memcg charging from any
> > > > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > > > cgroup_file_notify().
> > > >
> > > > The possible function call tree under cgroup_file_notify() can acquire
> > > > many different spin locks in spinning mode. Some of them are
> > > > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > > > just skip cgroup_file_notify() from memcg charging if the context does
> > > > not allow spinning.
> > > >
> > > > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> > >
> > > Tested-by: Peilin Ye <yepeilin@google.com>
> > >
> > > The repro described in [1] no longer triggers locking issues after
> > > applying this patch and making __bpf_async_init() use __GFP_HIGH
> > > instead of GFP_ATOMIC:
> > >
> > > --- a/kernel/bpf/helpers.c
> > > +++ b/kernel/bpf/helpers.c
> > > @@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
> > >         }
> > >
> > >         /* allocate hrtimer via map_kmalloc to use memcg accounting */
> > > -       cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> > > +       cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
> >
> > Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
> > used instead here?
> 
> Yes. That's a plan. We'll convert most of bpf allocations to kmalloc_nolock()
> when it lands.

OK, I thought this was merged already. I suspect __GFP_HIGH is used here
as a result of manual GFP_ATOMIC & ~GFP_RECLAIM. A TODO/FIXME referring
to kmalloc_nolock would clarify the situation.

-- 
Michal Hocko
SUSE Labs
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Shakeel Butt 3 days, 23 hours ago
On Fri, Sep 05, 2025 at 08:48:46PM +0000, Peilin Ye wrote:
> On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > Generally memcg charging is allowed from all the contexts including NMI
> > where even spinning on spinlock can cause locking issues. However one
> > call chain was missed during the addition of memcg charging from any
> > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > cgroup_file_notify().
> > 
> > The possible function call tree under cgroup_file_notify() can acquire
> > many different spin locks in spinning mode. Some of them are
> > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > just skip cgroup_file_notify() from memcg charging if the context does
> > not allow spinning.
> > 
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> 
> Tested-by: Peilin Ye <yepeilin@google.com>

Thanks Peilin. When you post the official patch for __GFP_HIGH in
__bpf_async_init(), please add a comment on why __GFP_HIGH is used
instead of GFP_ATOMIC.

> 
> The repro described in [1] no longer triggers locking issues after
> applying this patch and making __bpf_async_init() use __GFP_HIGH
> instead of GFP_ATOMIC:
> 
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
>         }
> 
>         /* allocate hrtimer via map_kmalloc to use memcg accounting */
> -       cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> +       cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
>         if (!cb) {
>                 ret = -ENOMEM;
>                 goto out;
> 
> [1] https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/#t
> 
> Thanks,
> Peilin Ye
>
Re: [PATCH] memcg: skip cgroup_file_notify if spinning is not allowed
Posted by Peilin Ye 3 days, 23 hours ago
On Fri, Sep 05, 2025 at 02:33:16PM -0700, Shakeel Butt wrote:
> On Fri, Sep 05, 2025 at 08:48:46PM +0000, Peilin Ye wrote:
> > On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> > > Generally memcg charging is allowed from all the contexts including NMI
> > > where even spinning on spinlock can cause locking issues. However one
> > > call chain was missed during the addition of memcg charging from any
> > > context support. That is try_charge_memcg() -> memcg_memory_event() ->
> > > cgroup_file_notify().
> > > 
> > > The possible function call tree under cgroup_file_notify() can acquire
> > > many different spin locks in spinning mode. Some of them are
> > > cgroup_file_kn_lock, kernfs_notify_lock, pool_workqeue's lock. So, let's
> > > just skip cgroup_file_notify() from memcg charging if the context does
> > > not allow spinning.
> > > 
> > > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> > 
> > Tested-by: Peilin Ye <yepeilin@google.com>
> 
> Thanks Peilin. When you post the official patch for __GFP_HIGH in
> __bpf_async_init(), please add a comment on why __GFP_HIGH is used
> instead of GFP_ATOMIC.

Got it!  I'll schedule to have that done today.

Thanks,
Peilin Ye