Generally memcg charging is allowed from all contexts, including NMI,
where even spinning on a spinlock can cause locking issues. However,
one call chain was missed when support for memcg charging from any
context was added: try_charge_memcg() -> memcg_memory_event() ->
cgroup_file_notify().

The possible function call tree under cgroup_file_notify() can acquire
many different spin locks in spinning mode. Some of them are
cgroup_file_kn_lock, kernfs_notify_lock, and the pool_workqueue lock.
So let's just skip cgroup_file_notify() from memcg charging if the
context does not allow spinning.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
 include/linux/memcontrol.h | 23 ++++++++++++++++-------
 mm/memcontrol.c            |  7 ++++---
 2 files changed, 20 insertions(+), 10 deletions(-)
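For context: the patch hinges on gfpflags_allow_spinning(), which returns
false only when neither reclaim flag is set in the gfp mask (no regular
GFP_* constant clears both), so only opportunistic allocations such as
those from NMI context take the new skip path. A simplified sketch of the
predicate in include/linux/gfp.h (trimmed here; see the tree for the
authoritative version):

static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
{
        /*
         * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
         * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
         * All GFP_* flags including GFP_NOWAIT use one or both flags.
         * This is stronger than GFP_NOWAIT or GFP_ATOMIC: the allocation
         * must not even spin on locks (trylocks only).
         */
        return !!(gfp_flags & __GFP_RECLAIM);
}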
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9dc5b52672a6..054fa34c936a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -993,22 +993,25 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 		count_memcg_events_mm(mm, idx, 1);
 }
 
-static inline void memcg_memory_event(struct mem_cgroup *memcg,
-				      enum memcg_memory_event event)
+static inline void __memcg_memory_event(struct mem_cgroup *memcg,
+					enum memcg_memory_event event,
+					bool allow_spinning)
 {
 	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
 			  event == MEMCG_SWAP_FAIL;
 
 	atomic_long_inc(&memcg->memory_events_local[event]);
-	if (!swap_event)
+	if (!swap_event && allow_spinning)
 		cgroup_file_notify(&memcg->events_local_file);
 
 	do {
 		atomic_long_inc(&memcg->memory_events[event]);
-		if (swap_event)
-			cgroup_file_notify(&memcg->swap_events_file);
-		else
-			cgroup_file_notify(&memcg->events_file);
+		if (allow_spinning) {
+			if (swap_event)
+				cgroup_file_notify(&memcg->swap_events_file);
+			else
+				cgroup_file_notify(&memcg->events_file);
+		}
 
 		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 			break;
@@ -1018,6 +1021,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
 					 !mem_cgroup_is_root(memcg));
 }
 
+static inline void memcg_memory_event(struct mem_cgroup *memcg,
+				      enum memcg_memory_event event)
+{
+	__memcg_memory_event(memcg, event, true);
+}
+
 static inline void memcg_memory_event_mm(struct mm_struct *mm,
 					 enum memcg_memory_event event)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 257d2c76b730..dd5cd9d352f3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2306,12 +2306,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	bool drained = false;
 	bool raised_max_event = false;
 	unsigned long pflags;
+	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
 
 retry:
 	if (consume_stock(memcg, nr_pages))
 		return 0;
 
-	if (!gfpflags_allow_spinning(gfp_mask))
+	if (!allow_spinning)
 		/* Avoid the refill and flush of the older stock */
 		batch = nr_pages;
 
@@ -2347,7 +2348,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (!gfpflags_allow_blocking(gfp_mask))
 		goto nomem;
 
-	memcg_memory_event(mem_over_limit, MEMCG_MAX);
+	__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
 	raised_max_event = true;
 
 	psi_memstall_enter(&pflags);
@@ -2414,7 +2415,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * a MEMCG_MAX event.
 	 */
 	if (!raised_max_event)
-		memcg_memory_event(mem_over_limit, MEMCG_MAX);
+		__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);
 
 	/*
 	 * The allocation either can't fail or will lead to more memory
--
2.47.3
On Fri 05-09-25 13:16:06, Shakeel Butt wrote:
> Generally memcg charging is allowed from all contexts, including NMI,
> where even spinning on a spinlock can cause locking issues. However,
> one call chain was missed when support for memcg charging from any
> context was added: try_charge_memcg() -> memcg_memory_event() ->
> cgroup_file_notify().
>
> The possible function call tree under cgroup_file_notify() can acquire
> many different spin locks in spinning mode. Some of them are
> cgroup_file_kn_lock, kernfs_notify_lock, and the pool_workqueue lock.
> So let's just skip cgroup_file_notify() from memcg charging if the
> context does not allow spinning.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Acked-by: Michal Hocko <mhocko@suse.com>

[...]
> 	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
> 			  event == MEMCG_SWAP_FAIL;
>
> 	atomic_long_inc(&memcg->memory_events_local[event]);

Doesn't this involve locking on 32b? I guess we do not care all that
much, but we might want to bail out early on those arches for
!allow_spinning.

> -	if (!swap_event)
> +	if (!swap_event && allow_spinning)
> 		cgroup_file_notify(&memcg->events_local_file);
[...]
> --
> 2.47.3

--
Michal Hocko
SUSE Labs
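A bail-out along the lines Michal suggests might look like the sketch
below (hypothetical, not part of the posted patch). The underlying
concern is that on 32-bit architectures atomic_long_inc() maps to
atomic64 operations, and the generic atomic64 fallback in lib/atomic64.c
takes a hashed spinlock when the architecture has no native 64-bit
atomics:

	/*
	 * Hypothetical early bail-out: skip the event accounting entirely
	 * when 64-bit atomics may spin internally and the context cannot.
	 */
	if (!IS_ENABLED(CONFIG_64BIT) && !allow_spinning)
		return;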
On Mon, Sep 08, 2025 at 11:28:20AM +0200, Michal Hocko wrote:
> On Fri 05-09-25 13:16:06, Shakeel Butt wrote:
> [...]
>
> Acked-by: Michal Hocko <mhocko@suse.com>

Thanks.

> > 	atomic_long_inc(&memcg->memory_events_local[event]);
>
> Doesn't this involve locking on 32b? I guess we do not care all that
> much, but we might want to bail out early on those arches for
> !allow_spinning.

I am prototyping an irq_work based approach and if that looks good then
we might not need to worry about 32b at all.
Shakeel Butt <shakeel.butt@linux.dev> writes:

> Generally memcg charging is allowed from all contexts, including NMI,
> where even spinning on a spinlock can cause locking issues.
> [...]
> So let's just skip cgroup_file_notify() from memcg charging if the
> context does not allow spinning.

Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
like a big deal, but OOM events can be way more important.

Should we instead preserve the event (e.g. as a pending_event_mask) and
raise it on the next occasion / from a different context?

Thanks
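One shape Roman's pending_event_mask idea could take (a hypothetical
sketch; neither the field nor these call sites exist in the posted
patch) is to record the missed notification atomically and flush it from
the next context that is allowed to spin:

	/* Hypothetical new field in struct mem_cgroup. */
	unsigned long pending_notify_mask;

	/* !allow_spinning path: remember the event instead of notifying. */
	set_bit(event, &memcg->pending_notify_mask);

	/* Later, on any spinnable path: deliver what was deferred. */
	if (test_and_clear_bit(MEMCG_MAX, &memcg->pending_notify_mask))
		cgroup_file_notify(&memcg->events_file);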
On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
> [...]
> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> like a big deal, but OOM events can be way more important.
>
> Should we instead preserve the event (e.g. as a pending_event_mask) and
> raise it on the next occasion / from a different context?

Thanks for the review. For now only MAX can happen in a non-spinning
context. All others only happen in process context. Maybe with BPF OOM,
OOM might be possible in a different context (is that what you are
thinking?). I think we can add the complexity of preserving the event
when the actual need arises.
Shakeel Butt <shakeel.butt@linux.dev> writes:

> On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> [...]
> > Should we instead preserve the event (e.g. as a pending_event_mask) and
> > raise it on the next occasion / from a different context?
>
> Thanks for the review. For now only MAX can happen in a non-spinning
> context. All others only happen in process context. Maybe with BPF OOM,
> OOM might be possible in a different context (is that what you are
> thinking?). I think we can add the complexity of preserving the event
> when the actual need arises.

No, I haven't thought about any particular use case, just a bit worried
about silently dropping some events. It might not be an issue now, but
it might be easy to miss the moment when it becomes a problem.

So in my opinion using some delayed delivery mechanism is better than
just dropping these events.
On Fri, Sep 05, 2025 at 02:42:01PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
> [...]
> No, I haven't thought about any particular use case, just a bit worried
> about silently dropping some events. It might not be an issue now, but
> it might be easy to miss the moment when it becomes a problem.

Only the notification can be dropped, not the event (i.e. we are still
incrementing the counters). Also for MAX only, but I got your point.

> So in my opinion using some delayed delivery mechanism is better than
> just dropping these events.

Let me see how doing this with irq_work looks and will update here.
Shakeel Butt <shakeel.butt@linux.dev> writes:

> On Fri, Sep 05, 2025 at 02:42:01PM -0700, Roman Gushchin wrote:
> [...]
> > So in my opinion using some delayed delivery mechanism is better than
> > just dropping these events.
>
> Let me see how doing this with irq_work looks and will update here.

Thanks! If it doesn't work out for some reason, maybe at least
explicitly narrow it down to the MEMCG_MAX events.
On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> Shakeel Butt <shakeel.butt@linux.dev> writes:
> [...]
> Hmm, what about OOM events? Losing something like MEMCG_LOW doesn't look
> like a big deal, but OOM events can be way more important.
>
> Should we instead preserve the event (e.g. as a pending_event_mask) and
> raise it on the next occasion / from a different context?

Maybe punt with an irq_work?

Thanks.

--
tejun
On Fri, Sep 05, 2025 at 11:25:34AM -1000, Tejun Heo wrote:
> On Fri, Sep 05, 2025 at 02:20:46PM -0700, Roman Gushchin wrote:
> [...]
> > Should we instead preserve the event (e.g. as a pending_event_mask) and
> > raise it on the next occasion / from a different context?
>
> Maybe punt with an irq_work?

Oh, good suggestion, I will check irq_work as well. Though I think that
can be follow-up work.
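For reference, the irq_work punt would look roughly like the sketch
below (hypothetical field and function names; irq_work_queue() is
NMI-safe, and the callback later runs in hard-irq context, where taking
the kernfs spinlocks is fine):

#include <linux/irq_work.h>

/* Hypothetical per-memcg work item for deferred notifications. */
static void memcg_notify_fn(struct irq_work *work)
{
	struct mem_cgroup *memcg = container_of(work, struct mem_cgroup,
						notify_work);

	/* Runs outside NMI context, so spinning on locks is allowed. */
	cgroup_file_notify(&memcg->events_file);
}

	/* At memcg allocation time: */
	init_irq_work(&memcg->notify_work, memcg_notify_fn);

	/* On the !allow_spinning path, instead of dropping the notification: */
	irq_work_queue(&memcg->notify_work);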
On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> Generally memcg charging is allowed from all contexts, including NMI,
> where even spinning on a spinlock can cause locking issues.
> [...]
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Tested-by: Peilin Ye <yepeilin@google.com>

The repro described in [1] no longer triggers locking issues after
applying this patch and making __bpf_async_init() use __GFP_HIGH
instead of GFP_ATOMIC:

--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1275,7 +1275,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 	}
 
 	/* allocate hrtimer via map_kmalloc to use memcg accounting */
-	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+	cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
 	if (!cb) {
 		ret = -ENOMEM;
 		goto out;

[1] https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/#t

Thanks,
Peilin Ye
On Fri 05-09-25 20:48:46, Peilin Ye wrote:
> Tested-by: Peilin Ye <yepeilin@google.com>
>
> The repro described in [1] no longer triggers locking issues after
> applying this patch and making __bpf_async_init() use __GFP_HIGH
> instead of GFP_ATOMIC:
>
> [...]
> 	/* allocate hrtimer via map_kmalloc to use memcg accounting */
> -	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> +	cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);

Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
used instead here?

> 	if (!cb) {
> 		ret = -ENOMEM;
> 		goto out;
>
> [1] https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/#t

--
Michal Hocko
SUSE Labs
On Mon, Sep 8, 2025 at 2:08 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Fri 05-09-25 20:48:46, Peilin Ye wrote:
> [...]
> > -	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
> > +	cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
>
> Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
> used instead here?

Yes. That's the plan. We'll convert most of the bpf allocations to
kmalloc_nolock() when it lands.
On Mon 08-09-25 10:11:29, Alexei Starovoitov wrote:
> On Mon, Sep 8, 2025 at 2:08 AM Michal Hocko <mhocko@suse.com> wrote:
> [...]
> > Why do you need to consume memory reserves? Shouldn't kmalloc_nolock be
> > used instead here?
>
> Yes. That's the plan. We'll convert most of the bpf allocations to
> kmalloc_nolock() when it lands.

OK, I thought this was merged already. I suspect __GFP_HIGH is used here
as the result of a manual GFP_ATOMIC & ~__GFP_RECLAIM. A TODO/FIXME
referring to kmalloc_nolock would clarify the situation.

--
Michal Hocko
SUSE Labs
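The flag arithmetic behind Michal's suspicion, per the definitions in
include/linux/gfp_types.h (simplified here, casts omitted):

	#define GFP_ATOMIC	(__GFP_HIGH | __GFP_KSWAPD_RECLAIM)
	#define __GFP_RECLAIM	(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)

	/*
	 * Hence GFP_ATOMIC & ~__GFP_RECLAIM == __GFP_HIGH: both reclaim
	 * bits are cleared, so gfpflags_allow_spinning() returns false,
	 * yet the allocation still dips into memory reserves, which is
	 * what prompted the kmalloc_nolock() question.
	 */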
On Fri, Sep 05, 2025 at 08:48:46PM +0000, Peilin Ye wrote:
> On Fri, Sep 05, 2025 at 01:16:06PM -0700, Shakeel Butt wrote:
> [...]
> Tested-by: Peilin Ye <yepeilin@google.com>

Thanks Peilin. When you post the official patch for __GFP_HIGH in
__bpf_async_init(), please add a comment on why __GFP_HIGH is used
instead of GFP_ATOMIC.

> The repro described in [1] no longer triggers locking issues after
> applying this patch and making __bpf_async_init() use __GFP_HIGH
> instead of GFP_ATOMIC:
> [...]
On Fri, Sep 05, 2025 at 02:33:16PM -0700, Shakeel Butt wrote:
> [...]
> Thanks Peilin. When you post the official patch for __GFP_HIGH in
> __bpf_async_init(), please add a comment on why __GFP_HIGH is used
> instead of GFP_ATOMIC.

Got it! I'll get that done today.

Thanks,
Peilin Ye