__vmalloc_area_node() may call free_vm_area() or vfree() on
error paths, both of which can sleep. This becomes problematic
if the function is invoked from an atomic context, such as when
GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask.
To fix this, unify error paths and defer the cleanup of partly
initialized vm_struct objects to a workqueue. This ensures that
freeing happens in a process context and avoids invalid sleeps
in atomic regions.
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
include/linux/vmalloc.h | 6 +++++-
mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++---
2 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index fdc9aeb74a44..b1425fae8cbf 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */
#endif
struct vm_struct {
- struct vm_struct *next;
+ union {
+ struct vm_struct *next; /* Early registration of vm_areas. */
+ struct llist_node llnode; /* Asynchronous freeing on error paths. */
+ };
+
void *addr;
unsigned long size;
unsigned long flags;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7f48a54ec108..2424f80d524a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
return nr_allocated;
}
+static LLIST_HEAD(pending_vm_area_cleanup);
+static void cleanup_vm_area_work(struct work_struct *work)
+{
+ struct vm_struct *area, *tmp;
+ struct llist_node *head;
+
+ head = llist_del_all(&pending_vm_area_cleanup);
+ if (!head)
+ return;
+
+ llist_for_each_entry_safe(area, tmp, head, llnode) {
+ if (!area->pages)
+ free_vm_area(area);
+ else
+ vfree(area->addr);
+ }
+}
+
+/*
+ * Helper for __vmalloc_area_node() to defer cleanup
+ * of partially initialized vm_struct in error paths.
+ */
+static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
+static void defer_vm_area_cleanup(struct vm_struct *area)
+{
+ if (llist_add(&area->llnode, &pending_vm_area_cleanup))
+ schedule_work(&cleanup_vm_area);
+}
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, unsigned int page_shift,
int node)
@@ -3711,8 +3740,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
- free_vm_area(area);
- return NULL;
+ goto fail;
}
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
@@ -3789,7 +3817,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
- vfree(area->addr);
+ defer_vm_area_cleanup(area);
return NULL;
}
--
2.39.5
On 08/07/25 at 09:58am, Uladzislau Rezki (Sony) wrote: > __vmalloc_area_node() may call free_vmap_area() or vfree() on > error paths, both of which can sleep. This becomes problematic > if the function is invoked from an atomic context, such as when > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. > > To fix this, unify error paths and defer the cleanup of partly > initialized vm_struct objects to a workqueue. This ensures that > freeing happens in a process context and avoids invalid sleeps > in atomic regions. > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > --- > include/linux/vmalloc.h | 6 +++++- > mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- > 2 files changed, 36 insertions(+), 4 deletions(-) > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h > index fdc9aeb74a44..b1425fae8cbf 100644 > --- a/include/linux/vmalloc.h > +++ b/include/linux/vmalloc.h > @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ > #endif > > struct vm_struct { > - struct vm_struct *next; > + union { > + struct vm_struct *next; /* Early registration of vm_areas. */ > + struct llist_node llnode; /* Asynchronous freeing on error paths. */ > + }; > + > void *addr; > unsigned long size; > unsigned long flags; > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > index 7f48a54ec108..2424f80d524a 100644 > --- a/mm/vmalloc.c > +++ b/mm/vmalloc.c > @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, > return nr_allocated; > } > > +static LLIST_HEAD(pending_vm_area_cleanup); > +static void cleanup_vm_area_work(struct work_struct *work) > +{ > + struct vm_struct *area, *tmp; > + struct llist_node *head; > + > + head = llist_del_all(&pending_vm_area_cleanup); > + if (!head) > + return; > + > + llist_for_each_entry_safe(area, tmp, head, llnode) { > + if (!area->pages) > + free_vm_area(area); > + else > + vfree(area->addr); > + } > +} > + > +/* > + * Helper for __vmalloc_area_node() to defer cleanup > + * of partially initialized vm_struct in error paths. 
> + */ > +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); > +static void defer_vm_area_cleanup(struct vm_struct *area) > +{ > + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) > + schedule_work(&cleanup_vm_area); > +} Wondering why we need to call schedule_work() here when pending_vm_area_cleanup was empty before adding the new entry. Shouldn't it be as below to schedule the job? Not sure if I am missing anything. if (!llist_add(&area->llnode, &pending_vm_area_cleanup)) schedule_work(&cleanup_vm_area); ===== /** * llist_add - add a new entry * @new: new entry to be added * @head: the head for your lock-less list * * Returns true if the list was empty prior to adding this entry. */ static inline bool llist_add(struct llist_node *new, struct llist_head *head) { return llist_add_batch(new, new, head); } ===== > + > static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, > pgprot_t prot, unsigned int page_shift, > int node) > @@ -3711,8 +3740,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, > warn_alloc(gfp_mask, NULL, > "vmalloc error: size %lu, failed to allocated page array size %lu", > nr_small_pages * PAGE_SIZE, array_size); > - free_vm_area(area); > - return NULL; > + goto fail; > } > > set_vm_area_page_order(area, page_shift - PAGE_SHIFT); > @@ -3789,7 +3817,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, > return area->addr; > > fail: > - vfree(area->addr); > + defer_vm_area_cleanup(area); > return NULL; > } > > -- > 2.39.5 >
On Mon, Aug 18, 2025 at 12:21:15PM +0800, Baoquan He wrote: > On 08/07/25 at 09:58am, Uladzislau Rezki (Sony) wrote: > > __vmalloc_area_node() may call free_vmap_area() or vfree() on > > error paths, both of which can sleep. This becomes problematic > > if the function is invoked from an atomic context, such as when > > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. > > > > To fix this, unify error paths and defer the cleanup of partly > > initialized vm_struct objects to a workqueue. This ensures that > > freeing happens in a process context and avoids invalid sleeps > > in atomic regions. > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > > --- > > include/linux/vmalloc.h | 6 +++++- > > mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- > > 2 files changed, 36 insertions(+), 4 deletions(-) > > > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h > > index fdc9aeb74a44..b1425fae8cbf 100644 > > --- a/include/linux/vmalloc.h > > +++ b/include/linux/vmalloc.h > > @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ > > #endif > > > > struct vm_struct { > > - struct vm_struct *next; > > + union { > > + struct vm_struct *next; /* Early registration of vm_areas. */ > > + struct llist_node llnode; /* Asynchronous freeing on error paths. 
*/ > > + }; > > + > > void *addr; > > unsigned long size; > > unsigned long flags; > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > > index 7f48a54ec108..2424f80d524a 100644 > > --- a/mm/vmalloc.c > > +++ b/mm/vmalloc.c > > @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, > > return nr_allocated; > > } > > > > +static LLIST_HEAD(pending_vm_area_cleanup); > > +static void cleanup_vm_area_work(struct work_struct *work) > > +{ > > + struct vm_struct *area, *tmp; > > + struct llist_node *head; > > + > > + head = llist_del_all(&pending_vm_area_cleanup); > > + if (!head) > > + return; > > + > > + llist_for_each_entry_safe(area, tmp, head, llnode) { > > + if (!area->pages) > > + free_vm_area(area); > > + else > > + vfree(area->addr); > > + } > > +} > > + > > +/* > > + * Helper for __vmalloc_area_node() to defer cleanup > > + * of partially initialized vm_struct in error paths. > > + */ > > +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); > > +static void defer_vm_area_cleanup(struct vm_struct *area) > > +{ > > + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) > > + schedule_work(&cleanup_vm_area); > > +} > > Wondering why here we need call schudule_work() when > pending_vm_area_cleanup was empty before adding new entry. Shouldn't > it be as below to schedule the job? Not sure if I miss anything. > > if (!llist_add(&area->llnode, &pending_vm_area_cleanup)) > schedule_work(&cleanup_vm_area); > > ===== > /** > * llist_add - add a new entry > * @new: new entry to be added > * @head: the head for your lock-less list > * > * Returns true if the list was empty prior to adding this entry. > */ > static inline bool llist_add(struct llist_node *new, struct llist_head *head) > { > return llist_add_batch(new, new, head); > } > ===== > But then you will not schedule. If the list is empty, we add one element llist_add() returns 1, but your condition expects 0. 
How it works: If someone keeps adding to the llist and it is not empty, we should not trigger new work, because the current work is in flight (it will cover newcomers), i.e. it has been scheduled but has not yet completed llist_del_all() on the head. Once that is done, a newcomer will trigger the work again only if it sees NULL, i.e. when the list is empty. -- Uladzislau Rezki
On 08/18/25 at 03:02pm, Uladzislau Rezki wrote: > On Mon, Aug 18, 2025 at 12:21:15PM +0800, Baoquan He wrote: > > On 08/07/25 at 09:58am, Uladzislau Rezki (Sony) wrote: > > > __vmalloc_area_node() may call free_vmap_area() or vfree() on > > > error paths, both of which can sleep. This becomes problematic > > > if the function is invoked from an atomic context, such as when > > > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. > > > > > > To fix this, unify error paths and defer the cleanup of partly > > > initialized vm_struct objects to a workqueue. This ensures that > > > freeing happens in a process context and avoids invalid sleeps > > > in atomic regions. > > > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > > > --- > > > include/linux/vmalloc.h | 6 +++++- > > > mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- > > > 2 files changed, 36 insertions(+), 4 deletions(-) > > > > > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h > > > index fdc9aeb74a44..b1425fae8cbf 100644 > > > --- a/include/linux/vmalloc.h > > > +++ b/include/linux/vmalloc.h > > > @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ > > > #endif > > > > > > struct vm_struct { > > > - struct vm_struct *next; > > > + union { > > > + struct vm_struct *next; /* Early registration of vm_areas. */ > > > + struct llist_node llnode; /* Asynchronous freeing on error paths. 
*/ > > > + }; > > > + > > > void *addr; > > > unsigned long size; > > > unsigned long flags; > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > > > index 7f48a54ec108..2424f80d524a 100644 > > > --- a/mm/vmalloc.c > > > +++ b/mm/vmalloc.c > > > @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, > > > return nr_allocated; > > > } > > > > > > +static LLIST_HEAD(pending_vm_area_cleanup); > > > +static void cleanup_vm_area_work(struct work_struct *work) > > > +{ > > > + struct vm_struct *area, *tmp; > > > + struct llist_node *head; > > > + > > > + head = llist_del_all(&pending_vm_area_cleanup); > > > + if (!head) > > > + return; > > > + > > > + llist_for_each_entry_safe(area, tmp, head, llnode) { > > > + if (!area->pages) > > > + free_vm_area(area); > > > + else > > > + vfree(area->addr); > > > + } > > > +} > > > + > > > +/* > > > + * Helper for __vmalloc_area_node() to defer cleanup > > > + * of partially initialized vm_struct in error paths. > > > + */ > > > +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); > > > +static void defer_vm_area_cleanup(struct vm_struct *area) > > > +{ > > > + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) > > > + schedule_work(&cleanup_vm_area); > > > +} > > > > Wondering why here we need call schudule_work() when > > pending_vm_area_cleanup was empty before adding new entry. Shouldn't > > it be as below to schedule the job? Not sure if I miss anything. > > > > if (!llist_add(&area->llnode, &pending_vm_area_cleanup)) > > schedule_work(&cleanup_vm_area); > > > > ===== > > /** > > * llist_add - add a new entry > > * @new: new entry to be added > > * @head: the head for your lock-less list > > * > > * Returns true if the list was empty prior to adding this entry. > > */ > > static inline bool llist_add(struct llist_node *new, struct llist_head *head) > > { > > return llist_add_batch(new, new, head); > > } > > ===== > > > But then you will not schedule. 
If the list is empty, we add one element > llist_add() returns 1, but your condition expects 0. > > How it works: > > If someone keeps adding to the llist and it is not empty we should not > trigger a new work, because a current work is in flight(it will cover new comers), > i.e. it has been scheduled but it has not yet completed llist_del_all() on > the head. > > Once it is done, a new comer will trigger a work again only if it sees NULL, > i.e. when the list is empty. Fair enough. I thought it's a deferring work, in fact it's aiming to put the error handling in a workqueue, but not the current atomic context. Thanks for the explanation.
On Tue, Aug 19, 2025 at 04:56:25PM +0800, Baoquan He wrote: > On 08/18/25 at 03:02pm, Uladzislau Rezki wrote: > > On Mon, Aug 18, 2025 at 12:21:15PM +0800, Baoquan He wrote: > > > On 08/07/25 at 09:58am, Uladzislau Rezki (Sony) wrote: > > > > __vmalloc_area_node() may call free_vmap_area() or vfree() on > > > > error paths, both of which can sleep. This becomes problematic > > > > if the function is invoked from an atomic context, such as when > > > > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. > > > > > > > > To fix this, unify error paths and defer the cleanup of partly > > > > initialized vm_struct objects to a workqueue. This ensures that > > > > freeing happens in a process context and avoids invalid sleeps > > > > in atomic regions. > > > > > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > > > > --- > > > > include/linux/vmalloc.h | 6 +++++- > > > > mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- > > > > 2 files changed, 36 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h > > > > index fdc9aeb74a44..b1425fae8cbf 100644 > > > > --- a/include/linux/vmalloc.h > > > > +++ b/include/linux/vmalloc.h > > > > @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ > > > > #endif > > > > > > > > struct vm_struct { > > > > - struct vm_struct *next; > > > > + union { > > > > + struct vm_struct *next; /* Early registration of vm_areas. */ > > > > + struct llist_node llnode; /* Asynchronous freeing on error paths. 
*/ > > > > + }; > > > > + > > > > void *addr; > > > > unsigned long size; > > > > unsigned long flags; > > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > > > > index 7f48a54ec108..2424f80d524a 100644 > > > > --- a/mm/vmalloc.c > > > > +++ b/mm/vmalloc.c > > > > @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, > > > > return nr_allocated; > > > > } > > > > > > > > +static LLIST_HEAD(pending_vm_area_cleanup); > > > > +static void cleanup_vm_area_work(struct work_struct *work) > > > > +{ > > > > + struct vm_struct *area, *tmp; > > > > + struct llist_node *head; > > > > + > > > > + head = llist_del_all(&pending_vm_area_cleanup); > > > > + if (!head) > > > > + return; > > > > + > > > > + llist_for_each_entry_safe(area, tmp, head, llnode) { > > > > + if (!area->pages) > > > > + free_vm_area(area); > > > > + else > > > > + vfree(area->addr); > > > > + } > > > > +} > > > > + > > > > +/* > > > > + * Helper for __vmalloc_area_node() to defer cleanup > > > > + * of partially initialized vm_struct in error paths. > > > > + */ > > > > +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); > > > > +static void defer_vm_area_cleanup(struct vm_struct *area) > > > > +{ > > > > + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) > > > > + schedule_work(&cleanup_vm_area); > > > > +} > > > > > > Wondering why here we need call schudule_work() when > > > pending_vm_area_cleanup was empty before adding new entry. Shouldn't > > > it be as below to schedule the job? Not sure if I miss anything. > > > > > > if (!llist_add(&area->llnode, &pending_vm_area_cleanup)) > > > schedule_work(&cleanup_vm_area); > > > > > > ===== > > > /** > > > * llist_add - add a new entry > > > * @new: new entry to be added > > > * @head: the head for your lock-less list > > > * > > > * Returns true if the list was empty prior to adding this entry. 
> > > */ > > > static inline bool llist_add(struct llist_node *new, struct llist_head *head) > > > { > > > return llist_add_batch(new, new, head); > > > } > > > ===== > > > > > But then you will not schedule. If the list is empty, we add one element > > llist_add() returns 1, but your condition expects 0. > > > > How it works: > > > > If someone keeps adding to the llist and it is not empty we should not > > trigger a new work, because a current work is in flight(it will cover new comers), > > i.e. it has been scheduled but it has not yet completed llist_del_all() on > > the head. > > > > Once it is done, a new comer will trigger a work again only if it sees NULL, > > i.e. when the list is empty. > > Fair enough. I thought it's a deferring work, in fact it's aiming to put the > error handling in a workqueue, but not the current atomic context. > Thanks for the explanation. > You are welcome! -- Uladzislau Rezki
On Thu 07-08-25 09:58:08, Uladzislau Rezki wrote: > __vmalloc_area_node() may call free_vmap_area() or vfree() on > error paths, both of which can sleep. This becomes problematic > if the function is invoked from an atomic context, such as when > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. > > To fix this, unify error paths and defer the cleanup of partly > initialized vm_struct objects to a workqueue. This ensures that > freeing happens in a process context and avoids invalid sleeps > in atomic regions. > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> LGTM Acked-by: Michal Hocko <mhocko@suse.com> Thanks! > --- > include/linux/vmalloc.h | 6 +++++- > mm/vmalloc.c | 34 +++++++++++++++++++++++++++++++--- > 2 files changed, 36 insertions(+), 4 deletions(-) > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h > index fdc9aeb74a44..b1425fae8cbf 100644 > --- a/include/linux/vmalloc.h > +++ b/include/linux/vmalloc.h > @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ > #endif > > struct vm_struct { > - struct vm_struct *next; > + union { > + struct vm_struct *next; /* Early registration of vm_areas. */ > + struct llist_node llnode; /* Asynchronous freeing on error paths. 
*/ > + }; > + > void *addr; > unsigned long size; > unsigned long flags; > diff --git a/mm/vmalloc.c b/mm/vmalloc.c > index 7f48a54ec108..2424f80d524a 100644 > --- a/mm/vmalloc.c > +++ b/mm/vmalloc.c > @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid, > return nr_allocated; > } > > +static LLIST_HEAD(pending_vm_area_cleanup); > +static void cleanup_vm_area_work(struct work_struct *work) > +{ > + struct vm_struct *area, *tmp; > + struct llist_node *head; > + > + head = llist_del_all(&pending_vm_area_cleanup); > + if (!head) > + return; > + > + llist_for_each_entry_safe(area, tmp, head, llnode) { > + if (!area->pages) > + free_vm_area(area); > + else > + vfree(area->addr); > + } > +} > + > +/* > + * Helper for __vmalloc_area_node() to defer cleanup > + * of partially initialized vm_struct in error paths. > + */ > +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); > +static void defer_vm_area_cleanup(struct vm_struct *area) > +{ > + if (llist_add(&area->llnode, &pending_vm_area_cleanup)) > + schedule_work(&cleanup_vm_area); > +} > + > static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, > pgprot_t prot, unsigned int page_shift, > int node) > @@ -3711,8 +3740,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, > warn_alloc(gfp_mask, NULL, > "vmalloc error: size %lu, failed to allocated page array size %lu", > nr_small_pages * PAGE_SIZE, array_size); > - free_vm_area(area); > - return NULL; > + goto fail; > } > > set_vm_area_page_order(area, page_shift - PAGE_SHIFT); > @@ -3789,7 +3817,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, > return area->addr; > > fail: > - vfree(area->addr); > + defer_vm_area_cleanup(area); > return NULL; > } > > -- > 2.39.5 -- Michal Hocko SUSE Labs
On Thu, Aug 07, 2025 at 01:25:01PM +0200, Michal Hocko wrote: > On Thu 07-08-25 09:58:08, Uladzislau Rezki wrote: > > __vmalloc_area_node() may call free_vmap_area() or vfree() on > > error paths, both of which can sleep. This becomes problematic > > if the function is invoked from an atomic context, such as when > > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. > > > > To fix this, unify error paths and defer the cleanup of partly > > initialized vm_struct objects to a workqueue. This ensures that > > freeing happens in a process context and avoids invalid sleeps > > in atomic regions. > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> > > LGTM > Acked-by: Michal Hocko <mhocko@suse.com> > Thanks! > Thanks, applied Acked-by. -- Uladzislau Rezki
© 2016 - 2025 Red Hat, Inc.