__vmalloc() and no-block support

[PATCH 6/8] mm/vmalloc: Defer freeing partly initialized vm_struct

Posted by Uladzislau Rezki (Sony) 6 months ago

__vmalloc_area_node() may call free_vm_area() or vfree() on
error paths, both of which can sleep. This becomes problematic
if the function is invoked from an atomic context, such as when
GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask.

To fix this, unify error paths and defer the cleanup of partly
initialized vm_struct objects to a workqueue. This ensures that
freeing happens in a process context and avoids invalid sleeps
in atomic regions.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/vmalloc.h |  6 +++++-
 mm/vmalloc.c            | 34 +++++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index fdc9aeb74a44..b1425fae8cbf 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -50,7 +50,11 @@ struct iov_iter;		/* in uio.h */
 #endif
 
 struct vm_struct {
-	struct vm_struct	*next;
+	union {
+		struct vm_struct *next;	  /* Early registration of vm_areas. */
+		struct llist_node llnode; /* Asynchronous freeing on error paths. */
+	};
+
 	void			*addr;
 	unsigned long		size;
 	unsigned long		flags;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7f48a54ec108..2424f80d524a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 	return nr_allocated;
 }
 
+static LLIST_HEAD(pending_vm_area_cleanup);
+static void cleanup_vm_area_work(struct work_struct *work)
+{
+	struct vm_struct *area, *tmp;
+	struct llist_node *head;
+
+	head = llist_del_all(&pending_vm_area_cleanup);
+	if (!head)
+		return;
+
+	llist_for_each_entry_safe(area, tmp, head, llnode) {
+		if (!area->pages)
+			free_vm_area(area);
+		else
+			vfree(area->addr);
+	}
+}
+
+/*
+ * Helper for __vmalloc_area_node() to defer cleanup
+ * of partially initialized vm_struct in error paths.
+ */
+static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
+static void defer_vm_area_cleanup(struct vm_struct *area)
+{
+	if (llist_add(&area->llnode, &pending_vm_area_cleanup))
+		schedule_work(&cleanup_vm_area);
+}
+
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, unsigned int page_shift,
 				 int node)
@@ -3711,8 +3740,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		warn_alloc(gfp_mask, NULL,
 			"vmalloc error: size %lu, failed to allocated page array size %lu",
 			nr_small_pages * PAGE_SIZE, array_size);
-		free_vm_area(area);
-		return NULL;
+		goto fail;
 	}
 
 	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
@@ -3789,7 +3817,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	return area->addr;
 
 fail:
-	vfree(area->addr);
+	defer_vm_area_cleanup(area);
 	return NULL;
 }
 
-- 
2.39.5

Re: [PATCH 6/8] mm/vmalloc: Defer freeing partly initialized vm_struct

Posted by Baoquan He 5 months, 3 weeks ago

On 08/07/25 at 09:58am, Uladzislau Rezki (Sony) wrote:
> __vmalloc_area_node() may call free_vmap_area() or vfree() on
> error paths, both of which can sleep. This becomes problematic
> if the function is invoked from an atomic context, such as when
> GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask.
> 
> To fix this, unify error paths and defer the cleanup of partly
> initialized vm_struct objects to a workqueue. This ensures that
> freeing happens in a process context and avoids invalid sleeps
> in atomic regions.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  include/linux/vmalloc.h |  6 +++++-
>  mm/vmalloc.c            | 34 +++++++++++++++++++++++++++++++---
>  2 files changed, 36 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index fdc9aeb74a44..b1425fae8cbf 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -50,7 +50,11 @@ struct iov_iter;		/* in uio.h */
>  #endif
>  
>  struct vm_struct {
> -	struct vm_struct	*next;
> +	union {
> +		struct vm_struct *next;	  /* Early registration of vm_areas. */
> +		struct llist_node llnode; /* Asynchronous freeing on error paths. */
> +	};
> +
>  	void			*addr;
>  	unsigned long		size;
>  	unsigned long		flags;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 7f48a54ec108..2424f80d524a 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
>  	return nr_allocated;
>  }
>  
> +static LLIST_HEAD(pending_vm_area_cleanup);
> +static void cleanup_vm_area_work(struct work_struct *work)
> +{
> +	struct vm_struct *area, *tmp;
> +	struct llist_node *head;
> +
> +	head = llist_del_all(&pending_vm_area_cleanup);
> +	if (!head)
> +		return;
> +
> +	llist_for_each_entry_safe(area, tmp, head, llnode) {
> +		if (!area->pages)
> +			free_vm_area(area);
> +		else
> +			vfree(area->addr);
> +	}
> +}
> +
> +/*
> + * Helper for __vmalloc_area_node() to defer cleanup
> + * of partially initialized vm_struct in error paths.
> + */
> +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
> +static void defer_vm_area_cleanup(struct vm_struct *area)
> +{
> +	if (llist_add(&area->llnode, &pending_vm_area_cleanup))
> +		schedule_work(&cleanup_vm_area);
> +}

Wondering why we need to call schedule_work() here when
pending_vm_area_cleanup was empty before adding the new entry. Shouldn't
it be as below to schedule the job? Not sure if I'm missing anything.

	if (!llist_add(&area->llnode, &pending_vm_area_cleanup))
		schedule_work(&cleanup_vm_area);

=====
/**
 * llist_add - add a new entry
 * @new:        new entry to be added
 * @head:       the head for your lock-less list
 *
 * Returns true if the list was empty prior to adding this entry.
 */
static inline bool llist_add(struct llist_node *new, struct llist_head *head)
{
        return llist_add_batch(new, new, head);
}
=====

> +
>  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  				 pgprot_t prot, unsigned int page_shift,
>  				 int node)
> @@ -3711,8 +3740,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  		warn_alloc(gfp_mask, NULL,
>  			"vmalloc error: size %lu, failed to allocated page array size %lu",
>  			nr_small_pages * PAGE_SIZE, array_size);
> -		free_vm_area(area);
> -		return NULL;
> +		goto fail;
>  	}
>  
>  	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
> @@ -3789,7 +3817,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	return area->addr;
>  
>  fail:
> -	vfree(area->addr);
> +	defer_vm_area_cleanup(area);
>  	return NULL;
>  }
>  
> -- 
> 2.39.5
>

Re: [PATCH 6/8] mm/vmalloc: Defer freeing partly initialized vm_struct

Posted by Uladzislau Rezki 5 months, 3 weeks ago

On Mon, Aug 18, 2025 at 12:21:15PM +0800, Baoquan He wrote:
> On 08/07/25 at 09:58am, Uladzislau Rezki (Sony) wrote:
> > __vmalloc_area_node() may call free_vmap_area() or vfree() on
> > error paths, both of which can sleep. This becomes problematic
> > if the function is invoked from an atomic context, such as when
> > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask.
> > 
> > To fix this, unify error paths and defer the cleanup of partly
> > initialized vm_struct objects to a workqueue. This ensures that
> > freeing happens in a process context and avoids invalid sleeps
> > in atomic regions.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  include/linux/vmalloc.h |  6 +++++-
> >  mm/vmalloc.c            | 34 +++++++++++++++++++++++++++++++---
> >  2 files changed, 36 insertions(+), 4 deletions(-)
> > 
> > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> > index fdc9aeb74a44..b1425fae8cbf 100644
> > --- a/include/linux/vmalloc.h
> > +++ b/include/linux/vmalloc.h
> > @@ -50,7 +50,11 @@ struct iov_iter;		/* in uio.h */
> >  #endif
> >  
> >  struct vm_struct {
> > -	struct vm_struct	*next;
> > +	union {
> > +		struct vm_struct *next;	  /* Early registration of vm_areas. */
> > +		struct llist_node llnode; /* Asynchronous freeing on error paths. */
> > +	};
> > +
> >  	void			*addr;
> >  	unsigned long		size;
> >  	unsigned long		flags;
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 7f48a54ec108..2424f80d524a 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
> >  	return nr_allocated;
> >  }
> >  
> > +static LLIST_HEAD(pending_vm_area_cleanup);
> > +static void cleanup_vm_area_work(struct work_struct *work)
> > +{
> > +	struct vm_struct *area, *tmp;
> > +	struct llist_node *head;
> > +
> > +	head = llist_del_all(&pending_vm_area_cleanup);
> > +	if (!head)
> > +		return;
> > +
> > +	llist_for_each_entry_safe(area, tmp, head, llnode) {
> > +		if (!area->pages)
> > +			free_vm_area(area);
> > +		else
> > +			vfree(area->addr);
> > +	}
> > +}
> > +
> > +/*
> > + * Helper for __vmalloc_area_node() to defer cleanup
> > + * of partially initialized vm_struct in error paths.
> > + */
> > +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
> > +static void defer_vm_area_cleanup(struct vm_struct *area)
> > +{
> > +	if (llist_add(&area->llnode, &pending_vm_area_cleanup))
> > +		schedule_work(&cleanup_vm_area);
> > +}
> 
> Wondering why here we need call schudule_work() when
> pending_vm_area_cleanup was empty before adding new entry. Shouldn't
> it be as below to schedule the job? Not sure if I miss anything.
> 
> 	if (!llist_add(&area->llnode, &pending_vm_area_cleanup))
> 		schedule_work(&cleanup_vm_area);
> 
> =====
> /**
>  * llist_add - add a new entry
>  * @new:        new entry to be added
>  * @head:       the head for your lock-less list
>  *
>  * Returns true if the list was empty prior to adding this entry.
>  */
> static inline bool llist_add(struct llist_node *new, struct llist_head *head)
> {
>         return llist_add_batch(new, new, head);
> }
> =====
> 
But then you will not schedule. If the list is empty and we add one element,
llist_add() returns true, but your condition expects false.

How it works:

If someone keeps adding to the llist while it is not empty, we should not
trigger a new work, because the current work is in flight (it will cover the
newcomers), i.e. it has been scheduled but it has not yet completed
llist_del_all() on the head.

Once that is done, a newcomer will trigger the work again only if it sees NULL,
i.e. when the list is empty.

--
Uladzislau Rezki

Re: [PATCH 6/8] mm/vmalloc: Defer freeing partly initialized vm_struct

Posted by Baoquan He 5 months, 3 weeks ago

On 08/18/25 at 03:02pm, Uladzislau Rezki wrote:
> On Mon, Aug 18, 2025 at 12:21:15PM +0800, Baoquan He wrote:
> > On 08/07/25 at 09:58am, Uladzislau Rezki (Sony) wrote:
> > > __vmalloc_area_node() may call free_vmap_area() or vfree() on
> > > error paths, both of which can sleep. This becomes problematic
> > > if the function is invoked from an atomic context, such as when
> > > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask.
> > > 
> > > To fix this, unify error paths and defer the cleanup of partly
> > > initialized vm_struct objects to a workqueue. This ensures that
> > > freeing happens in a process context and avoids invalid sleeps
> > > in atomic regions.
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > ---
> > >  include/linux/vmalloc.h |  6 +++++-
> > >  mm/vmalloc.c            | 34 +++++++++++++++++++++++++++++++---
> > >  2 files changed, 36 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> > > index fdc9aeb74a44..b1425fae8cbf 100644
> > > --- a/include/linux/vmalloc.h
> > > +++ b/include/linux/vmalloc.h
> > > @@ -50,7 +50,11 @@ struct iov_iter;		/* in uio.h */
> > >  #endif
> > >  
> > >  struct vm_struct {
> > > -	struct vm_struct	*next;
> > > +	union {
> > > +		struct vm_struct *next;	  /* Early registration of vm_areas. */
> > > +		struct llist_node llnode; /* Asynchronous freeing on error paths. */
> > > +	};
> > > +
> > >  	void			*addr;
> > >  	unsigned long		size;
> > >  	unsigned long		flags;
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index 7f48a54ec108..2424f80d524a 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
> > >  	return nr_allocated;
> > >  }
> > >  
> > > +static LLIST_HEAD(pending_vm_area_cleanup);
> > > +static void cleanup_vm_area_work(struct work_struct *work)
> > > +{
> > > +	struct vm_struct *area, *tmp;
> > > +	struct llist_node *head;
> > > +
> > > +	head = llist_del_all(&pending_vm_area_cleanup);
> > > +	if (!head)
> > > +		return;
> > > +
> > > +	llist_for_each_entry_safe(area, tmp, head, llnode) {
> > > +		if (!area->pages)
> > > +			free_vm_area(area);
> > > +		else
> > > +			vfree(area->addr);
> > > +	}
> > > +}
> > > +
> > > +/*
> > > + * Helper for __vmalloc_area_node() to defer cleanup
> > > + * of partially initialized vm_struct in error paths.
> > > + */
> > > +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
> > > +static void defer_vm_area_cleanup(struct vm_struct *area)
> > > +{
> > > +	if (llist_add(&area->llnode, &pending_vm_area_cleanup))
> > > +		schedule_work(&cleanup_vm_area);
> > > +}
> > 
> > Wondering why here we need call schudule_work() when
> > pending_vm_area_cleanup was empty before adding new entry. Shouldn't
> > it be as below to schedule the job? Not sure if I miss anything.
> > 
> > 	if (!llist_add(&area->llnode, &pending_vm_area_cleanup))
> > 		schedule_work(&cleanup_vm_area);
> > 
> > =====
> > /**
> >  * llist_add - add a new entry
> >  * @new:        new entry to be added
> >  * @head:       the head for your lock-less list
> >  *
> >  * Returns true if the list was empty prior to adding this entry.
> >  */
> > static inline bool llist_add(struct llist_node *new, struct llist_head *head)
> > {
> >         return llist_add_batch(new, new, head);
> > }
> > =====
> > 
> But then you will not schedule. If the list is empty, we add one element
> llist_add() returns 1, but your condition expects 0.
> 
> How it works:
> 
> If someone keeps adding to the llist and it is not empty we should not
> trigger a new work, because a current work is in flight(it will cover new comers),
> i.e. it has been scheduled but it has not yet completed llist_del_all() on
> the head.
> 
> Once it is done, a new comer will trigger a work again only if it sees NULL,
> i.e. when the list is empty.

Fair enough. I thought it was a deferring work; in fact, its aim is to run the
error handling in a workqueue rather than in the current atomic context.
Thanks for the explanation.

Re: [PATCH 6/8] mm/vmalloc: Defer freeing partly initialized vm_struct

Posted by Uladzislau Rezki 5 months, 3 weeks ago

On Tue, Aug 19, 2025 at 04:56:25PM +0800, Baoquan He wrote:
> On 08/18/25 at 03:02pm, Uladzislau Rezki wrote:
> > On Mon, Aug 18, 2025 at 12:21:15PM +0800, Baoquan He wrote:
> > > On 08/07/25 at 09:58am, Uladzislau Rezki (Sony) wrote:
> > > > __vmalloc_area_node() may call free_vmap_area() or vfree() on
> > > > error paths, both of which can sleep. This becomes problematic
> > > > if the function is invoked from an atomic context, such as when
> > > > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask.
> > > > 
> > > > To fix this, unify error paths and defer the cleanup of partly
> > > > initialized vm_struct objects to a workqueue. This ensures that
> > > > freeing happens in a process context and avoids invalid sleeps
> > > > in atomic regions.
> > > > 
> > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > ---
> > > >  include/linux/vmalloc.h |  6 +++++-
> > > >  mm/vmalloc.c            | 34 +++++++++++++++++++++++++++++++---
> > > >  2 files changed, 36 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> > > > index fdc9aeb74a44..b1425fae8cbf 100644
> > > > --- a/include/linux/vmalloc.h
> > > > +++ b/include/linux/vmalloc.h
> > > > @@ -50,7 +50,11 @@ struct iov_iter;		/* in uio.h */
> > > >  #endif
> > > >  
> > > >  struct vm_struct {
> > > > -	struct vm_struct	*next;
> > > > +	union {
> > > > +		struct vm_struct *next;	  /* Early registration of vm_areas. */
> > > > +		struct llist_node llnode; /* Asynchronous freeing on error paths. */
> > > > +	};
> > > > +
> > > >  	void			*addr;
> > > >  	unsigned long		size;
> > > >  	unsigned long		flags;
> > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > index 7f48a54ec108..2424f80d524a 100644
> > > > --- a/mm/vmalloc.c
> > > > +++ b/mm/vmalloc.c
> > > > @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
> > > >  	return nr_allocated;
> > > >  }
> > > >  
> > > > +static LLIST_HEAD(pending_vm_area_cleanup);
> > > > +static void cleanup_vm_area_work(struct work_struct *work)
> > > > +{
> > > > +	struct vm_struct *area, *tmp;
> > > > +	struct llist_node *head;
> > > > +
> > > > +	head = llist_del_all(&pending_vm_area_cleanup);
> > > > +	if (!head)
> > > > +		return;
> > > > +
> > > > +	llist_for_each_entry_safe(area, tmp, head, llnode) {
> > > > +		if (!area->pages)
> > > > +			free_vm_area(area);
> > > > +		else
> > > > +			vfree(area->addr);
> > > > +	}
> > > > +}
> > > > +
> > > > +/*
> > > > + * Helper for __vmalloc_area_node() to defer cleanup
> > > > + * of partially initialized vm_struct in error paths.
> > > > + */
> > > > +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
> > > > +static void defer_vm_area_cleanup(struct vm_struct *area)
> > > > +{
> > > > +	if (llist_add(&area->llnode, &pending_vm_area_cleanup))
> > > > +		schedule_work(&cleanup_vm_area);
> > > > +}
> > > 
> > > Wondering why here we need call schudule_work() when
> > > pending_vm_area_cleanup was empty before adding new entry. Shouldn't
> > > it be as below to schedule the job? Not sure if I miss anything.
> > > 
> > > 	if (!llist_add(&area->llnode, &pending_vm_area_cleanup))
> > > 		schedule_work(&cleanup_vm_area);
> > > 
> > > =====
> > > /**
> > >  * llist_add - add a new entry
> > >  * @new:        new entry to be added
> > >  * @head:       the head for your lock-less list
> > >  *
> > >  * Returns true if the list was empty prior to adding this entry.
> > >  */
> > > static inline bool llist_add(struct llist_node *new, struct llist_head *head)
> > > {
> > >         return llist_add_batch(new, new, head);
> > > }
> > > =====
> > > 
> > But then you will not schedule. If the list is empty, we add one element
> > llist_add() returns 1, but your condition expects 0.
> > 
> > How it works:
> > 
> > If someone keeps adding to the llist and it is not empty we should not
> > trigger a new work, because a current work is in flight(it will cover new comers),
> > i.e. it has been scheduled but it has not yet completed llist_del_all() on
> > the head.
> > 
> > Once it is done, a new comer will trigger a work again only if it sees NULL,
> > i.e. when the list is empty.
> 
> Fair enough. I thought it's a deferring work, in fact it's aiming to put the
> error handling in a workqueue, but not the current atomic context.
> Thanks for the explanation.
> 
You are welcome!

--
Uladzislau Rezki

Re: [PATCH 6/8] mm/vmalloc: Defer freeing partly initialized vm_struct

Posted by Michal Hocko 6 months ago

On Thu 07-08-25 09:58:08, Uladzislau Rezki wrote:
> __vmalloc_area_node() may call free_vmap_area() or vfree() on
> error paths, both of which can sleep. This becomes problematic
> if the function is invoked from an atomic context, such as when
> GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask.
> 
> To fix this, unify error paths and defer the cleanup of partly
> initialized vm_struct objects to a workqueue. This ensures that
> freeing happens in a process context and avoids invalid sleeps
> in atomic regions.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>

LGTM
Acked-by: Michal Hocko <mhocko@suse.com>
Thanks!

> ---
>  include/linux/vmalloc.h |  6 +++++-
>  mm/vmalloc.c            | 34 +++++++++++++++++++++++++++++++---
>  2 files changed, 36 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index fdc9aeb74a44..b1425fae8cbf 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -50,7 +50,11 @@ struct iov_iter;		/* in uio.h */
>  #endif
>  
>  struct vm_struct {
> -	struct vm_struct	*next;
> +	union {
> +		struct vm_struct *next;	  /* Early registration of vm_areas. */
> +		struct llist_node llnode; /* Asynchronous freeing on error paths. */
> +	};
> +
>  	void			*addr;
>  	unsigned long		size;
>  	unsigned long		flags;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 7f48a54ec108..2424f80d524a 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3680,6 +3680,35 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
>  	return nr_allocated;
>  }
>  
> +static LLIST_HEAD(pending_vm_area_cleanup);
> +static void cleanup_vm_area_work(struct work_struct *work)
> +{
> +	struct vm_struct *area, *tmp;
> +	struct llist_node *head;
> +
> +	head = llist_del_all(&pending_vm_area_cleanup);
> +	if (!head)
> +		return;
> +
> +	llist_for_each_entry_safe(area, tmp, head, llnode) {
> +		if (!area->pages)
> +			free_vm_area(area);
> +		else
> +			vfree(area->addr);
> +	}
> +}
> +
> +/*
> + * Helper for __vmalloc_area_node() to defer cleanup
> + * of partially initialized vm_struct in error paths.
> + */
> +static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
> +static void defer_vm_area_cleanup(struct vm_struct *area)
> +{
> +	if (llist_add(&area->llnode, &pending_vm_area_cleanup))
> +		schedule_work(&cleanup_vm_area);
> +}
> +
>  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  				 pgprot_t prot, unsigned int page_shift,
>  				 int node)
> @@ -3711,8 +3740,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  		warn_alloc(gfp_mask, NULL,
>  			"vmalloc error: size %lu, failed to allocated page array size %lu",
>  			nr_small_pages * PAGE_SIZE, array_size);
> -		free_vm_area(area);
> -		return NULL;
> +		goto fail;
>  	}
>  
>  	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
> @@ -3789,7 +3817,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	return area->addr;
>  
>  fail:
> -	vfree(area->addr);
> +	defer_vm_area_cleanup(area);
>  	return NULL;
>  }
>  
> -- 
> 2.39.5

-- 
Michal Hocko
SUSE Labs

Re: [PATCH 6/8] mm/vmalloc: Defer freeing partly initialized vm_struct

Posted by Uladzislau Rezki 6 months ago

On Thu, Aug 07, 2025 at 01:25:01PM +0200, Michal Hocko wrote:
> On Thu 07-08-25 09:58:08, Uladzislau Rezki wrote:
> > __vmalloc_area_node() may call free_vmap_area() or vfree() on
> > error paths, both of which can sleep. This becomes problematic
> > if the function is invoked from an atomic context, such as when
> > GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask.
> > 
> > To fix this, unify error paths and defer the cleanup of partly
> > initialized vm_struct objects to a workqueue. This ensures that
> > freeing happens in a process context and avoids invalid sleeps
> > in atomic regions.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> 
> LGTM
> Acked-by: Michal Hocko <mhocko@suse.com>
> Thanks!
> 
Thanks, applied Acked-by.

--
Uladzislau Rezki