[PATCH v2] vduse: avoid using __GFP_NOFAIL

Jason Wang posted 1 patch 1 year, 4 months ago
drivers/vdpa/vdpa_user/iova_domain.c | 19 +++++++++++--------
drivers/vdpa/vdpa_user/iova_domain.h |  1 +
2 files changed, 12 insertions(+), 8 deletions(-)
[PATCH v2] vduse: avoid using __GFP_NOFAIL
Posted by Jason Wang 1 year, 4 months ago
Barry said [1]:

"""
mm doesn't support non-blockable __GFP_NOFAIL allocation. Because
__GFP_NOFAIL without direct reclamation may just result in a busy
loop within non-sleepable contexts.

The current code will result in returning a NULL pointer but
not a busy-loop.

static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                                struct alloc_context *ac)
{
        ...
        /*
         * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
         * we always retry
         */
        if (gfp_mask & __GFP_NOFAIL) {
                /*
                 * All existing users of the __GFP_NOFAIL are blockable, so warn
                 * of any new users that actually require GFP_NOWAIT
                 */
                if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
                        goto fail;
                ...
        }
        ...
fail:
        warn_alloc(gfp_mask, ac->nodemask,
                        "page allocation failure: order:%u", order);
got_pg:
        return page;
}

We have two choices to address the issue:
1. busy-loop
2. BUG_ON

the below patch chose 2:
https://lore.kernel.org/linux-mm/20240731000155.109583-5-21cnbao@gmail.com/
""“

Unfortuantely, we do that under read lock. A possible way to fix that
is to move the pages allocation out of the lock into the caller, but
having to allocate a huge number of pages and auxiliary page array
seems to be problematic as well per Tetsuon [2]:

"""
You should implement proper error handling instead of using
__GFP_NOFAIL if count can become large.
"""

So I choose another way, which does not release kernel bounce pages
when user tries to register usersapce bounce pages. Then we don't need
to do allocation in the path which is not expected to be fail (e.g in
the release). We pay this for more memory usage as we don't release
kernel bounce pages but further optimizations could be done on top.

[1] https://lore.kernel.org/all/CACGkMEtcOJAA96SF9B8m-nZ1X04-XZr+nq8ZQ2saLnUdfOGOLg@mail.gmail.com/T/#m3caef86a66ea6318ef94f9976ddb3a0ccfe6fcf8
[2] https://lore.kernel.org/all/CACGkMEtcOJAA96SF9B8m-nZ1X04-XZr+nq8ZQ2saLnUdfOGOLg@mail.gmail.com/T/#m7ad10eaba48ade5abf2d572f24e185d9fb146480

Fixes: 6c77ed22880d ("vduse: Support using userspace pages as bounce buffer")
Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
Tested-by: Xie Yongji <xieyongji@bytedance.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes since V1:
- Tweak the commit log
- Assign map->user_bounce_page to NULL for safety
---
 drivers/vdpa/vdpa_user/iova_domain.c | 19 +++++++++++--------
 drivers/vdpa/vdpa_user/iova_domain.h |  1 +
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
index 791d38d6284c..58116f89d8da 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.c
+++ b/drivers/vdpa/vdpa_user/iova_domain.c
@@ -162,6 +162,7 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
 				enum dma_data_direction dir)
 {
 	struct vduse_bounce_map *map;
+	struct page *page;
 	unsigned int offset;
 	void *addr;
 	size_t sz;
@@ -178,7 +179,10 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
 			    map->orig_phys == INVALID_PHYS_ADDR))
 			return;
 
-		addr = kmap_local_page(map->bounce_page);
+		page = domain->user_bounce_pages ?
+		       map->user_bounce_page : map->bounce_page;
+
+		addr = kmap_local_page(page);
 		do_bounce(map->orig_phys + offset, addr + offset, sz, dir);
 		kunmap_local(addr);
 		size -= sz;
@@ -270,9 +274,8 @@ int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
 				memcpy_to_page(pages[i], 0,
 					       page_address(map->bounce_page),
 					       PAGE_SIZE);
-			__free_page(map->bounce_page);
 		}
-		map->bounce_page = pages[i];
+		map->user_bounce_page = pages[i];
 		get_page(pages[i]);
 	}
 	domain->user_bounce_pages = true;
@@ -297,17 +300,17 @@ void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain)
 		struct page *page = NULL;
 
 		map = &domain->bounce_maps[i];
-		if (WARN_ON(!map->bounce_page))
+		if (WARN_ON(!map->user_bounce_page))
 			continue;
 
 		/* Copy user page to kernel page if it's in use */
 		if (map->orig_phys != INVALID_PHYS_ADDR) {
-			page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL);
+			page = map->bounce_page;
 			memcpy_from_page(page_address(page),
-					 map->bounce_page, 0, PAGE_SIZE);
+					 map->user_bounce_page, 0, PAGE_SIZE);
 		}
-		put_page(map->bounce_page);
-		map->bounce_page = page;
+		put_page(map->user_bounce_page);
+		map->user_bounce_page = NULL;
 	}
 	domain->user_bounce_pages = false;
 out:
diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
index f92f22a7267d..7f3f0928ec78 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.h
+++ b/drivers/vdpa/vdpa_user/iova_domain.h
@@ -21,6 +21,7 @@
 
 struct vduse_bounce_map {
 	struct page *bounce_page;
+	struct page *user_bounce_page;
 	u64 orig_phys;
 };
 
-- 
2.31.1

Re: [PATCH v2] vduse: avoid using __GFP_NOFAIL
Posted by Barry Song 1 year, 4 months ago
On Thu, Aug 8, 2024 at 5:43 PM Jason Wang <jasowang@redhat.com> wrote:
>
> Barry said [1]:
>
> """
> mm doesn't support non-blockable __GFP_NOFAIL allocation. Because
> __GFP_NOFAIL without direct reclamation may just result in a busy
> loop within non-sleepable contexts.
>
> The current code will result in returning a NULL pointer but
> not a busy-loop.
>
> static inline struct page *
> __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>                                                 struct alloc_context *ac)
> {
>         ...
>         /*
>          * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
>          * we always retry
>          */
>         if (gfp_mask & __GFP_NOFAIL) {
>                 /*
>                  * All existing users of the __GFP_NOFAIL are blockable, so warn
>                  * of any new users that actually require GFP_NOWAIT
>                  */
>                 if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
>                         goto fail;
>                 ...
>         }
>         ...
> fail:
>         warn_alloc(gfp_mask, ac->nodemask,
>                         "page allocation failure: order:%u", order);
> got_pg:
>         return page;
> }
>
> We have two choices to address the issue:
> 1. busy-loop
> 2. BUG_ON
>
> the below patch chose 2:
> https://lore.kernel.org/linux-mm/20240731000155.109583-5-21cnbao@gmail.com/
> ""“
>
> Unfortuantely, we do that under read lock. A possible way to fix that
> is to move the pages allocation out of the lock into the caller, but
> having to allocate a huge number of pages and auxiliary page array
> seems to be problematic as well per Tetsuon [2]:
>
> """
> You should implement proper error handling instead of using
> __GFP_NOFAIL if count can become large.
> """
>
> So I choose another way, which does not release kernel bounce pages
> when user tries to register usersapce bounce pages. Then we don't need
> to do allocation in the path which is not expected to be fail (e.g in
> the release). We pay this for more memory usage as we don't release
> kernel bounce pages but further optimizations could be done on top.
>
> [1] https://lore.kernel.org/all/CACGkMEtcOJAA96SF9B8m-nZ1X04-XZr+nq8ZQ2saLnUdfOGOLg@mail.gmail.com/T/#m3caef86a66ea6318ef94f9976ddb3a0ccfe6fcf8
> [2] https://lore.kernel.org/all/CACGkMEtcOJAA96SF9B8m-nZ1X04-XZr+nq8ZQ2saLnUdfOGOLg@mail.gmail.com/T/#m7ad10eaba48ade5abf2d572f24e185d9fb146480
>
> Fixes: 6c77ed22880d ("vduse: Support using userspace pages as bounce buffer")
> Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
> Tested-by: Xie Yongji <xieyongji@bytedance.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Hi Jason,

Is this the final version to fix __GFP_NOFAIL issue, or will you have a new
version?
Do you prefer it to go through the mm tree or the drivers subsystem tree?
If it's the former, I can send a new version that includes this one in my
series[1].

[1] https://lore.kernel.org/linux-mm/20240731000155.109583-1-21cnbao@gmail.com/

> ---
> Changes since V1:
> - Tweak the commit log
> - Assign map->user_bounce_page to NULL for safety
> ---
>  drivers/vdpa/vdpa_user/iova_domain.c | 19 +++++++++++--------
>  drivers/vdpa/vdpa_user/iova_domain.h |  1 +
>  2 files changed, 12 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> index 791d38d6284c..58116f89d8da 100644
> --- a/drivers/vdpa/vdpa_user/iova_domain.c
> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> @@ -162,6 +162,7 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
>                                 enum dma_data_direction dir)
>  {
>         struct vduse_bounce_map *map;
> +       struct page *page;
>         unsigned int offset;
>         void *addr;
>         size_t sz;
> @@ -178,7 +179,10 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
>                             map->orig_phys == INVALID_PHYS_ADDR))
>                         return;
>
> -               addr = kmap_local_page(map->bounce_page);
> +               page = domain->user_bounce_pages ?
> +                      map->user_bounce_page : map->bounce_page;
> +
> +               addr = kmap_local_page(page);
>                 do_bounce(map->orig_phys + offset, addr + offset, sz, dir);
>                 kunmap_local(addr);
>                 size -= sz;
> @@ -270,9 +274,8 @@ int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
>                                 memcpy_to_page(pages[i], 0,
>                                                page_address(map->bounce_page),
>                                                PAGE_SIZE);
> -                       __free_page(map->bounce_page);
>                 }
> -               map->bounce_page = pages[i];
> +               map->user_bounce_page = pages[i];
>                 get_page(pages[i]);
>         }
>         domain->user_bounce_pages = true;
> @@ -297,17 +300,17 @@ void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain)
>                 struct page *page = NULL;
>
>                 map = &domain->bounce_maps[i];
> -               if (WARN_ON(!map->bounce_page))
> +               if (WARN_ON(!map->user_bounce_page))
>                         continue;
>
>                 /* Copy user page to kernel page if it's in use */
>                 if (map->orig_phys != INVALID_PHYS_ADDR) {
> -                       page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL);
> +                       page = map->bounce_page;
>                         memcpy_from_page(page_address(page),
> -                                        map->bounce_page, 0, PAGE_SIZE);
> +                                        map->user_bounce_page, 0, PAGE_SIZE);
>                 }
> -               put_page(map->bounce_page);
> -               map->bounce_page = page;
> +               put_page(map->user_bounce_page);
> +               map->user_bounce_page = NULL;
>         }
>         domain->user_bounce_pages = false;
>  out:
> diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
> index f92f22a7267d..7f3f0928ec78 100644
> --- a/drivers/vdpa/vdpa_user/iova_domain.h
> +++ b/drivers/vdpa/vdpa_user/iova_domain.h
> @@ -21,6 +21,7 @@
>
>  struct vduse_bounce_map {
>         struct page *bounce_page;
> +       struct page *user_bounce_page;
>         u64 orig_phys;
>  };
>
> --
> 2.31.1
>

Thanks
Barry
Re: [PATCH v2] vduse: avoid using __GFP_NOFAIL
Posted by Jason Wang 1 year, 4 months ago
On Fri, Aug 16, 2024 at 8:17 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Thu, Aug 8, 2024 at 5:43 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > Barry said [1]:
> >
> > """
> > mm doesn't support non-blockable __GFP_NOFAIL allocation. Because
> > __GFP_NOFAIL without direct reclamation may just result in a busy
> > loop within non-sleepable contexts.
> >
> > The current code will result in returning a NULL pointer but
> > not a busy-loop.
> >
> > static inline struct page *
> > __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> >                                                 struct alloc_context *ac)
> > {
> >         ...
> >         /*
> >          * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
> >          * we always retry
> >          */
> >         if (gfp_mask & __GFP_NOFAIL) {
> >                 /*
> >                  * All existing users of the __GFP_NOFAIL are blockable, so warn
> >                  * of any new users that actually require GFP_NOWAIT
> >                  */
> >                 if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
> >                         goto fail;
> >                 ...
> >         }
> >         ...
> > fail:
> >         warn_alloc(gfp_mask, ac->nodemask,
> >                         "page allocation failure: order:%u", order);
> > got_pg:
> >         return page;
> > }
> >
> > We have two choices to address the issue:
> > 1. busy-loop
> > 2. BUG_ON
> >
> > the below patch chose 2:
> > https://lore.kernel.org/linux-mm/20240731000155.109583-5-21cnbao@gmail.com/
> > ""“
> >
> > Unfortuantely, we do that under read lock. A possible way to fix that
> > is to move the pages allocation out of the lock into the caller, but
> > having to allocate a huge number of pages and auxiliary page array
> > seems to be problematic as well per Tetsuon [2]:
> >
> > """
> > You should implement proper error handling instead of using
> > __GFP_NOFAIL if count can become large.
> > """
> >
> > So I choose another way, which does not release kernel bounce pages
> > when user tries to register usersapce bounce pages. Then we don't need
> > to do allocation in the path which is not expected to be fail (e.g in
> > the release). We pay this for more memory usage as we don't release
> > kernel bounce pages but further optimizations could be done on top.
> >
> > [1] https://lore.kernel.org/all/CACGkMEtcOJAA96SF9B8m-nZ1X04-XZr+nq8ZQ2saLnUdfOGOLg@mail.gmail.com/T/#m3caef86a66ea6318ef94f9976ddb3a0ccfe6fcf8
> > [2] https://lore.kernel.org/all/CACGkMEtcOJAA96SF9B8m-nZ1X04-XZr+nq8ZQ2saLnUdfOGOLg@mail.gmail.com/T/#m7ad10eaba48ade5abf2d572f24e185d9fb146480
> >
> > Fixes: 6c77ed22880d ("vduse: Support using userspace pages as bounce buffer")
> > Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
> > Tested-by: Xie Yongji <xieyongji@bytedance.com>
> > Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> Hi Jason,
>
> Is this the final version to fix __GFP_NOFAIL issue, or will you have a new
> version?

I think so.

> Do you prefer it to go through the mm tree or the drivers subsystem tree?

Either is fine.

> If it's the former, I can send a new version that includes this one in my
> series[1].
>
> [1] https://lore.kernel.org/linux-mm/20240731000155.109583-1-21cnbao@gmail.com/

That's fine.

Thanks

>
> > ---
> > Changes since V1:
> > - Tweak the commit log
> > - Assign map->user_bounce_page to NULL for safety
> > ---
> >  drivers/vdpa/vdpa_user/iova_domain.c | 19 +++++++++++--------
> >  drivers/vdpa/vdpa_user/iova_domain.h |  1 +
> >  2 files changed, 12 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> > index 791d38d6284c..58116f89d8da 100644
> > --- a/drivers/vdpa/vdpa_user/iova_domain.c
> > +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> > @@ -162,6 +162,7 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
> >                                 enum dma_data_direction dir)
> >  {
> >         struct vduse_bounce_map *map;
> > +       struct page *page;
> >         unsigned int offset;
> >         void *addr;
> >         size_t sz;
> > @@ -178,7 +179,10 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
> >                             map->orig_phys == INVALID_PHYS_ADDR))
> >                         return;
> >
> > -               addr = kmap_local_page(map->bounce_page);
> > +               page = domain->user_bounce_pages ?
> > +                      map->user_bounce_page : map->bounce_page;
> > +
> > +               addr = kmap_local_page(page);
> >                 do_bounce(map->orig_phys + offset, addr + offset, sz, dir);
> >                 kunmap_local(addr);
> >                 size -= sz;
> > @@ -270,9 +274,8 @@ int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
> >                                 memcpy_to_page(pages[i], 0,
> >                                                page_address(map->bounce_page),
> >                                                PAGE_SIZE);
> > -                       __free_page(map->bounce_page);
> >                 }
> > -               map->bounce_page = pages[i];
> > +               map->user_bounce_page = pages[i];
> >                 get_page(pages[i]);
> >         }
> >         domain->user_bounce_pages = true;
> > @@ -297,17 +300,17 @@ void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain)
> >                 struct page *page = NULL;
> >
> >                 map = &domain->bounce_maps[i];
> > -               if (WARN_ON(!map->bounce_page))
> > +               if (WARN_ON(!map->user_bounce_page))
> >                         continue;
> >
> >                 /* Copy user page to kernel page if it's in use */
> >                 if (map->orig_phys != INVALID_PHYS_ADDR) {
> > -                       page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL);
> > +                       page = map->bounce_page;
> >                         memcpy_from_page(page_address(page),
> > -                                        map->bounce_page, 0, PAGE_SIZE);
> > +                                        map->user_bounce_page, 0, PAGE_SIZE);
> >                 }
> > -               put_page(map->bounce_page);
> > -               map->bounce_page = page;
> > +               put_page(map->user_bounce_page);
> > +               map->user_bounce_page = NULL;
> >         }
> >         domain->user_bounce_pages = false;
> >  out:
> > diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
> > index f92f22a7267d..7f3f0928ec78 100644
> > --- a/drivers/vdpa/vdpa_user/iova_domain.h
> > +++ b/drivers/vdpa/vdpa_user/iova_domain.h
> > @@ -21,6 +21,7 @@
> >
> >  struct vduse_bounce_map {
> >         struct page *bounce_page;
> > +       struct page *user_bounce_page;
> >         u64 orig_phys;
> >  };
> >
> > --
> > 2.31.1
> >
>
> Thanks
> Barry
>