[PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size

sheng.zhao@bytedance.com posted 1 patch 2 weeks, 3 days ago
drivers/vdpa/vdpa_user/iova_domain.c | 120 +++++++++++++++++----------
drivers/vdpa/vdpa_user/iova_domain.h |   5 ++
2 files changed, 83 insertions(+), 42 deletions(-)
[PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by sheng.zhao@bytedance.com 2 weeks, 3 days ago
From: Sheng Zhao <sheng.zhao@bytedance.com>

The allocation granularity of bounce pages is PAGE_SIZE. This may cause
even small IO requests to occupy an entire bounce page exclusively. This
kind of memory waste is more significant on arm64 with 64KB pages.

So, optimize it by using fixed 4KB bounce pages.

Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
---
 drivers/vdpa/vdpa_user/iova_domain.c | 120 +++++++++++++++++----------
 drivers/vdpa/vdpa_user/iova_domain.h |   5 ++
 2 files changed, 83 insertions(+), 42 deletions(-)

diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
index 58116f89d8da..768313c80b62 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.c
+++ b/drivers/vdpa/vdpa_user/iova_domain.c
@@ -103,19 +103,26 @@ void vduse_domain_clear_map(struct vduse_iova_domain *domain,
 static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
 					 u64 iova, u64 size, u64 paddr)
 {
-	struct vduse_bounce_map *map;
+	struct vduse_bounce_map *map, *head_map;
+	struct page *tmp_page;
 	u64 last = iova + size - 1;
 
 	while (iova <= last) {
-		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+		map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
 		if (!map->bounce_page) {
-			map->bounce_page = alloc_page(GFP_ATOMIC);
-			if (!map->bounce_page)
-				return -ENOMEM;
+			head_map = &domain->bounce_maps[(iova & PAGE_MASK) >> BOUNCE_PAGE_SHIFT];
+			if (!head_map->bounce_page) {
+				tmp_page = alloc_page(GFP_ATOMIC);
+				if (!tmp_page)
+					return -ENOMEM;
+				if (cmpxchg(&head_map->bounce_page, NULL, tmp_page))
+					__free_page(tmp_page);
+			}
+			map->bounce_page = head_map->bounce_page;
 		}
 		map->orig_phys = paddr;
-		paddr += PAGE_SIZE;
-		iova += PAGE_SIZE;
+		paddr += BOUNCE_PAGE_SIZE;
+		iova += BOUNCE_PAGE_SIZE;
 	}
 	return 0;
 }
@@ -127,12 +134,17 @@ static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain,
 	u64 last = iova + size - 1;
 
 	while (iova <= last) {
-		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+		map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
 		map->orig_phys = INVALID_PHYS_ADDR;
-		iova += PAGE_SIZE;
+		iova += BOUNCE_PAGE_SIZE;
 	}
 }
 
+static unsigned int offset_in_bounce_page(dma_addr_t addr)
+{
+	return (addr & ~BOUNCE_PAGE_MASK);
+}
+
 static void do_bounce(phys_addr_t orig, void *addr, size_t size,
 		      enum dma_data_direction dir)
 {
@@ -163,7 +175,7 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
 {
 	struct vduse_bounce_map *map;
 	struct page *page;
-	unsigned int offset;
+	unsigned int offset, head_offset;
 	void *addr;
 	size_t sz;
 
@@ -171,9 +183,10 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
 		return;
 
 	while (size) {
-		map = &domain->bounce_maps[iova >> PAGE_SHIFT];
-		offset = offset_in_page(iova);
-		sz = min_t(size_t, PAGE_SIZE - offset, size);
+		map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
+		head_offset = offset_in_page(iova);
+		offset = offset_in_bounce_page(iova);
+		sz = min_t(size_t, BOUNCE_PAGE_SIZE - offset, size);
 
 		if (WARN_ON(!map->bounce_page ||
 			    map->orig_phys == INVALID_PHYS_ADDR))
@@ -183,7 +196,7 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
 		       map->user_bounce_page : map->bounce_page;
 
 		addr = kmap_local_page(page);
-		do_bounce(map->orig_phys + offset, addr + offset, sz, dir);
+		do_bounce(map->orig_phys + offset, addr + head_offset, sz, dir);
 		kunmap_local(addr);
 		size -= sz;
 		iova += sz;
@@ -218,7 +231,7 @@ vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova)
 	struct page *page = NULL;
 
 	read_lock(&domain->bounce_lock);
-	map = &domain->bounce_maps[iova >> PAGE_SHIFT];
+	map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
 	if (domain->user_bounce_pages || !map->bounce_page)
 		goto out;
 
@@ -236,7 +249,7 @@ vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain)
 	struct vduse_bounce_map *map;
 	unsigned long pfn, bounce_pfns;
 
-	bounce_pfns = domain->bounce_size >> PAGE_SHIFT;
+	bounce_pfns = domain->bounce_size >> BOUNCE_PAGE_SHIFT;
 
 	for (pfn = 0; pfn < bounce_pfns; pfn++) {
 		map = &domain->bounce_maps[pfn];
@@ -246,7 +259,8 @@ vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain)
 		if (!map->bounce_page)
 			continue;
 
-		__free_page(map->bounce_page);
+		if (!((pfn << BOUNCE_PAGE_SHIFT) & ~PAGE_MASK))
+			__free_page(map->bounce_page);
 		map->bounce_page = NULL;
 	}
 }
@@ -254,8 +268,12 @@ vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain)
 int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
 				       struct page **pages, int count)
 {
-	struct vduse_bounce_map *map;
-	int i, ret;
+	struct vduse_bounce_map *map, *head_map;
+	int i, j, ret;
+	int inner_pages = PAGE_SIZE / BOUNCE_PAGE_SIZE;
+	int bounce_pfns = domain->bounce_size >> BOUNCE_PAGE_SHIFT;
+	struct page *head_page = NULL;
+	bool need_copy;
 
 	/* Now we don't support partial mapping */
 	if (count != (domain->bounce_size >> PAGE_SHIFT))
@@ -267,16 +285,23 @@ int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
 		goto out;
 
 	for (i = 0; i < count; i++) {
-		map = &domain->bounce_maps[i];
-		if (map->bounce_page) {
+		need_copy = false;
+		head_map = &domain->bounce_maps[(i * inner_pages)];
+		head_page = head_map->bounce_page;
+		for (j = 0; j < inner_pages; j++) {
+			if ((i * inner_pages + j) >= bounce_pfns)
+				break;
+			map = &domain->bounce_maps[(i * inner_pages + j)];
 			/* Copy kernel page to user page if it's in use */
-			if (map->orig_phys != INVALID_PHYS_ADDR)
-				memcpy_to_page(pages[i], 0,
-					       page_address(map->bounce_page),
-					       PAGE_SIZE);
+			if ((head_page) && (map->orig_phys != INVALID_PHYS_ADDR))
+				need_copy = true;
+			map->user_bounce_page = pages[i];
 		}
-		map->user_bounce_page = pages[i];
 		get_page(pages[i]);
+		if ((head_page) && (need_copy))
+			memcpy_to_page(pages[i], 0,
+				       page_address(head_page),
+				       PAGE_SIZE);
 	}
 	domain->user_bounce_pages = true;
 	ret = 0;
@@ -288,8 +313,12 @@ int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
 
 void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain)
 {
-	struct vduse_bounce_map *map;
-	unsigned long i, count;
+	struct vduse_bounce_map *map, *head_map;
+	unsigned long i, j, count;
+	int inner_pages = PAGE_SIZE / BOUNCE_PAGE_SIZE;
+	int bounce_pfns = domain->bounce_size >> BOUNCE_PAGE_SHIFT;
+	struct page *head_page = NULL;
+	bool need_copy;
 
 	write_lock(&domain->bounce_lock);
 	if (!domain->user_bounce_pages)
@@ -297,20 +326,27 @@ void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain)
 
 	count = domain->bounce_size >> PAGE_SHIFT;
 	for (i = 0; i < count; i++) {
-		struct page *page = NULL;
-
-		map = &domain->bounce_maps[i];
-		if (WARN_ON(!map->user_bounce_page))
+		need_copy = false;
+		head_map = &domain->bounce_maps[(i * inner_pages)];
+		if (WARN_ON(!head_map->user_bounce_page))
 			continue;
-
-		/* Copy user page to kernel page if it's in use */
-		if (map->orig_phys != INVALID_PHYS_ADDR) {
-			page = map->bounce_page;
-			memcpy_from_page(page_address(page),
-					 map->user_bounce_page, 0, PAGE_SIZE);
+		head_page = head_map->user_bounce_page;
+
+		for (j = 0; j < inner_pages; j++) {
+			if ((i * inner_pages + j) >= bounce_pfns)
+				break;
+			map = &domain->bounce_maps[(i * inner_pages + j)];
+			if (WARN_ON(!map->user_bounce_page))
+				continue;
+			/* Copy user page to kernel page if it's in use */
+			if ((map->orig_phys != INVALID_PHYS_ADDR) && (head_map->bounce_page))
+				need_copy = true;
+			map->user_bounce_page = NULL;
 		}
-		put_page(map->user_bounce_page);
-		map->user_bounce_page = NULL;
+		if (need_copy)
+			memcpy_from_page(page_address(head_map->bounce_page),
+					 head_page, 0, PAGE_SIZE);
+		put_page(head_page);
 	}
 	domain->user_bounce_pages = false;
 out:
@@ -581,7 +617,7 @@ vduse_domain_create(unsigned long iova_limit, size_t bounce_size)
 	unsigned long pfn, bounce_pfns;
 	int ret;
 
-	bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT;
+	bounce_pfns = PAGE_ALIGN(bounce_size) >> BOUNCE_PAGE_SHIFT;
 	if (iova_limit <= bounce_size)
 		return NULL;
 
@@ -613,7 +649,7 @@ vduse_domain_create(unsigned long iova_limit, size_t bounce_size)
 	rwlock_init(&domain->bounce_lock);
 	spin_lock_init(&domain->iotlb_lock);
 	init_iova_domain(&domain->stream_iovad,
-			PAGE_SIZE, IOVA_START_PFN);
+			BOUNCE_PAGE_SIZE, IOVA_START_PFN);
 	ret = iova_domain_init_rcaches(&domain->stream_iovad);
 	if (ret)
 		goto err_iovad_stream;
diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h
index 7f3f0928ec78..23139a2eaf5c 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.h
+++ b/drivers/vdpa/vdpa_user/iova_domain.h
@@ -19,6 +19,11 @@
 
 #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
 
+#define BOUNCE_PAGE_SHIFT	12
+#define BOUNCE_PAGE_SIZE	(1 << BOUNCE_PAGE_SHIFT)
+#define BOUNCE_PAGE_MASK	(~(BOUNCE_PAGE_SIZE - 1))
+#define BOUNCE_PAGE_ALIGN(addr)	(((addr) + BOUNCE_PAGE_SIZE - 1) & ~(BOUNCE_PAGE_SIZE - 1))
+
 struct vduse_bounce_map {
 	struct page *bounce_page;
 	struct page *user_bounce_page;
-- 
2.20.1
Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Jason Wang 2 weeks, 1 day ago
On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
>
> From: Sheng Zhao <sheng.zhao@bytedance.com>
>
> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
> even small IO requests to occupy an entire bounce page exclusively. The
> kind of memory waste will be more significant on arm64 with 64KB pages.

Let's tweak the title as there are archs that are using non 4KB pages
other than arm.

>
> So, optimize it by using fixed 4KB bounce pages.
>
> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
> ---
>  drivers/vdpa/vdpa_user/iova_domain.c | 120 +++++++++++++++++----------
>  drivers/vdpa/vdpa_user/iova_domain.h |   5 ++
>  2 files changed, 83 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> index 58116f89d8da..768313c80b62 100644
> --- a/drivers/vdpa/vdpa_user/iova_domain.c
> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> @@ -103,19 +103,26 @@ void vduse_domain_clear_map(struct vduse_iova_domain *domain,
>  static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
>                                          u64 iova, u64 size, u64 paddr)
>  {
> -       struct vduse_bounce_map *map;
> +       struct vduse_bounce_map *map, *head_map;
> +       struct page *tmp_page;
>         u64 last = iova + size - 1;
>
>         while (iova <= last) {
> -               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> +               map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];

BOUNCE_PAGE_SIZE is kind of confusing as it's not the size of any page
at all when PAGE_SIZE is not 4K.

>                 if (!map->bounce_page) {
> -                       map->bounce_page = alloc_page(GFP_ATOMIC);
> -                       if (!map->bounce_page)
> -                               return -ENOMEM;
> +                       head_map = &domain->bounce_maps[(iova & PAGE_MASK) >> BOUNCE_PAGE_SHIFT];
> +                       if (!head_map->bounce_page) {
> +                               tmp_page = alloc_page(GFP_ATOMIC);
> +                               if (!tmp_page)
> +                                       return -ENOMEM;
> +                               if (cmpxchg(&head_map->bounce_page, NULL, tmp_page))
> +                                       __free_page(tmp_page);

I don't understand why we need cmpxchg() logic.

Btw, it looks like you want to make multiple bounce_map to point to
the same 64KB page? I wonder what's the advantages of doing this. Can
we simply keep the 64KB page in bounce_map?

Thanks
Re: Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Sheng Zhao 1 week, 2 days ago

On 2025/9/17 16:16, Jason Wang wrote:
> On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
>>
>> From: Sheng Zhao <sheng.zhao@bytedance.com>
>>
>> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
>> even small IO requests to occupy an entire bounce page exclusively. The
>> kind of memory waste will be more significant on arm64 with 64KB pages.
> 
> Let's tweak the title as there are archs that are using non 4KB pages
> other than arm.
> 

Got it. I will modify this in v2.

>>
>> So, optimize it by using fixed 4KB bounce pages.
>>
>> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
>> ---
>>   drivers/vdpa/vdpa_user/iova_domain.c | 120 +++++++++++++++++----------
>>   drivers/vdpa/vdpa_user/iova_domain.h |   5 ++
>>   2 files changed, 83 insertions(+), 42 deletions(-)
>>
>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
>> index 58116f89d8da..768313c80b62 100644
>> --- a/drivers/vdpa/vdpa_user/iova_domain.c
>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
>> @@ -103,19 +103,26 @@ void vduse_domain_clear_map(struct vduse_iova_domain *domain,
>>   static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
>>                                           u64 iova, u64 size, u64 paddr)
>>   {
>> -       struct vduse_bounce_map *map;
>> +       struct vduse_bounce_map *map, *head_map;
>> +       struct page *tmp_page;
>>          u64 last = iova + size - 1;
>>
>>          while (iova <= last) {
>> -               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>> +               map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
> 
> BOUNCE_PAGE_SIZE is kind of confusing as it's not the size of any page
> at all when PAGE_SIZE is not 4K.
> 

How about BOUNCE_MAP_SIZE?

>>                  if (!map->bounce_page) {
>> -                       map->bounce_page = alloc_page(GFP_ATOMIC);
>> -                       if (!map->bounce_page)
>> -                               return -ENOMEM;
>> +                       head_map = &domain->bounce_maps[(iova & PAGE_MASK) >> BOUNCE_PAGE_SHIFT];
>> +                       if (!head_map->bounce_page) {
>> +                               tmp_page = alloc_page(GFP_ATOMIC);
>> +                               if (!tmp_page)
>> +                                       return -ENOMEM;
>> +                               if (cmpxchg(&head_map->bounce_page, NULL, tmp_page))
>> +                                       __free_page(tmp_page);
> 
> I don't understand why we need cmpxchg() logic.
> 
> Btw, it looks like you want to make multiple bounce_map to point to
> the same 64KB page? I wonder what's the advantages of doing this. Can
> we simply keep the 64KB page in bounce_map?
> 
> Thanks
> 

That's correct. We use fixed 4KB-sized bounce pages, and there will be a 
many-to-one relationship between these 4KB bounce pages and the 64KB 
memory pages.

Bounce pages are allocated on demand. As a result, it may occur that 
multiple bounce pages corresponding to the same 64KB memory page attempt 
to allocate memory simultaneously, so we use cmpxchg to handle this 
concurrency.

In the current implementation, the bounce_map structure requires no 
modification. However, if we keep the 64KB page in a single bounce_map 
while still implementing similar logic, we would need an 
additional array to store multiple orig_phys values in order to 
accommodate the many-to-one relationship.

Thanks

Re: Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Jason Wang 1 week, 1 day ago
On Tue, Sep 23, 2025 at 8:37 PM Sheng Zhao <sheng.zhao@bytedance.com> wrote:
>
>
>
> On 2025/9/17 16:16, Jason Wang wrote:
> > On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
> >>
> >> From: Sheng Zhao <sheng.zhao@bytedance.com>
> >>
> >> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
> >> even small IO requests to occupy an entire bounce page exclusively. The
> >> kind of memory waste will be more significant on arm64 with 64KB pages.
> >
> > Let's tweak the title as there are archs that are using non 4KB pages
> > other than arm.
> >
>
> Got it. I will modify this in v2.
>
> >>
> >> So, optimize it by using fixed 4KB bounce pages.
> >>
> >> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
> >> ---
> >>   drivers/vdpa/vdpa_user/iova_domain.c | 120 +++++++++++++++++----------
> >>   drivers/vdpa/vdpa_user/iova_domain.h |   5 ++
> >>   2 files changed, 83 insertions(+), 42 deletions(-)
> >>
> >> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> >> index 58116f89d8da..768313c80b62 100644
> >> --- a/drivers/vdpa/vdpa_user/iova_domain.c
> >> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> >> @@ -103,19 +103,26 @@ void vduse_domain_clear_map(struct vduse_iova_domain *domain,
> >>   static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
> >>                                           u64 iova, u64 size, u64 paddr)
> >>   {
> >> -       struct vduse_bounce_map *map;
> >> +       struct vduse_bounce_map *map, *head_map;
> >> +       struct page *tmp_page;
> >>          u64 last = iova + size - 1;
> >>
> >>          while (iova <= last) {
> >> -               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >> +               map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
> >
> > BOUNCE_PAGE_SIZE is kind of confusing as it's not the size of any page
> > at all when PAGE_SIZE is not 4K.
> >
>
> How about BOUNCE_MAP_SIZE?

Fine with me.

>
> >>                  if (!map->bounce_page) {
> >> -                       map->bounce_page = alloc_page(GFP_ATOMIC);
> >> -                       if (!map->bounce_page)
> >> -                               return -ENOMEM;
> >> +                       head_map = &domain->bounce_maps[(iova & PAGE_MASK) >> BOUNCE_PAGE_SHIFT];
> >> +                       if (!head_map->bounce_page) {
> >> +                               tmp_page = alloc_page(GFP_ATOMIC);
> >> +                               if (!tmp_page)
> >> +                                       return -ENOMEM;
> >> +                               if (cmpxchg(&head_map->bounce_page, NULL, tmp_page))
> >> +                                       __free_page(tmp_page);
> >
> > I don't understand why we need cmpxchg() logic.
> >
> > Btw, it looks like you want to make multiple bounce_map to point to
> > the same 64KB page? I wonder what's the advantages of doing this. Can
> > we simply keep the 64KB page in bounce_map?
> >
> > Thanks
> >
>
> That's correct. We use fixed 4KB-sized bounce pages, and there will be a
> many-to-one relationship between these 4KB bounce pages and the 64KB
> memory pages.
>
> Bounce pages are allocated on demand. As a result, it may occur that
> multiple bounce pages corresponding to the same 64KB memory page attempt
> to allocate memory simultaneously, so we use cmpxchg to handle this
> concurrency.
>
> In the current implementation, the bounce_map structure requires no
> modification. However, if we keep the 64KB page into a single bounce_map
> while still wanting to implement a similar logic, we may need an
> additional array to store multiple orig_phys values in order to
> accommodate the many-to-one relationship.

Or simply having a bitmap is sufficient per bounce_map?

Thanks

>
> Thanks
>
Re: Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Sheng Zhao 1 week, 1 day ago

On 2025/9/24 08:57, Jason Wang wrote:
> On Tue, Sep 23, 2025 at 8:37 PM Sheng Zhao <sheng.zhao@bytedance.com> wrote:
>>
>>
>>
>> On 2025/9/17 16:16, Jason Wang wrote:
>>> On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
>>>>
>>>> From: Sheng Zhao <sheng.zhao@bytedance.com>
>>>>
>>>> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
>>>> even small IO requests to occupy an entire bounce page exclusively. The
>>>> kind of memory waste will be more significant on arm64 with 64KB pages.
>>>
>>> Let's tweak the title as there are archs that are using non 4KB pages
>>> other than arm.
>>>
>>
>> Got it. I will modify this in v2.
>>
>>>>
>>>> So, optimize it by using fixed 4KB bounce pages.
>>>>
>>>> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
>>>> ---
>>>>    drivers/vdpa/vdpa_user/iova_domain.c | 120 +++++++++++++++++----------
>>>>    drivers/vdpa/vdpa_user/iova_domain.h |   5 ++
>>>>    2 files changed, 83 insertions(+), 42 deletions(-)
>>>>
>>>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
>>>> index 58116f89d8da..768313c80b62 100644
>>>> --- a/drivers/vdpa/vdpa_user/iova_domain.c
>>>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
>>>> @@ -103,19 +103,26 @@ void vduse_domain_clear_map(struct vduse_iova_domain *domain,
>>>>    static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
>>>>                                            u64 iova, u64 size, u64 paddr)
>>>>    {
>>>> -       struct vduse_bounce_map *map;
>>>> +       struct vduse_bounce_map *map, *head_map;
>>>> +       struct page *tmp_page;
>>>>           u64 last = iova + size - 1;
>>>>
>>>>           while (iova <= last) {
>>>> -               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>> +               map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
>>>
>>> BOUNCE_PAGE_SIZE is kind of confusing as it's not the size of any page
>>> at all when PAGE_SIZE is not 4K.
>>>
>>
>> How about BOUNCE_MAP_SIZE?
> 
> Fine with me.
> 
>>
>>>>                   if (!map->bounce_page) {
>>>> -                       map->bounce_page = alloc_page(GFP_ATOMIC);
>>>> -                       if (!map->bounce_page)
>>>> -                               return -ENOMEM;
>>>> +                       head_map = &domain->bounce_maps[(iova & PAGE_MASK) >> BOUNCE_PAGE_SHIFT];
>>>> +                       if (!head_map->bounce_page) {
>>>> +                               tmp_page = alloc_page(GFP_ATOMIC);
>>>> +                               if (!tmp_page)
>>>> +                                       return -ENOMEM;
>>>> +                               if (cmpxchg(&head_map->bounce_page, NULL, tmp_page))
>>>> +                                       __free_page(tmp_page);
>>>
>>> I don't understand why we need cmpxchg() logic.
>>>
>>> Btw, it looks like you want to make multiple bounce_map to point to
>>> the same 64KB page? I wonder what's the advantages of doing this. Can
>>> we simply keep the 64KB page in bounce_map?
>>>
>>> Thanks
>>>
>>
>> That's correct. We use fixed 4KB-sized bounce pages, and there will be a
>> many-to-one relationship between these 4KB bounce pages and the 64KB
>> memory pages.
>>
>> Bounce pages are allocated on demand. As a result, it may occur that
>> multiple bounce pages corresponding to the same 64KB memory page attempt
>> to allocate memory simultaneously, so we use cmpxchg to handle this
>> concurrency.
>>
>> In the current implementation, the bounce_map structure requires no
>> modification. However, if we keep the 64KB page into a single bounce_map
>> while still wanting to implement a similar logic, we may need an
>> additional array to store multiple orig_phys values in order to
>> accommodate the many-to-one relationship.
> 
> Or simply having a bitmap is sufficient per bounce_map?
> 

Yes, using a bitmap can mark the usage status of each 4KB, but it may 
not simplify things overall.

- we will inevitably need to add an additional array per bounce_map to 
store the orig_phys corresponding to each 4KB for subsequent copying 
(vduse_domain_bounce).

- compared to the current commit, this modification would only be a 
structural change and would not reduce the amount of changes to the code 
logic. For instance, cmpxchg is still required.


Thanks

> Thanks
> 
>>
>> Thanks
>>
> 

Re: Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Jason Wang 1 week, 1 day ago
On Wed, Sep 24, 2025 at 12:05 PM Sheng Zhao <sheng.zhao@bytedance.com> wrote:
>
>
>
> On 2025/9/24 08:57, Jason Wang wrote:
> > On Tue, Sep 23, 2025 at 8:37 PM Sheng Zhao <sheng.zhao@bytedance.com> wrote:
> >>
> >>
> >>
> >> On 2025/9/17 16:16, Jason Wang wrote:
> >>> On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
> >>>>
> >>>> From: Sheng Zhao <sheng.zhao@bytedance.com>
> >>>>
> >>>> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
> >>>> even small IO requests to occupy an entire bounce page exclusively. The
> >>>> kind of memory waste will be more significant on arm64 with 64KB pages.
> >>>
> >>> Let's tweak the title as there are archs that are using non 4KB pages
> >>> other than arm.
> >>>
> >>
> >> Got it. I will modify this in v2.
> >>
> >>>>
> >>>> So, optimize it by using fixed 4KB bounce pages.
> >>>>
> >>>> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
> >>>> ---
> >>>>    drivers/vdpa/vdpa_user/iova_domain.c | 120 +++++++++++++++++----------
> >>>>    drivers/vdpa/vdpa_user/iova_domain.h |   5 ++
> >>>>    2 files changed, 83 insertions(+), 42 deletions(-)
> >>>>
> >>>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
> >>>> index 58116f89d8da..768313c80b62 100644
> >>>> --- a/drivers/vdpa/vdpa_user/iova_domain.c
> >>>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
> >>>> @@ -103,19 +103,26 @@ void vduse_domain_clear_map(struct vduse_iova_domain *domain,
> >>>>    static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
> >>>>                                            u64 iova, u64 size, u64 paddr)
> >>>>    {
> >>>> -       struct vduse_bounce_map *map;
> >>>> +       struct vduse_bounce_map *map, *head_map;
> >>>> +       struct page *tmp_page;
> >>>>           u64 last = iova + size - 1;
> >>>>
> >>>>           while (iova <= last) {
> >>>> -               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
> >>>> +               map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
> >>>
> >>> BOUNCE_PAGE_SIZE is kind of confusing as it's not the size of any page
> >>> at all when PAGE_SIZE is not 4K.
> >>>
> >>
> >> How about BOUNCE_MAP_SIZE?
> >
> > Fine with me.
> >
> >>
> >>>>                   if (!map->bounce_page) {
> >>>> -                       map->bounce_page = alloc_page(GFP_ATOMIC);
> >>>> -                       if (!map->bounce_page)
> >>>> -                               return -ENOMEM;
> >>>> +                       head_map = &domain->bounce_maps[(iova & PAGE_MASK) >> BOUNCE_PAGE_SHIFT];
> >>>> +                       if (!head_map->bounce_page) {
> >>>> +                               tmp_page = alloc_page(GFP_ATOMIC);
> >>>> +                               if (!tmp_page)
> >>>> +                                       return -ENOMEM;
> >>>> +                               if (cmpxchg(&head_map->bounce_page, NULL, tmp_page))
> >>>> +                                       __free_page(tmp_page);
> >>>
> >>> I don't understand why we need cmpxchg() logic.
> >>>
> >>> Btw, it looks like you want to make multiple bounce_map to point to
> >>> the same 64KB page? I wonder what's the advantages of doing this. Can
> >>> we simply keep the 64KB page in bounce_map?
> >>>
> >>> Thanks
> >>>
> >>
> >> That's correct. We use fixed 4KB-sized bounce pages, and there will be a
> >> many-to-one relationship between these 4KB bounce pages and the 64KB
> >> memory pages.
> >>
> >> Bounce pages are allocated on demand. As a result, it may occur that
> >> multiple bounce pages corresponding to the same 64KB memory page attempt
> >> to allocate memory simultaneously, so we use cmpxchg to handle this
> >> concurrency.
> >>
> >> In the current implementation, the bounce_map structure requires no
> >> modification. However, if we keep the 64KB page into a single bounce_map
> >> while still wanting to implement a similar logic, we may need an
> >> additional array to store multiple orig_phys values in order to
> >> accommodate the many-to-one relationship.
> >
> > Or simply having a bitmap is sufficient per bounce_map?
> >
>
> Yes, using a bitmap can mark the usage status of each 4KB, but it may
> not simplify things overall.
>
> - we will inevitably need to add an additional array per bounce_map to
> store the orig_phys corresponding to each 4KB for subsequent copying
> (vduse_domain_bounce).

I may miss something, the PAGE_SIZE is 64KB in this case, why do we
need to store per 4KB orig_phys?

>
> - compared to the current commit, this modification may only be a
> structural change and fail to reduce the amount of changes to the code
> logic. For instance, cmpxchg is still required.

Thanks

>
>
> Thanks
>
> > Thanks
> >
> >>
> >> Thanks
> >>
> >
>
Re: Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Sheng Zhao 1 week, 1 day ago

On 2025/9/24 12:15, Jason Wang wrote:
> On Wed, Sep 24, 2025 at 12:05 PM Sheng Zhao <sheng.zhao@bytedance.com> wrote:
>>
>>
>>
>> On 2025/9/24 08:57, Jason Wang wrote:
>>> On Tue, Sep 23, 2025 at 8:37 PM Sheng Zhao <sheng.zhao@bytedance.com> wrote:
>>>>
>>>>
>>>>
>>>> On 2025/9/17 16:16, Jason Wang wrote:
>>>>> On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
>>>>>>
>>>>>> From: Sheng Zhao <sheng.zhao@bytedance.com>
>>>>>>
>>>>>> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
>>>>>> even small IO requests to occupy an entire bounce page exclusively. The
>>>>>> kind of memory waste will be more significant on arm64 with 64KB pages.
>>>>>
>>>>> Let's tweak the title as there are archs that are using non 4KB pages
>>>>> other than arm.
>>>>>
>>>>
>>>> Got it. I will modify this in v2.
>>>>
>>>>>>
>>>>>> So, optimize it by using fixed 4KB bounce pages.
>>>>>>
>>>>>> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
>>>>>> ---
>>>>>>     drivers/vdpa/vdpa_user/iova_domain.c | 120 +++++++++++++++++----------
>>>>>>     drivers/vdpa/vdpa_user/iova_domain.h |   5 ++
>>>>>>     2 files changed, 83 insertions(+), 42 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
>>>>>> index 58116f89d8da..768313c80b62 100644
>>>>>> --- a/drivers/vdpa/vdpa_user/iova_domain.c
>>>>>> +++ b/drivers/vdpa/vdpa_user/iova_domain.c
>>>>>> @@ -103,19 +103,26 @@ void vduse_domain_clear_map(struct vduse_iova_domain *domain,
>>>>>>     static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain,
>>>>>>                                             u64 iova, u64 size, u64 paddr)
>>>>>>     {
>>>>>> -       struct vduse_bounce_map *map;
>>>>>> +       struct vduse_bounce_map *map, *head_map;
>>>>>> +       struct page *tmp_page;
>>>>>>            u64 last = iova + size - 1;
>>>>>>
>>>>>>            while (iova <= last) {
>>>>>> -               map = &domain->bounce_maps[iova >> PAGE_SHIFT];
>>>>>> +               map = &domain->bounce_maps[iova >> BOUNCE_PAGE_SHIFT];
>>>>>
>>>>> BOUNCE_PAGE_SIZE is kind of confusing as it's not the size of any page
>>>>> at all when PAGE_SIZE is not 4K.
>>>>>
>>>>
>>>> How about BOUNCE_MAP_SIZE?
>>>
>>> Fine with me.
>>>
>>>>
>>>>>>                    if (!map->bounce_page) {
>>>>>> -                       map->bounce_page = alloc_page(GFP_ATOMIC);
>>>>>> -                       if (!map->bounce_page)
>>>>>> -                               return -ENOMEM;
>>>>>> +                       head_map = &domain->bounce_maps[(iova & PAGE_MASK) >> BOUNCE_PAGE_SHIFT];
>>>>>> +                       if (!head_map->bounce_page) {
>>>>>> +                               tmp_page = alloc_page(GFP_ATOMIC);
>>>>>> +                               if (!tmp_page)
>>>>>> +                                       return -ENOMEM;
>>>>>> +                               if (cmpxchg(&head_map->bounce_page, NULL, tmp_page))
>>>>>> +                                       __free_page(tmp_page);
>>>>>
>>>>> I don't understand why we need cmpxchg() logic.
>>>>>
>>>>> Btw, it looks like you want to make multiple bounce_map to point to
>>>>> the same 64KB page? I wonder what's the advantages of doing this. Can
>>>>> we simply keep the 64KB page in bounce_map?
>>>>>
>>>>> Thanks
>>>>>
>>>>
>>>> That's correct. We use fixed 4KB-sized bounce pages, and there will be a
>>>> many-to-one relationship between these 4KB bounce pages and the 64KB
>>>> memory pages.
>>>>
>>>> Bounce pages are allocated on demand. As a result, it may occur that
>>>> multiple bounce pages corresponding to the same 64KB memory page attempt
>>>> to allocate memory simultaneously, so we use cmpxchg to handle this
>>>> concurrency.
>>>>
>>>> In the current implementation, the bounce_map structure requires no
>>>> modification. However, if we keep the 64KB page into a single bounce_map
>>>> while still wanting to implement a similar logic, we may need an
>>>> additional array to store multiple orig_phys values in order to
>>>> accommodate the many-to-one relationship.
>>>
>>> Or simply having a bitmap is sufficient per bounce_map?
>>>
>>
>> Yes, using a bitmap can mark the usage status of each 4KB, but it may
>> not simplify things overall.
>>
>> - we will inevitably need to add an additional array per bounce_map to
>> store the orig_phys corresponding to each 4KB for subsequent copying
>> (vduse_domain_bounce).
> 
> I may miss something, the PAGE_SIZE is 64KB in this case, why do we
> need to store per 4KB orig_phys?
> 

Since each orig_phys originates from a single IO request, if we want the 
minimum size of bounce pages occupied by an IO request to be 4KB instead 
of 64KB, we need to store the respective orig_phys value for each 4KB 
region corresponding to the IO request.

In other words, we may not be able to guarantee that the orig_phys 
values of all IO requests within the same 64KB memory page are 
contiguous, so we need to store them separately.

Thanks
>>
>> - compared to the current commit, this modification may only be a
>> structural change and fail to reduce the amount of changes to the code
>> logic. For instance, cmpxchg is still required.
> 
> Thanks
> 
>>
>>
>> Thanks
>>
>>> Thanks
>>>
>>>>
>>>> Thanks
>>>>
>>>
>>
> 

Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Jason Wang 2 weeks, 3 days ago
On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
>
> From: Sheng Zhao <sheng.zhao@bytedance.com>
>
> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
> even small IO requests to occupy an entire bounce page exclusively.

This sounds more like an issue of the IOVA allocating that use the
wrong granular?

> The
> kind of memory waste will be more significant on arm64 with 64KB pages.
>
> So, optimize it by using fixed 4KB bounce pages.
>
> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>

Thanks
Re: [External] Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Sheng Zhao 2 weeks, 3 days ago

在 2025/9/15 16:21, Jason Wang 写道:
> On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
>>
>> From: Sheng Zhao <sheng.zhao@bytedance.com>
>>
>> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
>> even small IO requests to occupy an entire bounce page exclusively.
> 
> This sounds more like an issue of the IOVA allocating that use the
> wrong granular?
> 

Sorry, the previous email has a slight formatting issue.

The granularity of the IOVA allocator is customized during the 
initialization of the vduse domain, and this value is also modified in
this commit.

Thanks

>> The
>> kind of memory waste will be more significant on arm64 with 64KB pages.
>>
>> So, optimize it by using fixed 4KB bounce pages.
>>
>> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
> 
> Thanks
> 

Re: [External] Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Jason Wang 2 weeks, 2 days ago
On Mon, Sep 15, 2025 at 7:07 PM Sheng Zhao <sheng.zhao@bytedance.com> wrote:
>
>
>
> 在 2025/9/15 16:21, Jason Wang 写道:
> > On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
> >>
> >> From: Sheng Zhao <sheng.zhao@bytedance.com>
> >>
> >> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
> >> even small IO requests to occupy an entire bounce page exclusively.
> >
> > This sounds more like an issue of the IOVA allocating that use the
> > wrong granular?
> >
>
> Sorry, the previous email has a slight formatting issue.
>
> The granularity of the IOVA allocator is customized during the
> initialization of the vduse domain, and this value is also modified in
> this commit.

Ok, let's add this to the changelog.

Btw, do you have perf numbers to demonstrate the benefit?

Thanks

>
> Thanks
>
> >> The
> >> kind of memory waste will be more significant on arm64 with 64KB pages.
> >>
> >> So, optimize it by using fixed 4KB bounce pages.
> >>
> >> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
> >
> > Thanks
> >
>
Re: Re: [PATCH] vduse: Use fixed 4KB bounce pages for arm64 64KB page size
Posted by Sheng Zhao 2 weeks, 2 days ago

在 2025/9/16 15:34, Jason Wang 写道:
> On Mon, Sep 15, 2025 at 7:07 PM Sheng Zhao <sheng.zhao@bytedance.com> wrote:
>>
>>
>>
>> 在 2025/9/15 16:21, Jason Wang 写道:
>>> On Mon, Sep 15, 2025 at 3:34 PM <sheng.zhao@bytedance.com> wrote:
>>>>
>>>> From: Sheng Zhao <sheng.zhao@bytedance.com>
>>>>
>>>> The allocation granularity of bounce pages is PAGE_SIZE. This may cause
>>>> even small IO requests to occupy an entire bounce page exclusively.
>>>
>>> This sounds more like an issue of the IOVA allocating that use the
>>> wrong granular?
>>>
>>
>> Sorry, the previous email has a slight formatting issue.
>>
>> The granularity of the IOVA allocator is customized during the
>> initialization of the vduse domain, and this value is also modified in
>> this commit.
> 
> Ok, let's add this to the changelog.
> 
> Btw, do you have perf numbers to demonstrate the benefit?
> 
> Thanks
> 

For arm64 64KB base pages, compared with fixed 4KB bounce pages, using 
native pages is more likely to fill up the bounce buffer (default 64MB), 
resulting in I/O performance bottlenecks.

I used QEMU vduse-blk as the backend for testing write performance. 
Below are the fio test results:


	  | native       | fixed-4k
----------+--------------+-------------
numjobs=2 | bw=44.4MiB/s | bw=47.0MiB/s
iodepth=4 | iops=90.9k   | iops=96.1k
----------+--------------+-------------
numjobs=4 | bw=58.8MiB/s | bw=61.1MiB/s
iodepth=4 | iops=120.3k  | iops=125.4k
----------+--------------+-------------
numjobs=8 | bw=64.0MiB/s | bw=74.7MiB/s
iodepth=8 | iops=131.1k  | iops=153.1k
----------+--------------+-------------
numjobs=16| bw=69.8MiB/s | bw=92.7MiB/s
iodepth=8 | iops=143.0k  | iops=190.0k


Thanks

>>
>> Thanks
>>
>>>> The
>>>> kind of memory waste will be more significant on arm64 with 64KB pages.
>>>>
>>>> So, optimize it by using fixed 4KB bounce pages.
>>>>
>>>> Signed-off-by: Sheng Zhao <sheng.zhao@bytedance.com>
>>>
>>> Thanks
>>>
>>
>