From: Barry Song <v-songbaohua@oppo.com>
We can allocate high-order pages, but mapping them one by
one is inefficient. This patch changes the code to map
as large a chunk as possible. The code looks somewhat
complicated mainly because supporting mmap with a
non-zero offset is a bit tricky.
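For example (illustrative numbers, not from a real trace): with 4KiB
pages, a 2MiB sg entry covers 512 pages; an mmap() at byte offset 6MiB
arrives as vm_pgoff == 1536, so the first loop skips three such entries
and the remap loop then starts from page 0 of the fourth, covering each
remaining entry with a single remap_pfn_range() call.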
Using the micro-benchmark below, we see that mmap becomes
about 35X faster (~197ms vs. ~6.98s for a 512MB buffer):
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <linux/dma-heap.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>

#define SIZE (512UL * 1024 * 1024)
#define PAGE 4096
#define STRIDE (PAGE / sizeof(int))
#define PAGES (SIZE / PAGE)

int main(void)
{
	int heap = open("/dev/dma_heap/system", O_RDONLY);
	struct dma_heap_allocation_data d = {
		.len = SIZE,
		.fd_flags = O_RDWR | O_CLOEXEC,
	};
	struct timespec t0, t1;

	if (heap < 0 || ioctl(heap, DMA_HEAP_IOCTL_ALLOC, &d) < 0) {
		perror("dma-heap alloc");
		exit(1);
	}

	/* Time only mmap(): remap_pfn_range() installs the PTEs up
	 * front, so the mapping cost is paid here, not at fault time. */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	int *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, d.fd, 0);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	if (p == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}

	/* Touch one word per page, then verify the mapping. */
	for (int i = 0; i < PAGES; i++)
		p[i * STRIDE] = i;
	for (int i = 0; i < PAGES; i++)
		if (p[i * STRIDE] != i) {
			fprintf(stderr, "mismatch at page %d\n", i);
			exit(1);
		}

	long ns = (t1.tv_sec - t0.tv_sec) * 1000000000L +
		  (t1.tv_nsec - t0.tv_nsec);
	printf("mmap 512MB took %.3f us, verify OK\n", ns / 1000.0);
	return 0;
}
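To reproduce (assumed steps, not part of the original post; requires
root and a kernel with CONFIG_DMABUF_HEAPS_SYSTEM):

  gcc -O2 -o a.out bench.c   # bench.c is a hypothetical name for the program above
  ./a.out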
W/ patch:
~ # ./a.out
mmap 512MB took 200266.000 us, verify OK
~ # ./a.out
mmap 512MB took 198151.000 us, verify OK
~ # ./a.out
mmap 512MB took 197069.000 us, verify OK
~ # ./a.out
mmap 512MB took 196781.000 us, verify OK
~ # ./a.out
mmap 512MB took 198102.000 us, verify OK
~ # ./a.out
mmap 512MB took 195552.000 us, verify OK
W/o patch:
~ # ./a.out
mmap 512MB took 6987470.000 us, verify OK
~ # ./a.out
mmap 512MB took 6970739.000 us, verify OK
~ # ./a.out
mmap 512MB took 6984383.000 us, verify OK
~ # ./a.out
mmap 512MB took 6971311.000 us, verify OK
~ # ./a.out
mmap 512MB took 6991680.000 us, verify OK
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
drivers/dma-buf/heaps/system_heap.c | 33 +++++++++++++++++++++--------
1 file changed, 24 insertions(+), 9 deletions(-)
diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
index bbe7881f1360..4c782fe33fd4 100644
--- a/drivers/dma-buf/heaps/system_heap.c
+++ b/drivers/dma-buf/heaps/system_heap.c
@@ -186,20 +186,35 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
 	struct system_heap_buffer *buffer = dmabuf->priv;
 	struct sg_table *table = &buffer->sg_table;
 	unsigned long addr = vma->vm_start;
-	struct sg_page_iter piter;
-	int ret;
+	unsigned long pgoff = vma->vm_pgoff;
+	struct scatterlist *sg;
+	int i, ret;
+
+	for_each_sgtable_sg(table, sg, i) {
+		unsigned long n = sg->length >> PAGE_SHIFT;
 
-	for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
-		struct page *page = sg_page_iter_page(&piter);
+		if (pgoff < n)
+			break;
+		pgoff -= n;
+	}
+
+	for (; sg && addr < vma->vm_end; sg = sg_next(sg)) {
+		unsigned long n = (sg->length >> PAGE_SHIFT) - pgoff;
+		struct page *page = sg_page(sg) + pgoff;
+		unsigned long size = n << PAGE_SHIFT;
+
+		if (addr + size > vma->vm_end)
+			size = vma->vm_end - addr;
 
-		ret = remap_pfn_range(vma, addr, page_to_pfn(page), PAGE_SIZE,
-				      vma->vm_page_prot);
+		ret = remap_pfn_range(vma, addr, page_to_pfn(page),
+				      size, vma->vm_page_prot);
 		if (ret)
 			return ret;
-		addr += PAGE_SIZE;
-		if (addr >= vma->vm_end)
-			return 0;
+
+		addr += size;
+		pgoff = 0;
 	}
+
 	return 0;
 }
--
2.39.3 (Apple Git-146)
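As a quick sanity check of the non-zero-offset path the commit message
calls tricky, the benchmark could be extended along these lines (a
sketch, not part of the original post):

	/* Map only the second half of the buffer. mmap() takes a byte
	 * offset; the kernel passes it to the heap as vm_pgoff, in pages,
	 * which the first loop in the new code walks past. */
	int *q = mmap(NULL, SIZE / 2, PROT_READ | PROT_WRITE, MAP_SHARED,
		      d.fd, SIZE / 2);
	if (q == MAP_FAILED)
		perror("mmap at offset");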
On Sat, Aug 30, 2025 at 4:58 PM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Barry Song <v-songbaohua@oppo.com>
>
> We can allocate high-order pages, but mapping them one by
> one is inefficient. This patch changes the code to map
> as large a chunk as possible. The code looks somewhat
> complicated mainly because supporting mmap with a
> non-zero offset is a bit tricky.
>
> Using the micro-benchmark below, we see that mmap becomes
> about 35X faster:
...

It's been awhile since I've done mm things, so take it with a pinch of
salt, but this seems reasonable to me.

Though, one thought below...

> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> index bbe7881f1360..4c782fe33fd4 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -186,20 +186,35 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
>  	struct system_heap_buffer *buffer = dmabuf->priv;
>  	struct sg_table *table = &buffer->sg_table;
>  	unsigned long addr = vma->vm_start;
> -	struct sg_page_iter piter;
> -	int ret;
> +	unsigned long pgoff = vma->vm_pgoff;
> +	struct scatterlist *sg;
> +	int i, ret;
> +
> +	for_each_sgtable_sg(table, sg, i) {
> +		unsigned long n = sg->length >> PAGE_SHIFT;
>
> -	for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
> -		struct page *page = sg_page_iter_page(&piter);
> +		if (pgoff < n)
> +			break;
> +		pgoff -= n;
> +	}
> +
> +	for (; sg && addr < vma->vm_end; sg = sg_next(sg)) {
> +		unsigned long n = (sg->length >> PAGE_SHIFT) - pgoff;
> +		struct page *page = sg_page(sg) + pgoff;
> +		unsigned long size = n << PAGE_SHIFT;
> +
> +		if (addr + size > vma->vm_end)
> +			size = vma->vm_end - addr;
>
> -		ret = remap_pfn_range(vma, addr, page_to_pfn(page), PAGE_SIZE,
> -				      vma->vm_page_prot);
> +		ret = remap_pfn_range(vma, addr, page_to_pfn(page),
> +				      size, vma->vm_page_prot);

It feels like this sort of mapping loop for higher order pages
wouldn't be a unique pattern to just this code. Would this be better
worked into a helper so it would be more generally usable?

Otherwise,
Acked-by: John Stultz <jstultz@google.com>

thanks
-john
On Thu, Sep 4, 2025 at 8:07 AM John Stultz <jstultz@google.com> wrote:
>
> On Sat, Aug 30, 2025 at 4:58 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > We can allocate high-order pages, but mapping them one by
> > one is inefficient. This patch changes the code to map
> > as large a chunk as possible. The code looks somewhat
> > complicated mainly because supporting mmap with a
> > non-zero offset is a bit tricky.
> >
> > Using the micro-benchmark below, we see that mmap becomes
> > about 35X faster:
> ...
>
> It's been awhile since I've done mm things, so take it with a pinch of
> salt, but this seems reasonable to me.
>
> Though, one thought below...
>
> > diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
> > index bbe7881f1360..4c782fe33fd4 100644
> > --- a/drivers/dma-buf/heaps/system_heap.c
> > +++ b/drivers/dma-buf/heaps/system_heap.c
> > @@ -186,20 +186,35 @@ static int system_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
> >  	struct system_heap_buffer *buffer = dmabuf->priv;
> >  	struct sg_table *table = &buffer->sg_table;
> >  	unsigned long addr = vma->vm_start;
> > -	struct sg_page_iter piter;
> > -	int ret;
> > +	unsigned long pgoff = vma->vm_pgoff;
> > +	struct scatterlist *sg;
> > +	int i, ret;
> > +
> > +	for_each_sgtable_sg(table, sg, i) {
> > +		unsigned long n = sg->length >> PAGE_SHIFT;
> >
> > -	for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
> > -		struct page *page = sg_page_iter_page(&piter);
> > +		if (pgoff < n)
> > +			break;
> > +		pgoff -= n;
> > +	}
> > +
> > +	for (; sg && addr < vma->vm_end; sg = sg_next(sg)) {
> > +		unsigned long n = (sg->length >> PAGE_SHIFT) - pgoff;
> > +		struct page *page = sg_page(sg) + pgoff;
> > +		unsigned long size = n << PAGE_SHIFT;
> > +
> > +		if (addr + size > vma->vm_end)
> > +			size = vma->vm_end - addr;
> >
> > -		ret = remap_pfn_range(vma, addr, page_to_pfn(page), PAGE_SIZE,
> > -				      vma->vm_page_prot);
> > +		ret = remap_pfn_range(vma, addr, page_to_pfn(page),
> > +				      size, vma->vm_page_prot);
>
> It feels like this sort of mapping loop for higher order pages
> wouldn't be a unique pattern to just this code. Would this be better
> worked into a helper so it would be more generally usable?

Another case is vmap, but that would require extending vmap_sg and
related code, with little chance to share code with mmap. It also
seems hard to find other drivers that use mmap with sg. If it turns
out that others are making similar changes, we could ask to extract
our current modifications into a common helper.

>
> Otherwise,
> Acked-by: John Stultz <jstultz@google.com>

Thanks!

Best regards,
Barry