Allow multiple callers to install pages simultaneously by taking the
mmap_sem in read mode instead of write mode. Races to the same PTE are handled
using get_user_pages_remote() to retrieve the already installed page.
This method significantly reduces contention in the mmap semaphore.
To ensure safety, vma_lookup() is used (instead of alloc->vma) to avoid
operating on an isolated VMA. In addition, zap_page_range_single() is
called under the alloc->mutex to avoid racing with the shrinker.
Many thanks to Barry Song who posted a similar approach [1].
Link: https://lore.kernel.org/all/20240902225009.34576-1-21cnbao@gmail.com/ [1]
Cc: David Hildenbrand <david@redhat.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Carlos Llamas <cmllamas@google.com>
---
drivers/android/binder_alloc.c | 64 +++++++++++++++++++++-------------
1 file changed, 40 insertions(+), 24 deletions(-)
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 7241bf4a3ff2..acdc05552603 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -221,26 +221,14 @@ static int binder_install_single_page(struct binder_alloc *alloc,
struct binder_lru_page *lru_page,
unsigned long addr)
{
+ struct vm_area_struct *vma;
struct page *page;
- int ret = 0;
+ long npages;
+ int ret;
if (!mmget_not_zero(alloc->mm))
return -ESRCH;
- /*
- * Protected with mmap_sem in write mode as multiple tasks
- * might race to install the same page.
- */
- mmap_write_lock(alloc->mm);
- if (binder_get_installed_page(lru_page))
- goto out;
-
- if (!alloc->vma) {
- pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
- ret = -ESRCH;
- goto out;
- }
-
page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
if (!page) {
pr_err("%d: failed to allocate page\n", alloc->pid);
@@ -248,19 +236,47 @@ static int binder_install_single_page(struct binder_alloc *alloc,
goto out;
}
- ret = vm_insert_page(alloc->vma, addr, page);
- if (ret) {
- pr_err("%d: %s failed to insert page at offset %lx with %d\n",
- alloc->pid, __func__, addr - alloc->buffer, ret);
+ mmap_read_lock(alloc->mm);
+ vma = vma_lookup(alloc->mm, addr);
+ if (!vma || vma != alloc->vma) {
+ mmap_read_unlock(alloc->mm);
__free_page(page);
- ret = -ENOMEM;
+ pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
+ ret = -ESRCH;
goto out;
}
- /* Mark page installation complete and safe to use */
- binder_set_installed_page(lru_page, page);
+ ret = vm_insert_page(vma, addr, page);
+ switch (ret) {
+ case -EBUSY:
+ /*
+ * EBUSY is ok. Someone installed the pte first but the
+ * lru_page->page_ptr has not been updated yet. Discard
+ * our page and look up the one already installed.
+ */
+ ret = 0;
+ __free_page(page);
+ npages = get_user_pages_remote(alloc->mm, addr, 1, 0, &page, NULL);
+ if (npages <= 0) {
+ pr_err("%d: failed to find page at offset %lx\n",
+ alloc->pid, addr - alloc->buffer);
+ ret = -ESRCH;
+ break;
+ }
+ fallthrough;
+ case 0:
+ /* Mark page installation complete and safe to use */
+ binder_set_installed_page(lru_page, page);
+ break;
+ default:
+ __free_page(page);
+ pr_err("%d: %s failed to insert page at offset %lx with %d\n",
+ alloc->pid, __func__, addr - alloc->buffer, ret);
+ ret = -ENOMEM;
+ break;
+ }
+ mmap_read_unlock(alloc->mm);
out:
- mmap_write_unlock(alloc->mm);
mmput_async(alloc->mm);
return ret;
}
@@ -1091,7 +1107,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
trace_binder_unmap_kernel_end(alloc, index);
list_lru_isolate(lru, item);
- mutex_unlock(&alloc->mutex);
spin_unlock(lock);
if (vma) {
@@ -1102,6 +1117,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
trace_binder_unmap_user_end(alloc, index);
}
+ mutex_unlock(&alloc->mutex);
mmap_read_unlock(mm);
mmput_async(mm);
__free_page(page_to_free);
--
2.47.0.199.ga7371fff76-goog
On Wed, Nov 6, 2024 at 8:02 PM Carlos Llamas <cmllamas@google.com> wrote:
>
> Allow multiple callers to install pages simultaneously by downgrading
> the mmap_sem to non-exclusive mode. Races to the same PTE are handled
> using get_user_pages_remote() to retrieve the already installed page.
> This method significantly reduces contention in the mmap semaphore.
>
> To ensure safety, vma_lookup() is used (instead of alloc->vma) to avoid
> operating on an isolated VMA. In addition, zap_page_range_single() is
> called under the alloc->mutex to avoid racing with the shrinker.
>
> Many thanks to Barry Song who posted a similar approach [1].
>
> Link: https://lore.kernel.org/all/20240902225009.34576-1-21cnbao@gmail.com/ [1]
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Barry Song <v-songbaohua@oppo.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
> Signed-off-by: Carlos Llamas <cmllamas@google.com>
> ---
> drivers/android/binder_alloc.c | 64 +++++++++++++++++++++-------------
> 1 file changed, 40 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
> index 7241bf4a3ff2..acdc05552603 100644
> --- a/drivers/android/binder_alloc.c
> +++ b/drivers/android/binder_alloc.c
> @@ -221,26 +221,14 @@ static int binder_install_single_page(struct binder_alloc *alloc,
> struct binder_lru_page *lru_page,
> unsigned long addr)
> {
> + struct vm_area_struct *vma;
> struct page *page;
> - int ret = 0;
> + long npages;
> + int ret;
>
> if (!mmget_not_zero(alloc->mm))
> return -ESRCH;
>
> - /*
> - * Protected with mmap_sem in write mode as multiple tasks
> - * might race to install the same page.
> - */
> - mmap_write_lock(alloc->mm);
> - if (binder_get_installed_page(lru_page))
> - goto out;
> -
> - if (!alloc->vma) {
> - pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
> - ret = -ESRCH;
> - goto out;
> - }
> -
> page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
> if (!page) {
> pr_err("%d: failed to allocate page\n", alloc->pid);
> @@ -248,19 +236,47 @@ static int binder_install_single_page(struct binder_alloc *alloc,
> goto out;
> }
>
> - ret = vm_insert_page(alloc->vma, addr, page);
> - if (ret) {
> - pr_err("%d: %s failed to insert page at offset %lx with %d\n",
> - alloc->pid, __func__, addr - alloc->buffer, ret);
> + mmap_read_lock(alloc->mm);
> + vma = vma_lookup(alloc->mm, addr);
> + if (!vma || vma != alloc->vma) {
> + mmap_read_unlock(alloc->mm);

nit: instead of unlocking here you could have another label before
mmap_read_unlock() at the end and jump to it.

> __free_page(page);
> - ret = -ENOMEM;
> + pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
> + ret = -ESRCH;
> goto out;
> }
>
> - /* Mark page installation complete and safe to use */
> - binder_set_installed_page(lru_page, page);
> + ret = vm_insert_page(vma, addr, page);
> + switch (ret) {
> + case -EBUSY:
> + /*
> + * EBUSY is ok. Someone installed the pte first but the
> + * lru_page->page_ptr has not been updated yet. Discard
> + * our page and look up the one already installed.
> + */
> + ret = 0;
> + __free_page(page);
> + npages = get_user_pages_remote(alloc->mm, addr, 1, 0, &page, NULL);
> + if (npages <= 0) {
> + pr_err("%d: failed to find page at offset %lx\n",
> + alloc->pid, addr - alloc->buffer);
> + ret = -ESRCH;
> + break;
> + }
> + fallthrough;
> + case 0:
> + /* Mark page installation complete and safe to use */
> + binder_set_installed_page(lru_page, page);
> + break;
> + default:
> + __free_page(page);
> + pr_err("%d: %s failed to insert page at offset %lx with %d\n",
> + alloc->pid, __func__, addr - alloc->buffer, ret);
> + ret = -ENOMEM;

vm_insert_page() can return EINVAL (see
validate_page_before_insert()). Instead of converting other codes into
ENOMEM why not return "ret" as is?

> + break;
> + }
> + mmap_read_unlock(alloc->mm);
> out:
> - mmap_write_unlock(alloc->mm);
> mmput_async(alloc->mm);
> return ret;
> }
> @@ -1091,7 +1107,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
> trace_binder_unmap_kernel_end(alloc, index);
>
> list_lru_isolate(lru, item);
> - mutex_unlock(&alloc->mutex);
> spin_unlock(lock);
>
> if (vma) {
> @@ -1102,6 +1117,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
> trace_binder_unmap_user_end(alloc, index);
> }
>
> + mutex_unlock(&alloc->mutex);
> mmap_read_unlock(mm);
> mmput_async(mm);
> __free_page(page_to_free);
> --
> 2.47.0.199.ga7371fff76-goog
>
On Thu, Nov 07, 2024 at 07:10:28AM -0800, Suren Baghdasaryan wrote:
> On Wed, Nov 6, 2024 at 8:02 PM Carlos Llamas <cmllamas@google.com> wrote:
> >
> > Allow multiple callers to install pages simultaneously by downgrading
> > the mmap_sem to non-exclusive mode. Races to the same PTE are handled
> > using get_user_pages_remote() to retrieve the already installed page.
> > This method significantly reduces contention in the mmap semaphore.
> >
> > To ensure safety, vma_lookup() is used (instead of alloc->vma) to avoid
> > operating on an isolated VMA. In addition, zap_page_range_single() is
> > called under the alloc->mutex to avoid racing with the shrinker.
> >
> > Many thanks to Barry Song who posted a similar approach [1].
> >
> > Link: https://lore.kernel.org/all/20240902225009.34576-1-21cnbao@gmail.com/ [1]
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Barry Song <v-songbaohua@oppo.com>
> > Cc: Suren Baghdasaryan <surenb@google.com>
> > Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
> > Signed-off-by: Carlos Llamas <cmllamas@google.com>
> > ---
> > drivers/android/binder_alloc.c | 64 +++++++++++++++++++++-------------
> > 1 file changed, 40 insertions(+), 24 deletions(-)
> >
> > diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
> > index 7241bf4a3ff2..acdc05552603 100644
> > --- a/drivers/android/binder_alloc.c
> > +++ b/drivers/android/binder_alloc.c
> > @@ -221,26 +221,14 @@ static int binder_install_single_page(struct binder_alloc *alloc,
> > struct binder_lru_page *lru_page,
> > unsigned long addr)
> > {
> > + struct vm_area_struct *vma;
> > struct page *page;
> > - int ret = 0;
> > + long npages;
> > + int ret;
> >
> > if (!mmget_not_zero(alloc->mm))
> > return -ESRCH;
> >
> > - /*
> > - * Protected with mmap_sem in write mode as multiple tasks
> > - * might race to install the same page.
> > - */
> > - mmap_write_lock(alloc->mm);
> > - if (binder_get_installed_page(lru_page))
> > - goto out;
> > -
> > - if (!alloc->vma) {
> > - pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
> > - ret = -ESRCH;
> > - goto out;
> > - }
> > -
> > page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
> > if (!page) {
> > pr_err("%d: failed to allocate page\n", alloc->pid);
> > @@ -248,19 +236,47 @@ static int binder_install_single_page(struct binder_alloc *alloc,
> > goto out;
> > }
> >
> > - ret = vm_insert_page(alloc->vma, addr, page);
> > - if (ret) {
> > - pr_err("%d: %s failed to insert page at offset %lx with %d\n",
> > - alloc->pid, __func__, addr - alloc->buffer, ret);
> > + mmap_read_lock(alloc->mm);
> > + vma = vma_lookup(alloc->mm, addr);
> > + if (!vma || vma != alloc->vma) {
> > + mmap_read_unlock(alloc->mm);
>
> nit: instead of unlocking here you could have another label before
> mmap_read_unlock() at the end and jump to it.

Sounds good, I'll do this.

>
> > __free_page(page);
> > - ret = -ENOMEM;
> > + pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
> > + ret = -ESRCH;
> > goto out;
> > }
> >
> > - /* Mark page installation complete and safe to use */
> > - binder_set_installed_page(lru_page, page);
> > + ret = vm_insert_page(vma, addr, page);
> > + switch (ret) {
> > + case -EBUSY:
> > + /*
> > + * EBUSY is ok. Someone installed the pte first but the
> > + * lru_page->page_ptr has not been updated yet. Discard
> > + * our page and look up the one already installed.
> > + */
> > + ret = 0;
> > + __free_page(page);
> > + npages = get_user_pages_remote(alloc->mm, addr, 1, 0, &page, NULL);
> > + if (npages <= 0) {
> > + pr_err("%d: failed to find page at offset %lx\n",
> > + alloc->pid, addr - alloc->buffer);
> > + ret = -ESRCH;
> > + break;
> > + }
> > + fallthrough;
> > + case 0:
> > + /* Mark page installation complete and safe to use */
> > + binder_set_installed_page(lru_page, page);
> > + break;
> > + default:
> > + __free_page(page);
> > + pr_err("%d: %s failed to insert page at offset %lx with %d\n",
> > + alloc->pid, __func__, addr - alloc->buffer, ret);
> > + ret = -ENOMEM;
>
> vm_insert_page() can return EINVAL (see
> validate_page_before_insert()). Instead of converting other codes into
> ENOMEM why not return "ret" as is?

This is purely historical, binder has always propagated -ENOMEM to
userspace on errors from vm_insert_page() and I'm not sure why. FWIW,
I've dropped the behavior in the last patch and now I just forward
whatever vm_insert_page() returns. I had a look at libbinder code and it
doesn't really make a difference.

Perhaps, I should be explicit about this move and do it in a separate
commit.

Thanks,
--
Carlos Llamas
© 2016 - 2024 Red Hat, Inc.