Use per-vma locking for concurrent page installations; this minimizes
contention with unrelated vmas, improving performance. The mmap_lock is
still acquired when needed, e.g. before get_user_pages_remote().
Many thanks to Barry Song who posted a similar approach [1].
Link: https://lore.kernel.org/all/20240902225009.34576-1-21cnbao@gmail.com/ [1]
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Carlos Llamas <cmllamas@google.com>
---
drivers/android/binder_alloc.c | 85 +++++++++++++++++++++++-----------
1 file changed, 57 insertions(+), 28 deletions(-)
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 814435a2601a..debfa541e01b 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -233,6 +233,56 @@ static inline bool binder_alloc_is_mapped(struct binder_alloc *alloc)
return smp_load_acquire(&alloc->mapped);
}
+static struct page *binder_page_lookup(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct page *page;
+ long ret;
+
+ if (!mmget_not_zero(mm))
+ return NULL;
+
+ mmap_read_lock(mm);
+ ret = get_user_pages_remote(mm, addr, 1, 0, &page, NULL);
+ mmap_read_unlock(mm);
+ mmput_async(mm);
+
+ return ret > 0 ? page : NULL;
+}
+
+static int binder_page_insert(struct binder_alloc *alloc,
+ unsigned long addr,
+ struct page *page)
+{
+ struct mm_struct *mm = alloc->mm;
+ struct vm_area_struct *vma;
+ int ret = -ESRCH;
+
+ if (!mmget_not_zero(mm))
+ return -ESRCH;
+
+ /* attempt per-vma lock first */
+ vma = lock_vma_under_rcu(mm, addr);
+ if (!vma)
+ goto lock_mmap;
+
+ if (binder_alloc_is_mapped(alloc))
+ ret = vm_insert_page(vma, addr, page);
+ vma_end_read(vma);
+ goto done;
+
+lock_mmap:
+ /* fall back to mmap_lock */
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
+ if (vma && binder_alloc_is_mapped(alloc))
+ ret = vm_insert_page(vma, addr, page);
+ mmap_read_unlock(mm);
+done:
+ mmput_async(mm);
+ return ret;
+}
+
static struct page *binder_page_alloc(struct binder_alloc *alloc,
unsigned long index,
unsigned long addr)
@@ -254,31 +304,14 @@ static int binder_install_single_page(struct binder_alloc *alloc,
unsigned long index,
unsigned long addr)
{
- struct vm_area_struct *vma;
struct page *page;
- long npages;
int ret;
- if (!mmget_not_zero(alloc->mm))
- return -ESRCH;
-
page = binder_page_alloc(alloc, index, addr);
- if (!page) {
- ret = -ENOMEM;
- goto out;
- }
-
- mmap_read_lock(alloc->mm);
- vma = vma_lookup(alloc->mm, addr);
- if (!vma || !binder_alloc_is_mapped(alloc)) {
- mmap_read_unlock(alloc->mm);
- __free_page(page);
- pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
- ret = -ESRCH;
- goto out;
- }
+ if (!page)
+ return -ENOMEM;
- ret = vm_insert_page(vma, addr, page);
+ ret = binder_page_insert(alloc, addr, page);
switch (ret) {
case -EBUSY:
/*
@@ -288,12 +321,11 @@ static int binder_install_single_page(struct binder_alloc *alloc,
*/
ret = 0;
__free_page(page);
- npages = get_user_pages_remote(alloc->mm, addr, 1, 0, &page, NULL);
- if (npages <= 0) {
+ page = binder_page_lookup(alloc->mm, addr);
+ if (!page) {
pr_err("%d: failed to find page at offset %lx\n",
alloc->pid, addr - alloc->vm_start);
- ret = -ESRCH;
- break;
+ return -ESRCH;
}
fallthrough;
case 0:
@@ -304,12 +336,9 @@ static int binder_install_single_page(struct binder_alloc *alloc,
__free_page(page);
pr_err("%d: %s failed to insert page at offset %lx with %d\n",
alloc->pid, __func__, addr - alloc->vm_start, ret);
- ret = -ENOMEM;
break;
}
- mmap_read_unlock(alloc->mm);
-out:
- mmput_async(alloc->mm);
+
return ret;
}
--
2.47.0.199.ga7371fff76-goog
On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
>
> [...]
>
> +static struct page *binder_page_lookup(struct mm_struct *mm,

Maybe pass "struct binder_alloc" in both binder_page_lookup() and
binder_page_insert()?

I like how previous code stabilized mm with mmget_not_zero() once vs
now binder_page_lookup() and binder_page_insert() have to mmget/mmput
individually. Not a big deal but looked cleaner.

> [...]
>
> +        /* attempt per-vma lock first */
> +        vma = lock_vma_under_rcu(mm, addr);
> +        if (!vma)
> +                goto lock_mmap;
> +
> +        if (binder_alloc_is_mapped(alloc))

I don't think you need this check here. lock_vma_under_rcu() ensures
that the VMA was not detached from the tree after locking the VMA, so
if you got a VMA it's in the tree and it can't be removed (because
it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
called after VMA gets detached from the tree and that won't happen
while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
binder_alloc_is_mapped() has to always return true. A WARN_ON() check
here to ensure that might be a better option.

> +                ret = vm_insert_page(vma, addr, page);
> +        vma_end_read(vma);
> +        goto done;

I think the code would be more readable without these jumps:

        vma = lock_vma_under_rcu(mm, addr);
        if (vma) {
                if (!WARN_ON(!binder_alloc_is_mapped(alloc)))
                        ret = vm_insert_page(vma, addr, page);
                vma_end_read(vma);
        } else {
                /* fall back to mmap_lock */
                mmap_read_lock(mm);
                vma = vma_lookup(mm, addr);
                if (vma && binder_alloc_is_mapped(alloc))
                        ret = vm_insert_page(vma, addr, page);
                mmap_read_unlock(mm);
        }
        mmput_async(mm);
        return ret;

> +
> +lock_mmap:
> +        /* fall back to mmap_lock */
> +        mmap_read_lock(mm);
> +        vma = vma_lookup(mm, addr);
> +        if (vma && binder_alloc_is_mapped(alloc))
> +                ret = vm_insert_page(vma, addr, page);
> +        mmap_read_unlock(mm);
> +done:
> +        mmput_async(mm);
> +        return ret;
> +}
>
> [...]
On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
> >
> > +static struct page *binder_page_lookup(struct mm_struct *mm,
>
> Maybe pass "struct binder_alloc" in both binder_page_lookup() and
> binder_page_insert()?

I'm not sure this is worth it though. Yeah, it would match with
binder_page_insert() nicely, but also there is no usage for alloc in
binder_page_lookup(). Its only purpose would be to access the mm:

        static struct page *binder_page_lookup(struct binder_alloc *alloc,
                                               unsigned long addr)
        {
                struct mm_struct *mm = alloc->mm;

If you think this is cleaner I really don't mind adding it for v3.

> I like how previous code stabilized mm with mmget_not_zero() once vs
> now binder_page_lookup() and binder_page_insert() have to mmget/mmput
> individually. Not a big deal but looked cleaner.

Sure, I can factor this out (the way it was in v1).

> > +        /* attempt per-vma lock first */
> > +        vma = lock_vma_under_rcu(mm, addr);
> > +        if (!vma)
> > +                goto lock_mmap;
> > +
> > +        if (binder_alloc_is_mapped(alloc))
>
> I don't think you need this check here. lock_vma_under_rcu() ensures
> that the VMA was not detached from the tree after locking the VMA, so
> if you got a VMA it's in the tree and it can't be removed (because
> it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
> called after VMA gets detached from the tree and that won't happen
> while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
> binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> here to ensure that might be a better option.

Yes we are guaranteed to have _a_ non-isolated vma. However, the check
validates that it's the _expected_ vma. IIUC, our vma could have been
unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
gotten the same address space assigned?

The binder_alloc_is_mapped() checks if the vma belongs to binder. This
reminds me, I should also check this for get_user_pages_remote().

> I think the code would be more readable without these jumps:
>
>         vma = lock_vma_under_rcu(mm, addr);
>         if (vma) {
>                 if (!WARN_ON(!binder_alloc_is_mapped(alloc)))
>                         ret = vm_insert_page(vma, addr, page);
>                 vma_end_read(vma);
>         } else {
>                 /* fall back to mmap_lock */
>                 mmap_read_lock(mm);
>                 vma = vma_lookup(mm, addr);
>                 if (vma && binder_alloc_is_mapped(alloc))
>                         ret = vm_insert_page(vma, addr, page);
>                 mmap_read_unlock(mm);
>         }
>         mmput_async(mm);
>         return ret;

Ok. I'm thinking with mmput_async() being factored out, I'll add an
early return. e.g.:

        vma = lock_vma_under_rcu(mm, addr);
        if (vma) {
                if (binder_alloc_is_mapped(alloc))
                        ret = vm_insert_page(vma, addr, page);
                vma_end_read(vma);
                return ret;
        }

        /* fall back to mmap_lock */
        mmap_read_lock(mm);
        [...]

Thanks,
Carlos Llamas
On Thu, Nov 7, 2024 at 9:55 AM Carlos Llamas <cmllamas@google.com> wrote:
>
> On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> >
> > [...] So, if lock_vma_under_rcu() returns a VMA,
> > binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> > here to ensure that might be a better option.
>
> Yes we are guaranteed to have _a_ non-isolated vma. However, the check
> validates that it's the _expected_ vma. IIUC, our vma could have been
> unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
> gotten the same address space assigned?

No, this should never happen. lock_vma_under_rcu() specifically checks
the address range *after* it locks the VMA:
https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026
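For reference, the validation Suren points to looks roughly like the
following (paraphrased from lock_vma_under_rcu() in mm/memory.c; this is
not the verbatim source and details vary between kernel versions). The
key point is that the address is re-checked against the VMA bounds only
after the per-VMA read lock is held:

        if (!vma_start_read(vma))       /* take the per-VMA read lock */
                goto inval;

        /* vm_start/vm_end may have changed before the VMA was locked */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                goto inval_end_read;

        rcu_read_unlock();
        return vma;

So a caller that gets a VMA back knows it is locked and still spans the
requested address; what it does not know is whether that VMA is still
the binder mapping, which is the point Carlos raises next.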
On Thu, Nov 07, 2024 at 10:04:23AM -0800, Suren Baghdasaryan wrote:
> On Thu, Nov 7, 2024 at 9:55 AM Carlos Llamas <cmllamas@google.com> wrote:
> >
> > Yes we are guaranteed to have _a_ non-isolated vma. However, the check
> > validates that it's the _expected_ vma. IIUC, our vma could have been
> > unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
> > gotten the same address space assigned?
>
> No, this should never happen. lock_vma_under_rcu() specifically checks
> the address range *after* it locks the VMA:
> https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026

The scenario I'm describing is the following:

        Proc A                          Proc B
                                        mmap(addr, binder_fd)
        binder_page_insert()
        mmget_not_zero()
                                        munmap(addr)
                                          alloc->mapped = false;
                                        [...]
                                        // mmap other vma but same addr
                                        mmap(addr, other_fd)

        vma = lock_vma_under_rcu()

Isn't there a chance that the vma Proc A receives is an unrelated vma
that was placed in the same address range?
On Thu, Nov 7, 2024 at 10:19 AM Carlos Llamas <cmllamas@google.com> wrote:
>
> On Thu, Nov 07, 2024 at 10:04:23AM -0800, Suren Baghdasaryan wrote:
> >
> > No, this should never happen. lock_vma_under_rcu() specifically checks
> > the address range *after* it locks the VMA:
> > https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026
>
> The scenario I'm describing is the following:
>
>         Proc A                          Proc B
>                                         mmap(addr, binder_fd)
>         binder_page_insert()
>         mmget_not_zero()
>                                         munmap(addr)
>                                           alloc->mapped = false;
>                                         [...]
>                                         // mmap other vma but same addr
>                                         mmap(addr, other_fd)
>
>         vma = lock_vma_under_rcu()
>
> Isn't there a chance that the vma Proc A receives is an unrelated vma
> that was placed in the same address range?

Ah, I see now. The VMA is a valid one and at the address we specified
but it does not belong to the binder. Yes, then you do need this
check.
On Thu, Nov 7, 2024 at 10:27 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Nov 7, 2024 at 10:19 AM Carlos Llamas <cmllamas@google.com> wrote:
> >
> > Isn't there a chance that the vma Proc A receives is an unrelated vma
> > that was placed in the same address range?
>
> Ah, I see now. The VMA is a valid one and at the address we specified
> but it does not belong to the binder. Yes, then you do need this
> check.

Is this scenario possible?:

        Proc A                          Proc B
                                        mmap(addr, binder_fd)
        binder_page_insert()
        mmget_not_zero()
                                        munmap(addr)
                                          alloc->mapped = false;
                                        [...]
                                        // mmap other vma but same addr
                                        mmap(addr, other_fd)
                                        mmap(other_addr, binder_fd)

        vma = lock_vma_under_rcu(addr)

If so, I think your binder_alloc_is_mapped() check will return true
but the binder area is mapped at a different other_addr. To avoid that
I think you can check that "addr" still belongs to [alloc->vm_start,
alloc->buffer_size] after you obtained and locked the VMA.
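A minimal sketch of the extra check suggested here, assuming it would
sit next to the existing binder_alloc_is_mapped() test in
binder_page_insert(); the helper name is hypothetical and not part of
the posted patch:

        /*
         * Hypothetical helper: after the VMA is locked, confirm both that
         * the binder mapping still exists and that this address still
         * falls inside it, so a binder area re-mapped at a different
         * address is not confused with the original one.
         */
        static bool binder_addr_in_mapping(struct binder_alloc *alloc,
                                           unsigned long addr)
        {
                return binder_alloc_is_mapped(alloc) &&
                       addr >= alloc->vm_start &&
                       addr < alloc->vm_start + alloc->buffer_size;
        }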
On Thu, Nov 07, 2024 at 10:52:30AM -0800, Suren Baghdasaryan wrote:
> On Thu, Nov 7, 2024 at 10:27 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > Ah, I see now. The VMA is a valid one and at the address we specified
> > but it does not belong to the binder. Yes, then you do need this
> > check.
>
> Is this scenario possible?:
>
>         Proc A                          Proc B
>                                         mmap(addr, binder_fd)
>         binder_page_insert()
>         mmget_not_zero()
>                                         munmap(addr)
>                                           alloc->mapped = false;
>                                         [...]
>                                         // mmap other vma but same addr
>                                         mmap(addr, other_fd)
>                                         mmap(other_addr, binder_fd)
>
>         vma = lock_vma_under_rcu(addr)
>
> If so, I think your binder_alloc_is_mapped() check will return true
> but the binder area is mapped at a different other_addr. To avoid that
> I think you can check that "addr" still belongs to [alloc->vm_start,
> alloc->buffer_size] after you obtained and locked the VMA.

Wait, I thought that vm_ops->close() was called with the mmap_lock in
exclusive mode. This is where binder clears the alloc->mapped. If this
is not the case (was it ever?), then I'd definitely need to fix this.

I'll have a closer look.

Thanks,
Carlos Llamas
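For context, the pairing being discussed looks roughly like the sketch
below. The function name and exact placement are illustrative rather
than the actual binder_alloc.c code, but the acquire side is the
binder_alloc_is_mapped() helper shown in the patch hunk above:

        /* Illustrative unmap side, run from the binder vm_ops->close() path */
        static void binder_vma_close_sketch(struct binder_alloc *alloc)
        {
                /* pairs with smp_load_acquire() in binder_alloc_is_mapped() */
                smp_store_release(&alloc->mapped, false);
        }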
On Thu, Nov 07, 2024 at 07:33:24PM +0000, Carlos Llamas wrote:
> On Thu, Nov 07, 2024 at 10:52:30AM -0800, Suren Baghdasaryan wrote:
> >
> > If so, I think your binder_alloc_is_mapped() check will return true
> > but the binder area is mapped at a different other_addr. To avoid that
> > I think you can check that "addr" still belongs to [alloc->vm_start,
> > alloc->buffer_size] after you obtained and locked the VMA.
>
> Wait, I thought that vm_ops->close() was called with the mmap_lock in
> exclusive mode. This is where binder clears the alloc->mapped. If this
> is not the case (was it ever?), then I'd definitely need to fix this.

On second thought, we don't need the mmap_sem in exclusive mode. When
we acquire the vma through lock_vma_under_rcu() we guarantee it's not
isolated, and if our alloc->mapped is set, that means it has not been
unmapped either. So we are good to proceed.

--
Carlos Llamas
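Putting the thread together, one possible shape for a v3 of the helper
is sketched below. This is only a sketch: it assumes the
mmget_not_zero()/mmput_async() pair is factored out into the caller as
discussed above, keeps the binder_alloc_is_mapped() check, and adds the
address-range test Suren suggested for the case where binder is
re-mapped at a different address:

        static int binder_page_insert(struct binder_alloc *alloc,
                                      unsigned long addr,
                                      struct page *page)
        {
                struct mm_struct *mm = alloc->mm;
                struct vm_area_struct *vma;
                int ret = -ESRCH;

                /* attempt per-vma lock first */
                vma = lock_vma_under_rcu(mm, addr);
                if (vma) {
                        /* reject unrelated vmas that reuse the same addr */
                        if (binder_alloc_is_mapped(alloc) &&
                            addr >= alloc->vm_start &&
                            addr < alloc->vm_start + alloc->buffer_size)
                                ret = vm_insert_page(vma, addr, page);
                        vma_end_read(vma);
                        return ret;
                }

                /* fall back to mmap_lock */
                mmap_read_lock(mm);
                vma = vma_lookup(mm, addr);
                if (vma && binder_alloc_is_mapped(alloc))
                        ret = vm_insert_page(vma, addr, page);
                mmap_read_unlock(mm);

                return ret;
        }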