Use per-vma locking for concurrent page installations; this minimizes
contention with unrelated vmas, improving performance. The mmap_lock is
still acquired when needed, e.g. before calling get_user_pages_remote().
Many thanks to Barry Song who posted a similar approach [1].
Link: https://lore.kernel.org/all/20240902225009.34576-1-21cnbao@gmail.com/ [1]
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Carlos Llamas <cmllamas@google.com>
---
drivers/android/binder_alloc.c | 85 +++++++++++++++++++++++-----------
1 file changed, 57 insertions(+), 28 deletions(-)
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 814435a2601a..debfa541e01b 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -233,6 +233,56 @@ static inline bool binder_alloc_is_mapped(struct binder_alloc *alloc)
 	return smp_load_acquire(&alloc->mapped);
 }
 
+static struct page *binder_page_lookup(struct mm_struct *mm,
+                                       unsigned long addr)
+{
+        struct page *page;
+        long ret;
+
+        if (!mmget_not_zero(mm))
+                return NULL;
+
+        mmap_read_lock(mm);
+        ret = get_user_pages_remote(mm, addr, 1, 0, &page, NULL);
+        mmap_read_unlock(mm);
+        mmput_async(mm);
+
+        return ret > 0 ? page : NULL;
+}
+
+static int binder_page_insert(struct binder_alloc *alloc,
+                              unsigned long addr,
+                              struct page *page)
+{
+        struct mm_struct *mm = alloc->mm;
+        struct vm_area_struct *vma;
+        int ret = -ESRCH;
+
+        if (!mmget_not_zero(mm))
+                return -ESRCH;
+
+        /* attempt per-vma lock first */
+        vma = lock_vma_under_rcu(mm, addr);
+        if (!vma)
+                goto lock_mmap;
+
+        if (binder_alloc_is_mapped(alloc))
+                ret = vm_insert_page(vma, addr, page);
+        vma_end_read(vma);
+        goto done;
+
+lock_mmap:
+        /* fall back to mmap_lock */
+        mmap_read_lock(mm);
+        vma = vma_lookup(mm, addr);
+        if (vma && binder_alloc_is_mapped(alloc))
+                ret = vm_insert_page(vma, addr, page);
+        mmap_read_unlock(mm);
+done:
+        mmput_async(mm);
+        return ret;
+}
+
 static struct page *binder_page_alloc(struct binder_alloc *alloc,
                                       unsigned long index,
                                       unsigned long addr)
@@ -254,31 +304,14 @@ static int binder_install_single_page(struct binder_alloc *alloc,
unsigned long index,
unsigned long addr)
{
- struct vm_area_struct *vma;
struct page *page;
- long npages;
int ret;
- if (!mmget_not_zero(alloc->mm))
- return -ESRCH;
-
page = binder_page_alloc(alloc, index, addr);
- if (!page) {
- ret = -ENOMEM;
- goto out;
- }
-
- mmap_read_lock(alloc->mm);
- vma = vma_lookup(alloc->mm, addr);
- if (!vma || !binder_alloc_is_mapped(alloc)) {
- mmap_read_unlock(alloc->mm);
- __free_page(page);
- pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
- ret = -ESRCH;
- goto out;
- }
+ if (!page)
+ return -ENOMEM;
- ret = vm_insert_page(vma, addr, page);
+ ret = binder_page_insert(alloc, addr, page);
switch (ret) {
case -EBUSY:
/*
@@ -288,12 +321,11 @@ static int binder_install_single_page(struct binder_alloc *alloc,
*/
ret = 0;
__free_page(page);
- npages = get_user_pages_remote(alloc->mm, addr, 1, 0, &page, NULL);
- if (npages <= 0) {
+ page = binder_page_lookup(alloc->mm, addr);
+ if (!page) {
pr_err("%d: failed to find page at offset %lx\n",
alloc->pid, addr - alloc->vm_start);
- ret = -ESRCH;
- break;
+ return -ESRCH;
}
fallthrough;
case 0:
@@ -304,12 +336,9 @@ static int binder_install_single_page(struct binder_alloc *alloc,
__free_page(page);
pr_err("%d: %s failed to insert page at offset %lx with %d\n",
alloc->pid, __func__, addr - alloc->vm_start, ret);
- ret = -ENOMEM;
break;
}
- mmap_read_unlock(alloc->mm);
-out:
- mmput_async(alloc->mm);
+
return ret;
}
--
2.47.0.199.ga7371fff76-goog
On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
>
> Use per-vma locking for concurrent page installations, this minimizes
> contention with unrelated vmas improving performance. The mmap_lock is
> still acquired when needed though, e.g. before get_user_pages_remote().
>
> Many thanks to Barry Song who posted a similar approach [1].
>
> Link: https://lore.kernel.org/all/20240902225009.34576-1-21cnbao@gmail.com/ [1]
> Cc: Nhat Pham <nphamcs@gmail.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Barry Song <v-songbaohua@oppo.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Hillf Danton <hdanton@sina.com>
> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Signed-off-by: Carlos Llamas <cmllamas@google.com>
> ---
> drivers/android/binder_alloc.c | 85 +++++++++++++++++++++++-----------
> 1 file changed, 57 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
> index 814435a2601a..debfa541e01b 100644
> --- a/drivers/android/binder_alloc.c
> +++ b/drivers/android/binder_alloc.c
> @@ -233,6 +233,56 @@ static inline bool binder_alloc_is_mapped(struct binder_alloc *alloc)
> return smp_load_acquire(&alloc->mapped);
> }
>
> +static struct page *binder_page_lookup(struct mm_struct *mm,
Maybe pass "struct binder_alloc" in both binder_page_lookup() and
binder_page_insert()?
I like how the previous code stabilized the mm with mmget_not_zero()
once, vs now binder_page_lookup() and binder_page_insert() having to
mmget/mmput individually. Not a big deal but it looked cleaner.
> + unsigned long addr)
> +{
> + struct page *page;
> + long ret;
> +
> + if (!mmget_not_zero(mm))
> + return NULL;
> +
> + mmap_read_lock(mm);
> + ret = get_user_pages_remote(mm, addr, 1, 0, &page, NULL);
> + mmap_read_unlock(mm);
> + mmput_async(mm);
> +
> + return ret > 0 ? page : NULL;
> +}
> +
> +static int binder_page_insert(struct binder_alloc *alloc,
> + unsigned long addr,
> + struct page *page)
> +{
> + struct mm_struct *mm = alloc->mm;
> + struct vm_area_struct *vma;
> + int ret = -ESRCH;
> +
> + if (!mmget_not_zero(mm))
> + return -ESRCH;
> +
> + /* attempt per-vma lock first */
> + vma = lock_vma_under_rcu(mm, addr);
> + if (!vma)
> + goto lock_mmap;
> +
> + if (binder_alloc_is_mapped(alloc))
I don't think you need this check here. lock_vma_under_rcu() ensures
that the VMA was not detached from the tree after locking the VMA, so
if you got a VMA it's in the tree and it can't be removed (because
it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
called after VMA gets detached from the tree and that won't happen
while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
binder_alloc_is_mapped() has to always return true. A WARN_ON() check
here to ensure that might be a better option.
> + ret = vm_insert_page(vma, addr, page);
> + vma_end_read(vma);
> + goto done;
I think the code would be more readable without these jumps:
vma = lock_vma_under_rcu(mm, addr);
if (vma) {
        if (!WARN_ON(!binder_alloc_is_mapped(alloc)))
                ret = vm_insert_page(vma, addr, page);
        vma_end_read(vma);
} else {
        /* fall back to mmap_lock */
        mmap_read_lock(mm);
        vma = vma_lookup(mm, addr);
        if (vma && binder_alloc_is_mapped(alloc))
                ret = vm_insert_page(vma, addr, page);
        mmap_read_unlock(mm);
}
mmput_async(mm);
return ret;
> +
> +lock_mmap:
> + /* fall back to mmap_lock */
> + mmap_read_lock(mm);
> + vma = vma_lookup(mm, addr);
> + if (vma && binder_alloc_is_mapped(alloc))
> + ret = vm_insert_page(vma, addr, page);
> + mmap_read_unlock(mm);
> +done:
> + mmput_async(mm);
> + return ret;
> +}
> +
> static struct page *binder_page_alloc(struct binder_alloc *alloc,
> unsigned long index,
> unsigned long addr)
> @@ -254,31 +304,14 @@ static int binder_install_single_page(struct binder_alloc *alloc,
> unsigned long index,
> unsigned long addr)
> {
> - struct vm_area_struct *vma;
> struct page *page;
> - long npages;
> int ret;
>
> - if (!mmget_not_zero(alloc->mm))
> - return -ESRCH;
> -
> page = binder_page_alloc(alloc, index, addr);
> - if (!page) {
> - ret = -ENOMEM;
> - goto out;
> - }
> -
> - mmap_read_lock(alloc->mm);
> - vma = vma_lookup(alloc->mm, addr);
> - if (!vma || !binder_alloc_is_mapped(alloc)) {
> - mmap_read_unlock(alloc->mm);
> - __free_page(page);
> - pr_err("%d: %s failed, no vma\n", alloc->pid, __func__);
> - ret = -ESRCH;
> - goto out;
> - }
> + if (!page)
> + return -ENOMEM;
>
> - ret = vm_insert_page(vma, addr, page);
> + ret = binder_page_insert(alloc, addr, page);
> switch (ret) {
> case -EBUSY:
> /*
> @@ -288,12 +321,11 @@ static int binder_install_single_page(struct binder_alloc *alloc,
> */
> ret = 0;
> __free_page(page);
> - npages = get_user_pages_remote(alloc->mm, addr, 1, 0, &page, NULL);
> - if (npages <= 0) {
> + page = binder_page_lookup(alloc->mm, addr);
> + if (!page) {
> pr_err("%d: failed to find page at offset %lx\n",
> alloc->pid, addr - alloc->vm_start);
> - ret = -ESRCH;
> - break;
> + return -ESRCH;
> }
> fallthrough;
> case 0:
> @@ -304,12 +336,9 @@ static int binder_install_single_page(struct binder_alloc *alloc,
> __free_page(page);
> pr_err("%d: %s failed to insert page at offset %lx with %d\n",
> alloc->pid, __func__, addr - alloc->vm_start, ret);
> - ret = -ENOMEM;
> break;
> }
> - mmap_read_unlock(alloc->mm);
> -out:
> - mmput_async(alloc->mm);
> +
> return ret;
> }
>
> --
> 2.47.0.199.ga7371fff76-goog
>
On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
> >
> > Use per-vma locking for concurrent page installations, this minimizes
> > contention with unrelated vmas improving performance. The mmap_lock is
> > still acquired when needed though, e.g. before get_user_pages_remote().
> >
> > Many thanks to Barry Song who posted a similar approach [1].
> >
> > Link: https://lore.kernel.org/all/20240902225009.34576-1-21cnbao@gmail.com/ [1]
> > Cc: Nhat Pham <nphamcs@gmail.com>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Barry Song <v-songbaohua@oppo.com>
> > Cc: Suren Baghdasaryan <surenb@google.com>
> > Cc: Hillf Danton <hdanton@sina.com>
> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Signed-off-by: Carlos Llamas <cmllamas@google.com>
> > ---
> > drivers/android/binder_alloc.c | 85 +++++++++++++++++++++++-----------
> > 1 file changed, 57 insertions(+), 28 deletions(-)
> >
> > diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
> > index 814435a2601a..debfa541e01b 100644
> > --- a/drivers/android/binder_alloc.c
> > +++ b/drivers/android/binder_alloc.c
> > @@ -233,6 +233,56 @@ static inline bool binder_alloc_is_mapped(struct binder_alloc *alloc)
> > return smp_load_acquire(&alloc->mapped);
> > }
> >
> > +static struct page *binder_page_lookup(struct mm_struct *mm,
>
> Maybe pass "struct binder_alloc" in both binder_page_lookup() and
> binder_page_insert()?
I'm not sure this is worth it though. Yeah, it would match
binder_page_insert() nicely, but there is no use for alloc in
binder_page_lookup(). Its only purpose would be to access the mm:
static struct page *binder_page_lookup(struct binder_alloc *alloc,
                                       unsigned long addr)
{
        struct mm_struct *mm = alloc->mm;
If you think this is cleaner I really don't mind adding it for v3.
> I like how previous code stabilized mm with mmget_not_zero() once vs
> now binder_page_lookup() and binder_page_insert() have to mmget/mmput
> individually. Not a big deal but looked cleaner.
Sure, I can factor this out (the way it was in v1).
>
> > + unsigned long addr)
> > +{
> > + struct page *page;
> > + long ret;
> > +
> > + if (!mmget_not_zero(mm))
> > + return NULL;
> > +
> > + mmap_read_lock(mm);
> > + ret = get_user_pages_remote(mm, addr, 1, 0, &page, NULL);
> > + mmap_read_unlock(mm);
> > + mmput_async(mm);
> > +
> > + return ret > 0 ? page : NULL;
> > +}
> > +
> > +static int binder_page_insert(struct binder_alloc *alloc,
> > + unsigned long addr,
> > + struct page *page)
> > +{
> > + struct mm_struct *mm = alloc->mm;
> > + struct vm_area_struct *vma;
> > + int ret = -ESRCH;
> > +
> > + if (!mmget_not_zero(mm))
> > + return -ESRCH;
> > +
> > + /* attempt per-vma lock first */
> > + vma = lock_vma_under_rcu(mm, addr);
> > + if (!vma)
> > + goto lock_mmap;
> > +
> > + if (binder_alloc_is_mapped(alloc))
>
> I don't think you need this check here. lock_vma_under_rcu() ensures
> that the VMA was not detached from the tree after locking the VMA, so
> if you got a VMA it's in the tree and it can't be removed (because
> it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
> called after VMA gets detached from the tree and that won't happen
> while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
> binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> here to ensure that might be a better option.
Yes, we are guaranteed to have _a_ non-isolated vma. However, the check
validates that it's the _expected_ vma. IIUC, our vma could have been
unmapped (clearing alloc->mapped) and a _new_, unrelated vma could have
been assigned the same address range?

The binder_alloc_is_mapped() check verifies that the vma belongs to
binder. This reminds me, I should also add this check before
get_user_pages_remote().
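
Roughly something like this, as an untested sketch of what I mean (it
also assumes the binder_alloc parameter you suggested above):

static struct page *binder_page_lookup(struct binder_alloc *alloc,
                                       unsigned long addr)
{
        struct mm_struct *mm = alloc->mm;
        struct page *page = NULL;
        long npages = 0;

        if (!mmget_not_zero(mm))
                return NULL;

        mmap_read_lock(mm);
        /* don't bother with the GUP walk if the binder mapping is gone */
        if (binder_alloc_is_mapped(alloc))
                npages = get_user_pages_remote(mm, addr, 1, 0, &page, NULL);
        mmap_read_unlock(mm);
        mmput_async(mm);

        return npages > 0 ? page : NULL;
}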
>
> > + ret = vm_insert_page(vma, addr, page);
> > + vma_end_read(vma);
> > + goto done;
>
> I think the code would be more readable without these jumps:
>
> vma = lock_vma_under_rcu(mm, addr);
> if (vma) {
> if (!WARN_ON(!binder_alloc_is_mapped(alloc)))
> ret = vm_insert_page(vma, addr, page);
> vma_end_read(vma);
> } else {
> /* fall back to mmap_lock */
> mmap_read_lock(mm);
> vma = vma_lookup(mm, addr);
> if (vma && binder_alloc_is_mapped(alloc))
> ret = vm_insert_page(vma, addr, page);
> mmap_read_unlock(mm);
> }
> mmput_async(mm);
> return ret;
Ok. I'm thinking that, with mmput_async() factored out, I'll add an
early return, e.g.:
vma = lock_vma_under_rcu(mm, addr);
if (vma) {
        if (binder_alloc_is_mapped(alloc))
                ret = vm_insert_page(vma, addr, page);
        vma_end_read(vma);
        return ret;
}

/* fall back to mmap_lock */
mmap_read_lock(mm);
[...]
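
For completeness, the whole helper would then look roughly like this
(only a sketch, assuming the mmget/mmput pair is hoisted out into
binder_install_single_page as discussed):

static int binder_page_insert(struct binder_alloc *alloc,
                              unsigned long addr,
                              struct page *page)
{
        struct mm_struct *mm = alloc->mm;
        struct vm_area_struct *vma;
        int ret = -ESRCH;

        /* attempt per-vma lock first */
        vma = lock_vma_under_rcu(mm, addr);
        if (vma) {
                if (binder_alloc_is_mapped(alloc))
                        ret = vm_insert_page(vma, addr, page);
                vma_end_read(vma);
                return ret;
        }

        /* fall back to mmap_lock */
        mmap_read_lock(mm);
        vma = vma_lookup(mm, addr);
        if (vma && binder_alloc_is_mapped(alloc))
                ret = vm_insert_page(vma, addr, page);
        mmap_read_unlock(mm);

        return ret;
}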
Thanks,
Carlos Llamas
On Thu, Nov 7, 2024 at 9:55 AM Carlos Llamas <cmllamas@google.com> wrote:
>
> On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> > On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
> > >
> > > Use per-vma locking for concurrent page installations, this minimizes
> > > contention with unrelated vmas improving performance. The mmap_lock is
> > > still acquired when needed though, e.g. before get_user_pages_remote().
> > >
> > > Many thanks to Barry Song who posted a similar approach [1].
> > >
> > > Link: https://lore.kernel.org/all/20240902225009.34576-1-21cnbao@gmail.com/ [1]
> > > Cc: Nhat Pham <nphamcs@gmail.com>
> > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > Cc: Barry Song <v-songbaohua@oppo.com>
> > > Cc: Suren Baghdasaryan <surenb@google.com>
> > > Cc: Hillf Danton <hdanton@sina.com>
> > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > Signed-off-by: Carlos Llamas <cmllamas@google.com>
> > > ---
> > > drivers/android/binder_alloc.c | 85 +++++++++++++++++++++++-----------
> > > 1 file changed, 57 insertions(+), 28 deletions(-)
> > >
> > > diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
> > > index 814435a2601a..debfa541e01b 100644
> > > --- a/drivers/android/binder_alloc.c
> > > +++ b/drivers/android/binder_alloc.c
> > > @@ -233,6 +233,56 @@ static inline bool binder_alloc_is_mapped(struct binder_alloc *alloc)
> > > return smp_load_acquire(&alloc->mapped);
> > > }
> > >
> > > +static struct page *binder_page_lookup(struct mm_struct *mm,
> >
> > Maybe pass "struct binder_alloc" in both binder_page_lookup() and
> > binder_page_insert()?
>
> I'm not sure this is worth it though. Yeah, it would match with
> binder_page_insert() nicely, but also there is no usage for alloc in
> binder_page_lookup(). It's only purpose would be to access the mm:
>
> static struct page *binder_page_lookup(struct binder_alloc *alloc,
> unsigned long addr)
> {
> struct mm_struct *mm = alloc->mm;
>
> If you think this is cleaner I really don't mind adding it for v3.
>
> > I like how previous code stabilized mm with mmget_not_zero() once vs
> > now binder_page_lookup() and binder_page_insert() have to mmget/mmput
> > individually. Not a big deal but looked cleaner.
>
> Sure, I can factor this out (the way it was in v1).
>
> >
> > > + unsigned long addr)
> > > +{
> > > + struct page *page;
> > > + long ret;
> > > +
> > > + if (!mmget_not_zero(mm))
> > > + return NULL;
> > > +
> > > + mmap_read_lock(mm);
> > > + ret = get_user_pages_remote(mm, addr, 1, 0, &page, NULL);
> > > + mmap_read_unlock(mm);
> > > + mmput_async(mm);
> > > +
> > > + return ret > 0 ? page : NULL;
> > > +}
> > > +
> > > +static int binder_page_insert(struct binder_alloc *alloc,
> > > + unsigned long addr,
> > > + struct page *page)
> > > +{
> > > + struct mm_struct *mm = alloc->mm;
> > > + struct vm_area_struct *vma;
> > > + int ret = -ESRCH;
> > > +
> > > + if (!mmget_not_zero(mm))
> > > + return -ESRCH;
> > > +
> > > + /* attempt per-vma lock first */
> > > + vma = lock_vma_under_rcu(mm, addr);
> > > + if (!vma)
> > > + goto lock_mmap;
> > > +
> > > + if (binder_alloc_is_mapped(alloc))
> >
> > I don't think you need this check here. lock_vma_under_rcu() ensures
> > that the VMA was not detached from the tree after locking the VMA, so
> > if you got a VMA it's in the tree and it can't be removed (because
> > it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
> > called after VMA gets detached from the tree and that won't happen
> > while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
> > binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> > here to ensure that might be a better option.
>
> Yes we are guaranteed to have _a_ non-isolated vma. However, the check
> validates that it's the _expected_ vma. IIUC, our vma could have been
> unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
> gotten the same address space assigned?
No, this should never happen. lock_vma_under_rcu() specifically checks
the address range *after* it locks the VMA:
https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026
>
> The binder_alloc_is_mapped() checks if the vma belongs to binder. This
> reminds me, I should also check this for get_user_pages_remote().
>
> >
> > > + ret = vm_insert_page(vma, addr, page);
> > > + vma_end_read(vma);
> > > + goto done;
> >
> > I think the code would be more readable without these jumps:
> >
> > vma = lock_vma_under_rcu(mm, addr);
> > if (vma) {
> > if (!WARN_ON(!binder_alloc_is_mapped(alloc)))
> > ret = vm_insert_page(vma, addr, page);
> > vma_end_read(vma);
> > } else {
> > /* fall back to mmap_lock */
> > mmap_read_lock(mm);
> > vma = vma_lookup(mm, addr);
> > if (vma && binder_alloc_is_mapped(alloc))
> > ret = vm_insert_page(vma, addr, page);
> > mmap_read_unlock(mm);
> > }
> > mmput_async(mm);
> > return ret;
>
> Ok. I'm thinking with mmput_async() being factored out, I'll add an
> early return. e.g.:
>
> vma = lock_vma_under_rcu(mm, addr);
> if (vma) {
> if (binder_alloc_is_mapped(alloc))
> ret = vm_insert_page(vma, addr, page);
> vma_end_read(vma);
> return ret;
> }
>
> /* fall back to mmap_lock */
> mmap_read_lock(mm);
> [...]
>
>
> Thanks,
> Carlos Llamas
On Thu, Nov 07, 2024 at 10:04:23AM -0800, Suren Baghdasaryan wrote:
> On Thu, Nov 7, 2024 at 9:55 AM Carlos Llamas <cmllamas@google.com> wrote:
> > On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> > > On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
> > > > +static int binder_page_insert(struct binder_alloc *alloc,
> > > > + unsigned long addr,
> > > > + struct page *page)
> > > > +{
> > > > + struct mm_struct *mm = alloc->mm;
> > > > + struct vm_area_struct *vma;
> > > > + int ret = -ESRCH;
> > > > +
> > > > + if (!mmget_not_zero(mm))
> > > > + return -ESRCH;
> > > > +
> > > > + /* attempt per-vma lock first */
> > > > + vma = lock_vma_under_rcu(mm, addr);
> > > > + if (!vma)
> > > > + goto lock_mmap;
> > > > +
> > > > + if (binder_alloc_is_mapped(alloc))
> > >
> > > I don't think you need this check here. lock_vma_under_rcu() ensures
> > > that the VMA was not detached from the tree after locking the VMA, so
> > > if you got a VMA it's in the tree and it can't be removed (because
> > > it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
> > > called after VMA gets detached from the tree and that won't happen
> > > while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
> > > binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> > > here to ensure that might be a better option.
> >
> > Yes we are guaranteed to have _a_ non-isolated vma. However, the check
> > validates that it's the _expected_ vma. IIUC, our vma could have been
> > unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
> > gotten the same address space assigned?
>
> No, this should never happen. lock_vma_under_rcu() specifically checks
> the address range *after* it locks the VMA:
> https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026
The scenario I'm describing is the following:
Proc A                            Proc B
                                  mmap(addr, binder_fd)
binder_page_insert()
  mmget_not_zero()
                                  munmap(addr)
                                    alloc->mapped = false;
  [...]
                                  // mmap other vma but same addr
                                  mmap(addr, other_fd)

  vma = lock_vma_under_rcu()
Isn't there a chance that the vma Proc A receives is an unrelated vma
that was placed in the same address range?
On Thu, Nov 7, 2024 at 10:19 AM Carlos Llamas <cmllamas@google.com> wrote:
>
> On Thu, Nov 07, 2024 at 10:04:23AM -0800, Suren Baghdasaryan wrote:
> > On Thu, Nov 7, 2024 at 9:55 AM Carlos Llamas <cmllamas@google.com> wrote:
> > > On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> > > > On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
> > > > > +static int binder_page_insert(struct binder_alloc *alloc,
> > > > > + unsigned long addr,
> > > > > + struct page *page)
> > > > > +{
> > > > > + struct mm_struct *mm = alloc->mm;
> > > > > + struct vm_area_struct *vma;
> > > > > + int ret = -ESRCH;
> > > > > +
> > > > > + if (!mmget_not_zero(mm))
> > > > > + return -ESRCH;
> > > > > +
> > > > > + /* attempt per-vma lock first */
> > > > > + vma = lock_vma_under_rcu(mm, addr);
> > > > > + if (!vma)
> > > > > + goto lock_mmap;
> > > > > +
> > > > > + if (binder_alloc_is_mapped(alloc))
> > > >
> > > > I don't think you need this check here. lock_vma_under_rcu() ensures
> > > > that the VMA was not detached from the tree after locking the VMA, so
> > > > if you got a VMA it's in the tree and it can't be removed (because
> > > > it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
> > > > called after VMA gets detached from the tree and that won't happen
> > > > while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
> > > > binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> > > > here to ensure that might be a better option.
> > >
> > > Yes we are guaranteed to have _a_ non-isolated vma. However, the check
> > > validates that it's the _expected_ vma. IIUC, our vma could have been
> > > unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
> > > gotten the same address space assigned?
> >
> > No, this should never happen. lock_vma_under_rcu() specifically checks
> > the address range *after* it locks the VMA:
> > https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026
>
> The scenario I'm describing is the following:
>
> Proc A                            Proc B
>                                   mmap(addr, binder_fd)
> binder_page_insert()
>   mmget_not_zero()
>                                   munmap(addr)
>                                     alloc->mapped = false;
>   [...]
>                                   // mmap other vma but same addr
>                                   mmap(addr, other_fd)
>
>   vma = lock_vma_under_rcu()
>
> Isn't there a chance for the vma that Proc A receives is an unrelated
> vma that was placed in the same address range?
Ah, I see now. The VMA is a valid one and at the address we specified
but it does not belong to the binder. Yes, then you do need this
check.
On Thu, Nov 7, 2024 at 10:27 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Nov 7, 2024 at 10:19 AM Carlos Llamas <cmllamas@google.com> wrote:
> >
> > On Thu, Nov 07, 2024 at 10:04:23AM -0800, Suren Baghdasaryan wrote:
> > > On Thu, Nov 7, 2024 at 9:55 AM Carlos Llamas <cmllamas@google.com> wrote:
> > > > On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> > > > > On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
> > > > > > +static int binder_page_insert(struct binder_alloc *alloc,
> > > > > > + unsigned long addr,
> > > > > > + struct page *page)
> > > > > > +{
> > > > > > + struct mm_struct *mm = alloc->mm;
> > > > > > + struct vm_area_struct *vma;
> > > > > > + int ret = -ESRCH;
> > > > > > +
> > > > > > + if (!mmget_not_zero(mm))
> > > > > > + return -ESRCH;
> > > > > > +
> > > > > > + /* attempt per-vma lock first */
> > > > > > + vma = lock_vma_under_rcu(mm, addr);
> > > > > > + if (!vma)
> > > > > > + goto lock_mmap;
> > > > > > +
> > > > > > + if (binder_alloc_is_mapped(alloc))
> > > > >
> > > > > I don't think you need this check here. lock_vma_under_rcu() ensures
> > > > > that the VMA was not detached from the tree after locking the VMA, so
> > > > > if you got a VMA it's in the tree and it can't be removed (because
> > > > > it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
> > > > > called after VMA gets detached from the tree and that won't happen
> > > > > while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
> > > > > binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> > > > > here to ensure that might be a better option.
> > > >
> > > > Yes we are guaranteed to have _a_ non-isolated vma. However, the check
> > > > validates that it's the _expected_ vma. IIUC, our vma could have been
> > > > unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
> > > > gotten the same address space assigned?
> > >
> > > No, this should never happen. lock_vma_under_rcu() specifically checks
> > > the address range *after* it locks the VMA:
> > > https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026
> >
> > The scenario I'm describing is the following:
> >
> > Proc A                            Proc B
> >                                   mmap(addr, binder_fd)
> > binder_page_insert()
> >   mmget_not_zero()
> >                                   munmap(addr)
> >                                     alloc->mapped = false;
> >   [...]
> >                                   // mmap other vma but same addr
> >                                   mmap(addr, other_fd)
> >
> >   vma = lock_vma_under_rcu()
> >
> > Isn't there a chance for the vma that Proc A receives is an unrelated
> > vma that was placed in the same address range?
>
> Ah, I see now. The VMA is a valid one and at the address we specified
> but it does not belong to the binder. Yes, then you do need this
> check.
Is this scenario possible?:
Proc A                            Proc B
                                  mmap(addr, binder_fd)
binder_page_insert()
  mmget_not_zero()
                                  munmap(addr)
                                    alloc->mapped = false;
  [...]
                                  // mmap other vma but same addr
                                  mmap(addr, other_fd)
                                  mmap(other_addr, binder_fd)
  vma = lock_vma_under_rcu(addr)
If so, I think your binder_alloc_is_mapped() check will return true
but the binder area is mapped at a different other_addr. To avoid that
I think you can check that "addr" still belongs to [alloc->vm_start,
alloc->vm_start + alloc->buffer_size) after you have obtained and
locked the VMA.
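
Something along the lines of this hypothetical helper (made-up name) is
what I have in mind, used in place of the bare binder_alloc_is_mapped()
check once you hold the (locked) vma:

static inline bool binder_vma_is_ours(struct binder_alloc *alloc,
                                      unsigned long addr)
{
        /* still the binder mapping, and addr still inside it */
        return binder_alloc_is_mapped(alloc) &&
               addr >= alloc->vm_start &&
               addr < alloc->vm_start + alloc->buffer_size;
}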
On Thu, Nov 07, 2024 at 10:52:30AM -0800, Suren Baghdasaryan wrote:
> On Thu, Nov 7, 2024 at 10:27 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Thu, Nov 7, 2024 at 10:19 AM Carlos Llamas <cmllamas@google.com> wrote:
> > >
> > > On Thu, Nov 07, 2024 at 10:04:23AM -0800, Suren Baghdasaryan wrote:
> > > > On Thu, Nov 7, 2024 at 9:55 AM Carlos Llamas <cmllamas@google.com> wrote:
> > > > > On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> > > > > > On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
> > > > > > > +static int binder_page_insert(struct binder_alloc *alloc,
> > > > > > > + unsigned long addr,
> > > > > > > + struct page *page)
> > > > > > > +{
> > > > > > > + struct mm_struct *mm = alloc->mm;
> > > > > > > + struct vm_area_struct *vma;
> > > > > > > + int ret = -ESRCH;
> > > > > > > +
> > > > > > > + if (!mmget_not_zero(mm))
> > > > > > > + return -ESRCH;
> > > > > > > +
> > > > > > > + /* attempt per-vma lock first */
> > > > > > > + vma = lock_vma_under_rcu(mm, addr);
> > > > > > > + if (!vma)
> > > > > > > + goto lock_mmap;
> > > > > > > +
> > > > > > > + if (binder_alloc_is_mapped(alloc))
> > > > > >
> > > > > > I don't think you need this check here. lock_vma_under_rcu() ensures
> > > > > > that the VMA was not detached from the tree after locking the VMA, so
> > > > > > if you got a VMA it's in the tree and it can't be removed (because
> > > > > > it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
> > > > > > called after VMA gets detached from the tree and that won't happen
> > > > > > while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
> > > > > > binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> > > > > > here to ensure that might be a better option.
> > > > >
> > > > > Yes we are guaranteed to have _a_ non-isolated vma. However, the check
> > > > > validates that it's the _expected_ vma. IIUC, our vma could have been
> > > > > unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
> > > > > gotten the same address space assigned?
> > > >
> > > > No, this should never happen. lock_vma_under_rcu() specifically checks
> > > > the address range *after* it locks the VMA:
> > > > https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026
> > >
> > > The scenario I'm describing is the following:
> > >
> > > Proc A                            Proc B
> > >                                   mmap(addr, binder_fd)
> > > binder_page_insert()
> > >   mmget_not_zero()
> > >                                   munmap(addr)
> > >                                     alloc->mapped = false;
> > >   [...]
> > >                                   // mmap other vma but same addr
> > >                                   mmap(addr, other_fd)
> > >
> > >   vma = lock_vma_under_rcu()
> > >
> > > Isn't there a chance for the vma that Proc A receives is an unrelated
> > > vma that was placed in the same address range?
> >
> > Ah, I see now. The VMA is a valid one and at the address we specified
> > but it does not belong to the binder. Yes, then you do need this
> > check.
>
> Is this scenario possible?:
>
> Proc A                            Proc B
>                                   mmap(addr, binder_fd)
> binder_page_insert()
>   mmget_not_zero()
>                                   munmap(addr)
>                                     alloc->mapped = false;
>   [...]
>                                   // mmap other vma but same addr
>                                   mmap(addr, other_fd)
>                                   mmap(other_addr, binder_fd)
>   vma = lock_vma_under_rcu(addr)
>
> If so, I think your binder_alloc_is_mapped() check will return true
> but the binder area is mapped at a different other_addr. To avoid that
> I think you can check that "addr" still belongs to [alloc->vm_start,
> alloc->buffer_size] after you obtained and locked the VMA.
Wait, I thought that vm_ops->close() was called with the mmap_lock in
exclusive mode. This is where binder clears the alloc->mapped. If this
is not the case (was it ever?), then I'd definitely need to fix this.
I'll have a closer look.
Thanks,
Carlos Llamas
On Thu, Nov 07, 2024 at 07:33:24PM +0000, Carlos Llamas wrote:
> On Thu, Nov 07, 2024 at 10:52:30AM -0800, Suren Baghdasaryan wrote:
> > On Thu, Nov 7, 2024 at 10:27 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Thu, Nov 7, 2024 at 10:19 AM Carlos Llamas <cmllamas@google.com> wrote:
> > > >
> > > > On Thu, Nov 07, 2024 at 10:04:23AM -0800, Suren Baghdasaryan wrote:
> > > > > On Thu, Nov 7, 2024 at 9:55 AM Carlos Llamas <cmllamas@google.com> wrote:
> > > > > > On Thu, Nov 07, 2024 at 08:16:39AM -0800, Suren Baghdasaryan wrote:
> > > > > > > On Wed, Nov 6, 2024 at 8:03 PM Carlos Llamas <cmllamas@google.com> wrote:
> > > > > > > > +static int binder_page_insert(struct binder_alloc *alloc,
> > > > > > > > + unsigned long addr,
> > > > > > > > + struct page *page)
> > > > > > > > +{
> > > > > > > > + struct mm_struct *mm = alloc->mm;
> > > > > > > > + struct vm_area_struct *vma;
> > > > > > > > + int ret = -ESRCH;
> > > > > > > > +
> > > > > > > > + if (!mmget_not_zero(mm))
> > > > > > > > + return -ESRCH;
> > > > > > > > +
> > > > > > > > + /* attempt per-vma lock first */
> > > > > > > > + vma = lock_vma_under_rcu(mm, addr);
> > > > > > > > + if (!vma)
> > > > > > > > + goto lock_mmap;
> > > > > > > > +
> > > > > > > > + if (binder_alloc_is_mapped(alloc))
> > > > > > >
> > > > > > > I don't think you need this check here. lock_vma_under_rcu() ensures
> > > > > > > that the VMA was not detached from the tree after locking the VMA, so
> > > > > > > if you got a VMA it's in the tree and it can't be removed (because
> > > > > > > it's locked). remove_vma()->vma_close()->vma->vm_ops->close() is
> > > > > > > called after VMA gets detached from the tree and that won't happen
> > > > > > > while VMA is locked. So, if lock_vma_under_rcu() returns a VMA,
> > > > > > > binder_alloc_is_mapped() has to always return true. A WARN_ON() check
> > > > > > > here to ensure that might be a better option.
> > > > > >
> > > > > > Yes we are guaranteed to have _a_ non-isolated vma. However, the check
> > > > > > validates that it's the _expected_ vma. IIUC, our vma could have been
> > > > > > unmapped (clearing alloc->mapped) and a _new_ unrelated vma could have
> > > > > > gotten the same address space assigned?
> > > > >
> > > > > No, this should never happen. lock_vma_under_rcu() specifically checks
> > > > > the address range *after* it locks the VMA:
> > > > > https://elixir.bootlin.com/linux/v6.11.6/source/mm/memory.c#L6026
> > > >
> > > > The scenario I'm describing is the following:
> > > >
> > > > Proc A                            Proc B
> > > >                                   mmap(addr, binder_fd)
> > > > binder_page_insert()
> > > >   mmget_not_zero()
> > > >                                   munmap(addr)
> > > >                                     alloc->mapped = false;
> > > >   [...]
> > > >                                   // mmap other vma but same addr
> > > >                                   mmap(addr, other_fd)
> > > >
> > > >   vma = lock_vma_under_rcu()
> > > >
> > > > Isn't there a chance for the vma that Proc A receives is an unrelated
> > > > vma that was placed in the same address range?
> > >
> > > Ah, I see now. The VMA is a valid one and at the address we specified
> > > but it does not belong to the binder. Yes, then you do need this
> > > check.
> >
> > Is this scenario possible?:
> >
> > Proc A                            Proc B
> >                                   mmap(addr, binder_fd)
> > binder_page_insert()
> >   mmget_not_zero()
> >                                   munmap(addr)
> >                                     alloc->mapped = false;
> >   [...]
> >                                   // mmap other vma but same addr
> >                                   mmap(addr, other_fd)
> >                                   mmap(other_addr, binder_fd)
> >   vma = lock_vma_under_rcu(addr)
> >
> > If so, I think your binder_alloc_is_mapped() check will return true
> > but the binder area is mapped at a different other_addr. To avoid that
> > I think you can check that "addr" still belongs to [alloc->vm_start,
> > alloc->buffer_size] after you obtained and locked the VMA.
>
> Wait, I thought that vm_ops->close() was called with the mmap_lock in
> exclusive mode. This is where binder clears the alloc->mapped. If this
> is not the case (was it ever?), then I'd definitely need to fix this.
On second thought, we don't need the mmap_lock in exclusive mode. When
we acquire the vma through lock_vma_under_rcu() we guarantee it's not
isolated, and if our alloc->mapped is set, that means it has not been
unmapped either. So we are good to proceed.
--
Carlos Llamas