To enable SLAB_TYPESAFE_BY_RCU for the vma cache we need to ensure that
object reuse before the RCU grace period is over will be detected by
lock_vma_under_rcu().

The current checks are sufficient as long as the vma is detached before
it is freed. The only place where this does not currently happen is
exit_mmap(), so add the missing vma_mark_detached() there.

Another issue which might trick lock_vma_under_rcu() during vma reuse
is vm_area_dup(), which copies the entire content of the vma into a new
one, overwriting the new vma's vm_refcnt and temporarily making it
appear attached. This might trick a racing lock_vma_under_rcu() into
operating on a reused vma if it found the vma before it got reused. To
prevent this situation, ensure that vm_refcnt stays in the detached
state (0) while the vma contents are copied and advances to the
attached state only after the vma is added into the vma tree. Introduce
vma_copy(), which preserves the new vma's vm_refcnt, and use it in
vm_area_dup(). Since all vmas are in the detached state with no current
readers when they are freed, lock_vma_under_rcu() will not be able to
take a vm_refcnt reference after the vma is detached, even if the vma
is later reused.
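
As a rough sketch of the ordering this relies on (simplified and
abridged; the reader-side check comes from vma_start_read() introduced
earlier in this series, not from this patch):

    /* writer side, vm_area_dup(): the slab object may be a reused vma */
    new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
    vma_copy(orig, new);       /* copies fields, leaves new->vm_refcnt at 0 (detached) */
    vma_lock_init(new, true);
    /* vm_refcnt becomes non-zero only once the vma is inserted into the vma tree */

    /* reader side, lock_vma_under_rcu() -> vma_start_read() */
    if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
                                                  VMA_REF_LIMIT)))
        return false;          /* detached (vm_refcnt == 0) vma is rejected */
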
Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate
vm_area_struct reuse and will minimize the number of call_rcu() calls.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h               |  2 -
 include/linux/mm_types.h         | 10 +++--
 include/linux/slab.h             |  6 ---
 kernel/fork.c                    | 72 ++++++++++++++++++++------------
 mm/mmap.c                        |  3 +-
 mm/vma.c                         | 11 ++---
 mm/vma.h                         |  2 +-
 tools/testing/vma/vma_internal.h |  7 +---
 8 files changed, 59 insertions(+), 54 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1d6b1563b956..a674558e4c05 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -258,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code,
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
-/* Use only if VMA has no other users */
-void __vm_area_free(struct vm_area_struct *vma);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2d83d79d1899..93bfcd0c1fde 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -582,6 +582,12 @@ static inline void *folio_get_private(struct folio *folio)
typedef unsigned long vm_flags_t;
+/*
+ * freeptr_t represents a SLUB freelist pointer, which might be encoded
+ * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
+ */
+typedef struct { unsigned long v; } freeptr_t;
+
/*
* A region containing a mapping of a non-memory backed file under NOMMU
* conditions. These are held in a global tree and are pinned by the VMAs that
@@ -695,9 +701,7 @@ struct vm_area_struct {
unsigned long vm_start;
unsigned long vm_end;
};
-#ifdef CONFIG_PER_VMA_LOCK
- struct rcu_head vm_rcu; /* Used for deferred freeing. */
-#endif
+ freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
};
/*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 10a971c2bde3..681b685b6c4e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -234,12 +234,6 @@ enum _slab_flag_bits {
#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED
#endif
-/*
- * freeptr_t represents a SLUB freelist pointer, which might be encoded
- * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
- */
-typedef struct { unsigned long v; } freeptr_t;
-
/*
* ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
*
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d9275783cf8..770b973a099c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -449,6 +449,41 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
return vma;
}
+static void vma_copy(const struct vm_area_struct *src, struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+ sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+ dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+ sizeof(dest->swap_readahead_info));
+#endif
+#ifndef CONFIG_MMU
+ dest->vm_region = src->vm_region;
+#endif
+#ifdef CONFIG_NUMA
+ dest->vm_policy = src->vm_policy;
+#endif
+}
+
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
@@ -458,11 +493,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- data_race(memcpy(new, orig, sizeof(*new)));
+ vma_copy(orig, new);
vma_lock_init(new, true);
INIT_LIST_HEAD(&new->anon_vma_chain);
vma_numab_state_init(new);
@@ -471,7 +502,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
return new;
}
-void __vm_area_free(struct vm_area_struct *vma)
+void vm_area_free(struct vm_area_struct *vma)
{
/* The vma should be detached while being destroyed. */
vma_assert_detached(vma);
@@ -480,25 +511,6 @@ void __vm_area_free(struct vm_area_struct *vma)
kmem_cache_free(vm_area_cachep, vma);
}
-#ifdef CONFIG_PER_VMA_LOCK
-static void vm_area_free_rcu_cb(struct rcu_head *head)
-{
- struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
- vm_rcu);
-
- __vm_area_free(vma);
-}
-#endif
-
-void vm_area_free(struct vm_area_struct *vma)
-{
-#ifdef CONFIG_PER_VMA_LOCK
- call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
-#else
- __vm_area_free(vma);
-#endif
-}
-
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -3144,6 +3156,11 @@ void __init mm_cache_init(void)
void __init proc_caches_init(void)
{
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ };
+
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3160,8 +3177,9 @@ void __init proc_caches_init(void)
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct,
- SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC|
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
diff --git a/mm/mmap.c b/mm/mmap.c
index cda01071c7b1..7aa36216ecc0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1305,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, /* unreachable = */ true);
+ vma_mark_detached(vma);
+ remove_vma(vma);
count++;
cond_resched();
vma = vma_next(&vmi);
diff --git a/mm/vma.c b/mm/vma.c
index 93ff42ac2002..0a5158d611e3 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -406,19 +406,14 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
/*
* Close a vm structure and free it.
*/
-void remove_vma(struct vm_area_struct *vma, bool unreachable)
+void remove_vma(struct vm_area_struct *vma)
{
might_sleep();
vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
- if (unreachable) {
- vma_mark_detached(vma);
- __vm_area_free(vma);
- } else {
- vm_area_free(vma);
- }
+ vm_area_free(vma);
}
/*
@@ -1201,7 +1196,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- remove_vma(vma, /* unreachable = */ false);
+ remove_vma(vma);
vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
diff --git a/mm/vma.h b/mm/vma.h
index 63dd38d5230c..f51005b95b39 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -170,7 +170,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
-void remove_vma(struct vm_area_struct *vma, bool unreachable);
+void remove_vma(struct vm_area_struct *vma);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 2ce032943861..49a85ce0d45a 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -697,14 +697,9 @@ static inline void mpol_put(struct mempolicy *)
{
}
-static inline void __vm_area_free(struct vm_area_struct *vma)
-{
- free(vma);
-}
-
static inline void vm_area_free(struct vm_area_struct *vma)
{
- __vm_area_free(vma);
+ free(vma);
}
static inline void lru_add_drain(void)
--
2.47.1.613.gc27f4b7a9f-goog
* Suren Baghdasaryan <surenb@google.com> [250108 21:31]:
> To enable SLAB_TYPESAFE_BY_RCU for the vma cache we need to ensure that
> object reuse before the RCU grace period is over will be detected by
> lock_vma_under_rcu().
There exists a copy_vma() which copies the vma to a new area in the mm
in rmap. Naming this vma_copy() is confusing :)
It might be better to just put this code in the vm_area_dup() or call it
__vm_area_dup(), or __vma_dup() ?
> +static void vma_copy(const struct vm_area_struct *src, struct vm_area_struct *dest)
> +{
> + dest->vm_mm = src->vm_mm;
> + dest->vm_ops = src->vm_ops;
> + dest->vm_start = src->vm_start;
> + dest->vm_end = src->vm_end;
> + dest->anon_vma = src->anon_vma;
> + dest->vm_pgoff = src->vm_pgoff;
> + dest->vm_file = src->vm_file;
> + dest->vm_private_data = src->vm_private_data;
> + vm_flags_init(dest, src->vm_flags);
> + memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> + sizeof(dest->vm_page_prot));
> + /*
> + * src->shared.rb may be modified concurrently, but the clone
> + * will be reinitialized.
> + */
> + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> + sizeof(dest->vm_userfaultfd_ctx));
> +#ifdef CONFIG_ANON_VMA_NAME
> + dest->anon_name = src->anon_name;
> +#endif
> +#ifdef CONFIG_SWAP
> + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> + sizeof(dest->swap_readahead_info));
> +#endif
> +#ifndef CONFIG_MMU
> + dest->vm_region = src->vm_region;
> +#endif
> +#ifdef CONFIG_NUMA
> + dest->vm_policy = src->vm_policy;
> +#endif
> +}
> +
> struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> {
> struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> @@ -458,11 +493,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>
> ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> - /*
> - * orig->shared.rb may be modified concurrently, but the clone
> - * will be reinitialized.
> - */
> - data_race(memcpy(new, orig, sizeof(*new)));
> + vma_copy(orig, new);
> vma_lock_init(new, true);
I think this suffers from a race still?
That is, we can still race between vm_lock_seq == mm_lock_seq and the
lock acquire, where a free and reuse happens. In the event that the
reader is caught between the sequence and lock taking, the
vma->vmlock_dep_map may not be replaced and it could see the old lock
(or zero?) and things go bad:
It could try to take vmlock_dep_map == 0 in read mode.
It can take the old lock, detect the refcnt is wrong and release the new
lock.
Thanks,
Liam
On Fri, Jan 10, 2025 at 9:48 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
>
> There exists a copy_vma() which copies the vma to a new area in the mm
> in rmap. Naming this vma_copy() is confusing :)
>
> It might be better to just put this code in the vm_area_dup() or call it
> __vm_area_dup(), or __vma_dup() ?
Hmm. It's not really duplicating a vma but copying its content (no
allocation). How about __vm_area_copy() to indicate it is copying
vm_area_struct content?
>
> > - data_race(memcpy(new, orig, sizeof(*new)));
> > + vma_copy(orig, new);
> > vma_lock_init(new, true);
>
> I think this suffers from a race still?
>
> That is, we can still race between vm_lock_seq == mm_lock_seq and the
> > lock acquire, where a free and reuse happens. In the event that the
> reader is caught between the sequence and lock taking, the
> vma->vmlock_dep_map may not be replaced and it could see the old lock
> (or zero?) and things go bad:
>
> It could try to take vmlock_dep_map == 0 in read mode.
>
> It can take the old lock, detect the refcnt is wrong and release the new
> lock.
I don't think this race can happen. Notice a call to
vma_assert_detached() inside vm_area_free(), so before vma is freed
and possibly reused, it has to be detached. vma_mark_detached()
ensures that there are no current or future readers by executing the
__vma_enter_locked() + __vma_exit_locked() sequence if vm_refcnt is
not already at 0. Once __vma_exit_locked() is done, vm_refcnt is at 0
and any new reader will be rejected on
__refcount_inc_not_zero_limited(), before even checking vm_lock_seq ==
mm_lock_seq. Even if a reader tries to sneak in between
__vma_enter_locked() and __vma_exit_locked() calls,
__refcount_inc_not_zero_limited() will reject it because
VMA_LOCK_OFFSET is set and VMA_REF_LIMIT will be violated.
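
To spell the ordering out, the sequence I'm describing is roughly (a
simplified sketch of the steps, not the actual code in the series):

    /* vma_mark_detached(), called before vm_area_free() */
    if (refcount_read(&vma->vm_refcnt) != 0) {
        __vma_enter_locked(vma);   /* adds VMA_LOCK_OFFSET: a new reader's
                                    * refcount increment now exceeds
                                    * VMA_REF_LIMIT and is rejected */
        /* wait for existing readers to drop their references */
        __vma_exit_locked(vma);    /* drops VMA_LOCK_OFFSET, vm_refcnt ends at 0 */
    }
    /* vm_refcnt == 0: __refcount_inc_not_zero_limited() fails for any later reader */
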
IOW, when VMA is freed, it's guaranteed to be detached with no current
or future readers, therefore "race between vm_lock_seq == mm_lock_seq
and the lock acquire, where a free and reuse happens" should not be
possible.
Did I understand your concern correctly and does my explanation make
sense to you?
>
> Thanks,
> Liam
* Suren Baghdasaryan <surenb@google.com> [250110 14:08]:
> On Fri, Jan 10, 2025 at 9:48 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > There exists a copy_vma() which copies the vma to a new area in the mm
> > in rmap. Naming this vma_copy() is confusing :)
> >
> > It might be better to just put this code in the vm_area_dup() or call it
> > __vm_area_dup(), or __vma_dup() ?
>
> Hmm. It's not really duplicating a vma but copying its content (no
> allocation). How about __vm_area_copy() to indicate it is copying
> vm_area_struct content?
Sorry, I missed this. It's not copying all the content either.
vm_area_init_dup() maybe?
Considering the scope of the series, I'm not sure I want to have a
bike shed conversation.. But I also don't want copy_<foo> <foo>_copy
confusion in the future.
On Fri, Jan 10, 2025 at 11:51 AM 'Liam R. Howlett' via kernel-team
<kernel-team@android.com> wrote:
>
>
>
> Sorry, I missed this. It's not copying all the content either.
>
> vm_area_init_dup() maybe?
Ah, how about vm_area_init_from(src, dest)?
>
> Considering the scope of the series, I'm not sure I want to have a
> bike shed conversation.. But I also don't want copy_<foo> <foo>_copy
> confusion in the future.
>
* Suren Baghdasaryan <surenb@google.com> [250110 15:40]:
> >
> > vm_area_init_dup() maybe?
>
> Ah, how about vm_area_init_from(src, dest)?
>
Sure, thanks.
* Suren Baghdasaryan <surenb@google.com> [250110 14:08]:
> I don't think this race can happen. Notice a call to
> vma_assert_detached() inside vm_area_free(), so before vma is freed
> and possibly reused, it has to be detached. vma_mark_detached()
> ensures that there are no current or future readers by executing the
> __vma_enter_locked() + __vma_exit_locked() sequence if vm_refcnt is
> not already at 0. Once __vma_exit_locked() is done, vm_refcnt is at 0
> and any new reader will be rejected on
> __refcount_inc_not_zero_limited(), before even checking vm_lock_seq ==
> mm_lock_seq.
Isn't the vm_lock_seq check before the ref count in vma_start_read()?
From patch 11/16:
@@ -720,13 +752,19 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
 		return false;
-	if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
+	/*
+	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited() will fail
+	 * because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+	 */
+	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
+						      VMA_REF_LIMIT)))
 		return false;
+	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
> Even if a reader tries to sneak in between
> __vma_enter_locked() and __vma_exit_locked() calls,
> __refcount_inc_not_zero_limited() will reject it because
> VMA_LOCK_OFFSET is set and VMA_REF_LIMIT will be violated.
> IOW, when VMA is freed, it's guaranteed to be detached with no current
> or future readers, therefore "race between vm_lock_seq == mm_lock_seq
> and the lock acquire, where a free and reuse happens" should not be
> possible.
>
> Did I understand your concern correctly and does my explanation make
> sense to you?
It is close to what Vlastimil said before.
Here is the sequence for a NULL dereference, refcnt value is not needed:
A:                                B:                        C:
lock_vma_under_rcu()
  vma = mas_walk()
  vma_start_read()
    vm_lock_seq == mm->mm_lock_seq.sequence
                                  vma_start_write
                                  vma detached and freed
                                                            vm_area_dup()
                                                            - vma reallocated
                                                            - zero vma
    rwsem_acquire_read(NULL)
Here is a sequence for unlocking the new lock while locking the old one.
The refcnt failure detects the detached state but does not protect
against the wrong lock use:
A:                                B:                        C:
lock_vma_under_rcu()
  vma = mas_walk()
  vma_start_read()
    vm_lock_seq == mm->mm_lock_seq.sequence
                                  vma_start_write
                                  vma detached and freed
                                                            vm_area_dup()
                                                            - vma reallocated
    rwsem_acquire_read(old lock)
    __refcount_inc_not_zero_limited() fails
                                                            vma_init_lock();
    rwsem_release(new lock)
I don't think avoiding the copy of the ref count from the old vma is
enough to stop these races?
Thanks,
Liam
On Fri, Jan 10, 2025 at 11:46 AM 'Liam R. Howlett' via kernel-team
<kernel-team@android.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [250110 14:08]:
> > On Fri, Jan 10, 2025 at 9:48 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Suren Baghdasaryan <surenb@google.com> [250108 21:31]:
> > > > To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that
> > > > object reuse before RCU grace period is over will be detected by
> > > > lock_vma_under_rcu().
> > > > Current checks are sufficient as long as vma is detached before it is
> > > > freed. The only place this is not currently happening is in exit_mmap().
> > > > Add the missing vma_mark_detached() in exit_mmap().
> > > > Another issue which might trick lock_vma_under_rcu() during vma reuse
> > > > is vm_area_dup(), which copies the entire content of the vma into a new
> > > > one, overriding new vma's vm_refcnt and temporarily making it appear as
> > > > attached. This might trick a racing lock_vma_under_rcu() to operate on
> > > > a reused vma if it found the vma before it got reused. To prevent this
> > > > situation, we should ensure that vm_refcnt stays at detached state (0)
> > > > when it is copied and advances to attached state only after it is added
> > > > into the vma tree. Introduce vma_copy() which preserves new vma's
> > > > vm_refcnt and use it in vm_area_dup(). Since all vmas are in detached
> > > > state with no current readers when they are freed, lock_vma_under_rcu()
> > > > will not be able to take vm_refcnt after vma got detached even if vma
> > > > is reused.
> > > > Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate
> > > > vm_area_struct reuse and will minimize the number of call_rcu() calls.
> > > >
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > > ---
> > > > include/linux/mm.h | 2 -
> > > > include/linux/mm_types.h | 10 +++--
> > > > include/linux/slab.h | 6 ---
> > > > kernel/fork.c | 72 ++++++++++++++++++++------------
> > > > mm/mmap.c | 3 +-
> > > > mm/vma.c | 11 ++---
> > > > mm/vma.h | 2 +-
> > > > tools/testing/vma/vma_internal.h | 7 +---
> > > > 8 files changed, 59 insertions(+), 54 deletions(-)
> > > >
> > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > index 1d6b1563b956..a674558e4c05 100644
> > > > --- a/include/linux/mm.h
> > > > +++ b/include/linux/mm.h
> > > > @@ -258,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code,
> > > > struct vm_area_struct *vm_area_alloc(struct mm_struct *);
> > > > struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
> > > > void vm_area_free(struct vm_area_struct *);
> > > > -/* Use only if VMA has no other users */
> > > > -void __vm_area_free(struct vm_area_struct *vma);
> > > >
> > > > #ifndef CONFIG_MMU
> > > > extern struct rb_root nommu_region_tree;
> > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > > > index 2d83d79d1899..93bfcd0c1fde 100644
> > > > --- a/include/linux/mm_types.h
> > > > +++ b/include/linux/mm_types.h
> > > > @@ -582,6 +582,12 @@ static inline void *folio_get_private(struct folio *folio)
> > > >
> > > > typedef unsigned long vm_flags_t;
> > > >
> > > > +/*
> > > > + * freeptr_t represents a SLUB freelist pointer, which might be encoded
> > > > + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
> > > > + */
> > > > +typedef struct { unsigned long v; } freeptr_t;
> > > > +
> > > > /*
> > > > * A region containing a mapping of a non-memory backed file under NOMMU
> > > > * conditions. These are held in a global tree and are pinned by the VMAs that
> > > > @@ -695,9 +701,7 @@ struct vm_area_struct {
> > > > unsigned long vm_start;
> > > > unsigned long vm_end;
> > > > };
> > > > -#ifdef CONFIG_PER_VMA_LOCK
> > > > - struct rcu_head vm_rcu; /* Used for deferred freeing. */
> > > > -#endif
> > > > + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
> > > > };
> > > >
> > > > /*
> > > > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > > > index 10a971c2bde3..681b685b6c4e 100644
> > > > --- a/include/linux/slab.h
> > > > +++ b/include/linux/slab.h
> > > > @@ -234,12 +234,6 @@ enum _slab_flag_bits {
> > > > #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED
> > > > #endif
> > > >
> > > > -/*
> > > > - * freeptr_t represents a SLUB freelist pointer, which might be encoded
> > > > - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
> > > > - */
> > > > -typedef struct { unsigned long v; } freeptr_t;
> > > > -
> > > > /*
> > > > * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
> > > > *
> > > > diff --git a/kernel/fork.c b/kernel/fork.c
> > > > index 9d9275783cf8..770b973a099c 100644
> > > > --- a/kernel/fork.c
> > > > +++ b/kernel/fork.c
> > > > @@ -449,6 +449,41 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > > return vma;
> > > > }
> > > >
> > >
> > > There exists a copy_vma() which copies the vma to a new area in the mm
> > > in rmap. Naming this vma_copy() is confusing :)
> > >
> > > It might be better to just put this code in the vm_area_dup() or call it
> > > __vm_area_dup(), or __vma_dup() ?
> >
> > Hmm. It's not really duplicating a vma but copying its content (no
> > allocation). How about __vm_area_copy() to indicate it is copying
> > vm_area_struct content?
> >
> > >
> > > > +static void vma_copy(const struct vm_area_struct *src, struct vm_area_struct *dest)
> > > > +{
> > > > + dest->vm_mm = src->vm_mm;
> > > > + dest->vm_ops = src->vm_ops;
> > > > + dest->vm_start = src->vm_start;
> > > > + dest->vm_end = src->vm_end;
> > > > + dest->anon_vma = src->anon_vma;
> > > > + dest->vm_pgoff = src->vm_pgoff;
> > > > + dest->vm_file = src->vm_file;
> > > > + dest->vm_private_data = src->vm_private_data;
> > > > + vm_flags_init(dest, src->vm_flags);
> > > > + memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > > > + sizeof(dest->vm_page_prot));
> > > > + /*
> > > > + * src->shared.rb may be modified concurrently, but the clone
> > > > + * will be reinitialized.
> > > > + */
> > > > + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > > > + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > > > + sizeof(dest->vm_userfaultfd_ctx));
> > > > +#ifdef CONFIG_ANON_VMA_NAME
> > > > + dest->anon_name = src->anon_name;
> > > > +#endif
> > > > +#ifdef CONFIG_SWAP
> > > > + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > > > + sizeof(dest->swap_readahead_info));
> > > > +#endif
> > > > +#ifndef CONFIG_MMU
> > > > + dest->vm_region = src->vm_region;
> > > > +#endif
> > > > +#ifdef CONFIG_NUMA
> > > > + dest->vm_policy = src->vm_policy;
> > > > +#endif
> > > > +}
> > > > +
> > > > struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > > > {
> > > > struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> > > > @@ -458,11 +493,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > > >
> > > > ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> > > > ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> > > > - /*
> > > > - * orig->shared.rb may be modified concurrently, but the clone
> > > > - * will be reinitialized.
> > > > - */
> > > > - data_race(memcpy(new, orig, sizeof(*new)));
> > > > + vma_copy(orig, new);
> > > > vma_lock_init(new, true);
> > >
> > > I think this suffers from a race still?
> > >
> > > That is, we can still race between vm_lock_seq == mm_lock_seq and the
> > > lock acquire, where a free and reuse happens. In the event that the
> > > reader is caught between the sequence and lock taking, the
> > > vma->vmlock_dep_map may not be replaced and it could see the old lock
> > > (or zero?) and things go bad:
> > >
> > > It could try to take vmlock_dep_map == 0 in read mode.
> > >
> > > It can take the old lock, detect the refcnt is wrong and release the new
> > > lock.
> >
> > I don't think this race can happen. Notice a call to
> > vma_assert_detached() inside vm_area_free(), so before vma is freed
> > and possibly reused, it has to be detached. vma_mark_detached()
> > ensures that there are no current or future readers by executing the
> > __vma_enter_locked() + __vma_exit_locked() sequence if vm_refcnt is
> > not already at 0. Once __vma_exit_locked() is done, vm_refcnt is at 0
> > and any new reader will be rejected on
> > __refcount_inc_not_zero_limited(), before even checking vm_lock_seq ==
> > mm_lock_seq.
>
> Isn't the vm_lock_seq check before the ref count in vma_start_read()?
>
> From patch 11/16:
>
> @@ -720,13 +752,19 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
> return false;
>
> - if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
> + /*
> + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited() will fail
> + * because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
> + */
> + if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
> + VMA_REF_LIMIT)))
> return false;
>
> + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
>
>
>
> >Even if a reader tries to sneak in between
> > __vma_enter_locked() and __vma_exit_locked() calls,
> > __refcount_inc_not_zero_limited() will reject it because
> > VMA_LOCK_OFFSET is set and VMA_REF_LIMIT will be violated.
> > IOW, when VMA is freed, it's guaranteed to be detached with no current
> > or future readers, therefore "race between vm_lock_seq == mm_lock_seq
> > and the lock acquire, where a free and reuse happens" should not be
> > possible.
> >
> > Did I understand your concern correctly and does my explanation make
> > sense to you?
>
> It is close to what Vlastimil said before.
>
> Here is the sequence for a NULL dereference, refcnt value is not needed:
>
> A:                                        B:                       C:
> lock_vma_under_rcu()
> vma = mas_walk()
> vma_start_read()
> vm_lock_seq == mm->mm_lock_seq.sequence
>
>                                           vma_start_write
>                                           vma detached and freed
At this point B makes vm_refcnt==0.
>
>
>                                                                    vm_area_dup()
>                                                                    - vma reallocated
>                                                                    - zero vma
vm_refcnt here is still 0.
>
Here before calling rwsem_acquire_read(), the reader has to
successfully do __refcount_inc_not_zero_limited() and that will fail
because vm_refcnt==0. So rwsem_acquire_read(NULL) will not be called.
What am I missing?
> rwsem_acquire_read(NULL)
>
>
> Here is a sequence for unlocking the new lock while locking the old one.
> The refcnt failure detects the detached state but does not protect
> against the wrong lock use:
> A:                                        B:                       C:
> lock_vma_under_rcu()
> vma = mas_walk()
> vma_start_read()
> vm_lock_seq == mm->mm_lock_seq.sequence
>
>                                           vma_start_write
>                                           vma detached and freed
>
>                                                                    vm_area_dup()
>                                                                    - vma reallocated
vm_refcnt is still 0 after reallocation.
>
> rwsem_acquire_read(old lock)
> __refcount_inc_not_zero_limited() fails
Are we looking at the same code
(https://lore.kernel.org/all/20250109023025.2242447-12-surenb@google.com/)?
The sequence should be reversed here like this:
if (!__refcount_inc_not_zero_limited())
        return false;
rwsem_acquire_read(old lock)
and because __refcount_inc_not_zero_limited() fails,
rwsem_acquire_read(old lock) should never be called.
>
>                                                                    vma_init_lock();
>
> rwsem_release(new lock)
The reader will not call rwsem_release(new lock) because it failed to
acquire the lock.
>
> I don't think avoiding the copy of the ref count from the old vma is
> enough to stop these races?
>
> Thanks,
> Liam
>
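
To make the ordering concrete, here is a rough sketch of the reader side
discussed above (simplified from the patch; the helper name is made up for
illustration and this is not the exact kernel code):

static bool vma_start_read_sketch(struct vm_area_struct *vma)
{
	int oldcnt;

	/* A vma write-locked by its mm is skipped via the sequence counts. */
	if (READ_ONCE(vma->vm_lock_seq) ==
	    READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
		return false;

	/*
	 * A detached (or freed and reused) vma has vm_refcnt == 0 and a
	 * write-locked one has VMA_LOCK_OFFSET set, so this increment fails
	 * in both cases and the dep_map below is never touched.
	 */
	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
						      VMA_REF_LIMIT)))
		return false;

	/* Reached only with a successfully elevated vm_refcnt. */
	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
	return true;
}
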
* Suren Baghdasaryan <surenb@google.com> [250110 15:35]:
> On Fri, Jan 10, 2025 at 11:46 AM 'Liam R. Howlett' via kernel-team
> >
> > rwsem_acquire_read(old lock)
> > __refcount_inc_not_zero_limited() fails
>
> Are we looking at the same code
> (https://lore.kernel.org/all/20250109023025.2242447-12-surenb@google.com/)?
> The sequence should be reversed here like this:
>
> if (!__refcount_inc_not_zero_limited())
>         return false;
> rwsem_acquire_read(old lock)
>
> and because __refcount_inc_not_zero_limited() fails,
> rwsem_acquire_read(old lock) should never be called.

We are not, I had an older version of the patch from the 7th.

Sorry for the confusion.

Thanks,
Liam
On Fri, Jan 10, 2025 at 12:47 PM 'Liam R. Howlett' via kernel-team
<kernel-team@android.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [250110 15:35]:
> > On Fri, Jan 10, 2025 at 11:46 AM 'Liam R. Howlett' via kernel-team
> > >
> > > rwsem_acquire_read(old lock)
> > > __refcount_inc_not_zero_limited() fails
> >
> > Are we looking at the same code
> > (https://lore.kernel.org/all/20250109023025.2242447-12-surenb@google.com/)?
> > The sequence should be reversed here like this:
> >
> > if (!__refcount_inc_not_zero_limited())
> >         return false;
> > rwsem_acquire_read(old lock)
> >
> > and because __refcount_inc_not_zero_limited() fails,
> > rwsem_acquire_read(old lock) should never be called.
>
> We are not, I had an older version of the patch from the 7th.
>
> Sorry for the confusion.

No worries. Thanks for taking a look!

> Thanks,
> Liam
On 1/9/25 3:30 AM, Suren Baghdasaryan wrote:
> To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that
> object reuse before RCU grace period is over will be detected by
> lock_vma_under_rcu().
[...]
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

You could also drop the reset_refcnt parameter of vma_lock_init() now,
as the usage in vm_area_dup() should now be just setting 0 over 0. Maybe
a VM_WARN_ON if it's not 0 already?

And a comment in vm_area_struct definition to consider vma_copy() when
adding any new field?

> +	/*
> +	 * src->shared.rb may be modified concurrently, but the clone
> +	 * will be reinitialized.
> +	 */
> +	data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));

The comment makes it sound as if we didn't need to do it at all? But I
didn't verify. If we do need it in some cases (i.e. the just allocated
vma might have garbage from previous lifetime, but src is well defined
and it's a case where it's not reinitialized afterwards) maybe the
comment should say? Or if it's either reinitialized later or zeroes at
src, we could memset() the zeroes instead of memcpying them, etc.
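
As an illustration of the vma_lock_init() suggestion above, a hypothetical
variant without the reset_refcnt parameter might look like this (sketch only;
as the follow-up below explains, the warning fires with the current
allocation path, so it was not adopted as-is):

static void vma_lock_init_sketch(struct vm_area_struct *vma)
{
	/* Expect a new or properly freed vma to already be detached. */
	VM_WARN_ON_ONCE(refcount_read(&vma->vm_refcnt) != 0);
}
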
On Fri, Jan 10, 2025 at 7:31 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/9/25 3:30 AM, Suren Baghdasaryan wrote:
[...]
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
>
> You could also drop the reset_refcnt parameter of vma_lock_init() now,
> as the usage in vm_area_dup() should now be just setting 0 over 0. Maybe
> a VM_WARN_ON if it's not 0 already?

Yeah, that's a good idea. Will do.

> And a comment in vm_area_struct definition to consider vma_copy() when
> adding any new field?

Sure, will add.

> > +	/*
> > +	 * src->shared.rb may be modified concurrently, but the clone
> > +	 * will be reinitialized.
> > +	 */
> > +	data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
>
> The comment makes it sound as if we didn't need to do it at all? But I
> didn't verify. If we do need it in some cases (i.e. the just allocated
> vma might have garbage from previous lifetime, but src is well defined
> and it's a case where it's not reinitialized afterwards) maybe the
> comment should say? Or if it's either reinitialized later or zeroes at
> src, we could memset() the zeroes instead of memcpying them, etc.

I see vm_area_dup() being used in dup_mmap() and I think this comment
is about this usage in case the src vma changes from under us.
However, vm_area_dup() is also used when we simply duplicate an
existing vma while holding an mmap_write_lock, like in __split_vma().
In these cases there is no possibility of a race and copied value
should hold. Maybe I should amend this comment like this:

/*
 * src->shared.rb may be modified concurrently when called from dup_mmap(),
 * but the clone will reinitialize it.
 */

WDYT?
On Fri, Jan 10, 2025 at 8:07 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Jan 10, 2025 at 7:31 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > On 1/9/25 3:30 AM, Suren Baghdasaryan wrote:
[...]
> > You could also drop the reset_refcnt parameter of vma_lock_init() now,
> > as the usage in vm_area_dup() should now be just setting 0 over 0. Maybe
> > a VM_WARN_ON if it's not 0 already?
>
> Yeah, that's a good idea. Will do.

Ugh, once I made this change, the newly added VM_WARN_ON() immediately
triggered because vm_area_dup() does not memset(0) the entire vma and
kmem_cache_alloc(vm_area_cachep) does not always return a reused vma.
I could add a vm_area_cachep constructor to always initialize vm_refcnt
to 0 but that would lead to more changes. I think I'll keep reset_refcnt
for now and will add a vm_area_cachep constructor as a follow-up
optimization after this patchset is merged.

> > And a comment in vm_area_struct definition to consider vma_copy() when
> > adding any new field?
>
> Sure, will add.
>
> > > +	/*
> > > +	 * src->shared.rb may be modified concurrently, but the clone
> > > +	 * will be reinitialized.
> > > +	 */
> > > +	data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> >
> > The comment makes it sound as if we didn't need to do it at all? But I
> > didn't verify. If we do need it in some cases (i.e. the just allocated
> > vma might have garbage from previous lifetime, but src is well defined
> > and it's a case where it's not reinitialized afterwards) maybe the
> > comment should say? Or if it's either reinitialized later or zeroes at
> > src, we could memset() the zeroes instead of memcpying them, etc.
>
> I see vm_area_dup() being used in dup_mmap() and I think this comment
> is about this usage in case the src vma changes from under us.
> However, vm_area_dup() is also used when we simply duplicate an
> existing vma while holding an mmap_write_lock, like in __split_vma().
> In these cases there is no possibility of a race and copied value
> should hold. Maybe I should amend this comment like this:
>
> /*
>  * src->shared.rb may be modified concurrently when called from dup_mmap(),
>  * but the clone will reinitialize it.
>  */
>
> WDYT?
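
For reference, the follow-up constructor mentioned above could look roughly
like this (hypothetical sketch, not part of this series; the cache flags and
the init helper name are illustrative):

static void vm_area_ctor(void *data)
{
	struct vm_area_struct *vma = data;

	/*
	 * A slab constructor runs when the backing slab page is populated,
	 * not on every allocation, so every object starts out detached and
	 * the freeing paths must return it to vm_refcnt == 0 before
	 * kmem_cache_free().
	 */
	refcount_set(&vma->vm_refcnt, 0);
}

static void __init vm_area_cache_init_sketch(void)
{
	vm_area_cachep = kmem_cache_create("vm_area_struct",
			sizeof(struct vm_area_struct), 0,
			SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
			vm_area_ctor);
}
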
On 1/10/25 17:07, Suren Baghdasaryan wrote:
> On Fri, Jan 10, 2025 at 7:31 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> I see vm_area_dup() being used in dup_mmap() and I think this comment
> is about this usage in case the src vma changes from under us.
> However, vm_area_dup() is also used when we simply duplicate an
> existing vma while holding an mmap_write_lock, like in __split_vma().
> In these cases there is no possibility of a race and copied value
> should hold. Maybe I should amend this comment like this:
>
> /*
>  * src->shared.rb may be modified concurrently when called from dup_mmap(),
>  * but the clone will reinitialize it.
>  */
>
> WDYT?

Sounds good, thanks!