[v3] uprobes,mm: speculative lockless VMA-to-uprobe lookup

[PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Andrii Nakryiko 1 year, 4 months ago

To increase mm->mm_lock_seq robustness, switch it from int to long, so
that it's a 64-bit counter on 64-bit systems and we can stop worrying
about it wrapping around in just ~4 billion iterations. Same goes for
VMA's matching vm_lock_seq, which is derived from mm_lock_seq.

I didn't use __u64 outright to keep 32-bit architectures unaffected, but
if it seems important enough, I have nothing against using __u64.

Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/mm.h        | 6 +++---
 include/linux/mm_types.h  | 4 ++--
 include/linux/mmap_lock.h | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecf63d2b0582..97819437832e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -730,7 +730,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
 }
 
 /* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, long *mm_lock_seq)
 {
 	mmap_assert_write_locked(vma->vm_mm);
 
@@ -749,7 +749,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
  */
 static inline void vma_start_write(struct vm_area_struct *vma)
 {
-	int mm_lock_seq;
+	long mm_lock_seq;
 
 	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
@@ -767,7 +767,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 {
-	int mm_lock_seq;
+	long mm_lock_seq;
 
 	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5d8cdebd42bc..0dc57d6cfe38 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -715,7 +715,7 @@ struct vm_area_struct {
 	 * counter reuse can only lead to occasional unnecessary use of the
 	 * slowpath.
 	 */
-	int vm_lock_seq;
+	long vm_lock_seq;
 	/* Unstable RCU readers are allowed to read this. */
 	struct vma_lock *vm_lock;
 #endif
@@ -898,7 +898,7 @@ struct mm_struct {
 		 * Can be read with ACQUIRE semantics if not holding write
 		 * mmap_lock.
 		 */
-		int mm_lock_seq;
+		long mm_lock_seq;
 #endif
 
 
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 9d23635bc701..f8fd6d879aa9 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -105,7 +105,7 @@ static inline void inc_mm_lock_seq(struct mm_struct *mm, bool acquire)
 	}
 }
 
-static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq)
+static inline bool mmap_lock_speculation_start(struct mm_struct *mm, long *seq)
 {
 	/* Pairs with RELEASE semantics in inc_mm_lock_seq(). */
 	*seq = smp_load_acquire(&mm->mm_lock_seq);
@@ -113,7 +113,7 @@ static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq)
 	return (*seq & 1) == 0;
 }
 
-static inline bool mmap_lock_speculation_end(struct mm_struct *mm, int seq)
+static inline bool mmap_lock_speculation_end(struct mm_struct *mm, long seq)
 {
 	/* Pairs with ACQUIRE semantics in inc_mm_lock_seq(). */
 	smp_rmb();
@@ -123,8 +123,8 @@ static inline bool mmap_lock_speculation_end(struct mm_struct *mm, int seq)
 #else
 static inline void init_mm_lock_seq(struct mm_struct *mm) {}
 static inline void inc_mm_lock_seq(struct mm_struct *mm, bool acquire) {}
-static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq) { return false; }
-static inline bool mmap_lock_speculation_end(struct mm_struct *mm, int seq) { return false; }
+static inline bool mmap_lock_speculation_start(struct mm_struct *mm, long *seq) { return false; }
+static inline bool mmap_lock_speculation_end(struct mm_struct *mm, long seq) { return false; }
 #endif
 
 /*
-- 
2.43.5

Re: [PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Peter Zijlstra 1 year, 3 months ago

On Thu, Oct 10, 2024 at 01:56:42PM -0700, Andrii Nakryiko wrote:
> To increase mm->mm_lock_seq robustness, switch it from int to long, so
> that it's a 64-bit counter on 64-bit systems and we can stop worrying
> about it wrapping around in just ~4 billion iterations. Same goes for
> VMA's matching vm_lock_seq, which is derived from mm_lock_seq.
> 
> I didn't use __u64 outright to keep 32-bit architectures unaffected, but
> if it seems important enough, I have nothing against using __u64.

(__uXX are the uapi types)

> 
> Suggested-by: Jann Horn <jannh@google.com>

Jann, do you see problems with the normal seqcount being unsigned (int)?
I suppose especially for preemptible seqcounts it might already be
entirely feasible to wrap them?

Doing u64 is tricky but not impossible, it would require something like
we do for GUP_GET_PXX_LOW_HIGH. OTOH, I don't think we really care about
32bit enough to bother.

Re: [PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Shakeel Butt 1 year, 3 months ago

On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote:
> To increase mm->mm_lock_seq robustness, switch it from int to long, so
> that it's a 64-bit counter on 64-bit systems and we can stop worrying
> about it wrapping around in just ~4 billion iterations. Same goes for
> VMA's matching vm_lock_seq, which is derived from mm_lock_seq.
> 
> I didn't use __u64 outright to keep 32-bit architectures unaffected, but
> if it seems important enough, I have nothing against using __u64.
> 
> Suggested-by: Jann Horn <jannh@google.com>
> Signed-off-by: Andrii Nakryiko <andrii@kernel.org>

Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>

Re: [PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Suren Baghdasaryan 1 year, 3 months ago

On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote:
> > To increase mm->mm_lock_seq robustness, switch it from int to long, so
> > that it's a 64-bit counter on 64-bit systems and we can stop worrying
> > about it wrapping around in just ~4 billion iterations. Same goes for
> > VMA's matching vm_lock_seq, which is derived from mm_lock_seq.

vm_lock_seq does not need to be long but for consistency I guess that
makes sense. While at it, can you please change these seq counters to
be unsigned?
Also, did you check with pahole if the vm_area_struct layout change
pushes some members into a difference cacheline or creates new gaps?

> >
> > I didn't use __u64 outright to keep 32-bit architectures unaffected, but
> > if it seems important enough, I have nothing against using __u64.
> >
> > Suggested-by: Jann Horn <jannh@google.com>
> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
>
> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>

Re: [PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Peter Zijlstra 1 year, 3 months ago

On Wed, Oct 16, 2024 at 07:01:59PM -0700, Suren Baghdasaryan wrote:
> On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote:
> > > To increase mm->mm_lock_seq robustness, switch it from int to long, so
> > > that it's a 64-bit counter on 64-bit systems and we can stop worrying
> > > about it wrapping around in just ~4 billion iterations. Same goes for
> > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq.
> 
> vm_lock_seq does not need to be long but for consistency I guess that
> makes sense. While at it, can you please change these seq counters to
> be unsigned?

Yeah, that. Kees is waging war on signed types that 'overflow'. These
sequence counter thingies are designed to wrap and should very much be
unsigned.

Re: [PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Andrii Nakryiko 1 year, 3 months ago

On Wed, Oct 23, 2024 at 12:02 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Oct 16, 2024 at 07:01:59PM -0700, Suren Baghdasaryan wrote:
> > On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote:
> > > > To increase mm->mm_lock_seq robustness, switch it from int to long, so
> > > > that it's a 64-bit counter on 64-bit systems and we can stop worrying
> > > > about it wrapping around in just ~4 billion iterations. Same goes for
> > > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq.
> >
> > vm_lock_seq does not need to be long but for consistency I guess that
> > makes sense. While at it, can you please change these seq counters to
> > be unsigned?
>
> Yeah, that. Kees is waging war on signed types that 'overflow'. These
> sequence counter thingies are designed to wrap and should very much be
> unsigned.

Ah, ok, I already forgot I need to update this. Will do.

Re: [PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Andrii Nakryiko 1 year, 3 months ago

On Wed, Oct 16, 2024 at 7:02 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote:
> > > To increase mm->mm_lock_seq robustness, switch it from int to long, so
> > > that it's a 64-bit counter on 64-bit systems and we can stop worrying
> > > about it wrapping around in just ~4 billion iterations. Same goes for
> > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq.
>
> vm_lock_seq does not need to be long but for consistency I guess that

How come, we literally assign vm_lock_seq from mm_lock_seq and do
direct comparisons. They have to be exactly the same type, no?

> makes sense. While at it, can you please change these seq counters to
> be unsigned?

There is `vma->vm_lock_seq = -1;` in kernel/fork.c, should it be
switched to ULONG_MAX then? In general, unless this is critical for
correctness, I'd very much like stuff like this to be done in the mm
tree afterwards, but it seems trivial enough, so if you insist I'll do
it.

> Also, did you check with pahole if the vm_area_struct layout change
> pushes some members into a difference cacheline or creates new gaps?
>

Just did. We had 3 byte hole after `bool detached;`, it now grew to 7
bytes (so +4) and then vm_lock_seq itself is now 8 bytes (so +4),
which now does push rb and rb_subtree_last into *THE SAME* cache line
(which sounds like an improvement to me). vm_lock_seq and vm_lock stay
in the same cache line. vm_pgoff and vm_file are now in the same cache
line, and given they are probably always accessed together, seems like
a good accidental change as well. See below pahole outputs before and
after.

That singular detached bool looks like a complete waste, tbh. Maybe it
would be better to roll it into vm_flags and save 8 bytes? (not that I
want to do those mm changes in this patch set, of course...).
vm_area_struct is otherwise nicely tightly packed.

tl;dr, seems fine, and detached would be best to get rid of, if
possible (but that's a completely separate thing)

BEFORE
======
struct vm_area_struct {
        union {
                struct {
                        long unsigned int vm_start;      /*     0     8 */
                        long unsigned int vm_end;        /*     8     8 */
                };                                       /*     0    16 */
                struct callback_head vm_rcu;             /*     0    16 */
        } __attribute__((__aligned__(8)));               /*     0    16 */
        struct mm_struct *         vm_mm;                /*    16     8 */
        pgprot_t                   vm_page_prot;         /*    24     8 */
        union {
                const vm_flags_t   vm_flags;             /*    32     8 */
                vm_flags_t         __vm_flags;           /*    32     8 */
        };                                               /*    32     8 */
        bool                       detached;             /*    40     1 */

        /* XXX 3 bytes hole, try to pack */

        int                        vm_lock_seq;          /*    44     4 */
        struct vma_lock *          vm_lock;              /*    48     8 */
        struct {
                struct rb_node     rb;                   /*    56    24 */
                /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */
                long unsigned int  rb_subtree_last;      /*    80     8 */
        }                                                /*    56    32 */
        struct list_head           anon_vma_chain;       /*    88    16 */
        struct anon_vma *          anon_vma;             /*   104     8 */
        const struct vm_operations_struct  * vm_ops;     /*   112     8 */
        long unsigned int          vm_pgoff;             /*   120     8 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        struct file *              vm_file;              /*   128     8 */
        void *                     vm_private_data;      /*   136     8 */
        atomic_long_t              swap_readahead_info;  /*   144     8 */
        struct mempolicy *         vm_policy;            /*   152     8 */
        struct vma_numab_state *   numab_state;          /*   160     8 */
        struct vm_userfaultfd_ctx  vm_userfaultfd_ctx;   /*   168     8 */

        /* size: 176, cachelines: 3, members: 18 */
        /* sum members: 173, holes: 1, sum holes: 3 */
        /* forced alignments: 2 */
        /* last cacheline: 48 bytes */
} __attribute__((__aligned__(8)));

AFTER
=====
struct vm_area_struct {
        union {
                struct {
                        long unsigned int vm_start;      /*     0     8 */
                        long unsigned int vm_end;        /*     8     8 */
                };                                       /*     0    16 */
                struct callback_head vm_rcu;             /*     0    16 */
        } __attribute__((__aligned__(8)));               /*     0    16 */
        struct mm_struct *         vm_mm;                /*    16     8 */
        pgprot_t                   vm_page_prot;         /*    24     8 */
        union {
                const vm_flags_t   vm_flags;             /*    32     8 */
                vm_flags_t         __vm_flags;           /*    32     8 */
        };                                               /*    32     8 */
        bool                       detached;             /*    40     1 */

        /* XXX 7 bytes hole, try to pack */

        long int                   vm_lock_seq;          /*    48     8 */
        struct vma_lock *          vm_lock;              /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct {
                struct rb_node     rb;                   /*    64    24 */
                long unsigned int  rb_subtree_last;      /*    88     8 */
        }                                                /*    64    32 */
        struct list_head           anon_vma_chain;       /*    96    16 */
        struct anon_vma *          anon_vma;             /*   112     8 */
        const struct vm_operations_struct  * vm_ops;     /*   120     8 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        long unsigned int          vm_pgoff;             /*   128     8 */
        struct file *              vm_file;              /*   136     8 */
        void *                     vm_private_data;      /*   144     8 */
        atomic_long_t              swap_readahead_info;  /*   152     8 */
        struct mempolicy *         vm_policy;            /*   160     8 */
        struct vma_numab_state *   numab_state;          /*   168     8 */
        struct vm_userfaultfd_ctx  vm_userfaultfd_ctx;   /*   176     8 */

        /* size: 184, cachelines: 3, members: 18 */
        /* sum members: 177, holes: 1, sum holes: 7 */
        /* forced alignments: 2 */
        /* last cacheline: 56 bytes */
} __attribute__((__aligned__(8)));


> > >
> > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but
> > > if it seems important enough, I have nothing against using __u64.
> > >
> > > Suggested-by: Jann Horn <jannh@google.com>
> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> >
> > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
>

Re: [PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Suren Baghdasaryan 1 year, 3 months ago

On Thu, Oct 17, 2024 at 11:55 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Wed, Oct 16, 2024 at 7:02 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote:
> > > > To increase mm->mm_lock_seq robustness, switch it from int to long, so
> > > > that it's a 64-bit counter on 64-bit systems and we can stop worrying
> > > > about it wrapping around in just ~4 billion iterations. Same goes for
> > > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq.
> >
> > vm_lock_seq does not need to be long but for consistency I guess that
>
> How come, we literally assign vm_lock_seq from mm_lock_seq and do
> direct comparisons. They have to be exactly the same type, no?

Not necessarily. vm_lock_seq is a snapshot of the mm_lock_seq but it
does not have to be a "complete" snapshot. Just something that has a
very high probability of identifying a match and a rare false positive
is not a problem (see comment in
https://elixir.bootlin.com/linux/v6.11.3/source/include/linux/mm.h#L678).
So, something like this for taking and comparing a snapshot would do:

vma->vm_lock_seq = (unsigned int)mm->mm_lock_seq;
if (vma->vm_lock_seq == (unsigned int)mm->mm_lock_seq)

>
> > makes sense. While at it, can you please change these seq counters to
> > be unsigned?
>
> There is `vma->vm_lock_seq = -1;` in kernel/fork.c, should it be
> switched to ULONG_MAX then? In general, unless this is critical for
> correctness, I'd very much like stuff like this to be done in the mm
> tree afterwards, but it seems trivial enough, so if you insist I'll do
> it.

Yeah, ULONG_MAX should work fine here. vma->vm_lock_seq is initialized
to -1 to avoid false initial match with mm->mm_lock_seq which is
initialized to 0. As I said, a false match is not a problem but if we
can avoid it, that's better.

>
> > Also, did you check with pahole if the vm_area_struct layout change
> > pushes some members into a difference cacheline or creates new gaps?
> >
>
> Just did. We had 3 byte hole after `bool detached;`, it now grew to 7
> bytes (so +4) and then vm_lock_seq itself is now 8 bytes (so +4),
> which now does push rb and rb_subtree_last into *THE SAME* cache line
> (which sounds like an improvement to me). vm_lock_seq and vm_lock stay
> in the same cache line. vm_pgoff and vm_file are now in the same cache
> line, and given they are probably always accessed together, seems like
> a good accidental change as well. See below pahole outputs before and
> after.

Ok, sounds good to me. Looks like keeping both sequence numbers 64bit
is not an issue. Changing them to unsigned would be nice and trivial
but I don't insist. You can add:

Reviewed-by: Suren Baghdasaryan <surenb@google.com>

>
> That singular detached bool looks like a complete waste, tbh. Maybe it
> would be better to roll it into vm_flags and save 8 bytes? (not that I
> want to do those mm changes in this patch set, of course...).
> vm_area_struct is otherwise nicely tightly packed.
>
> tl;dr, seems fine, and detached would be best to get rid of, if
> possible (but that's a completely separate thing)

Yeah, I'll take a look at that. Thanks!

>
> BEFORE
> ======
> struct vm_area_struct {
>         union {
>                 struct {
>                         long unsigned int vm_start;      /*     0     8 */
>                         long unsigned int vm_end;        /*     8     8 */
>                 };                                       /*     0    16 */
>                 struct callback_head vm_rcu;             /*     0    16 */
>         } __attribute__((__aligned__(8)));               /*     0    16 */
>         struct mm_struct *         vm_mm;                /*    16     8 */
>         pgprot_t                   vm_page_prot;         /*    24     8 */
>         union {
>                 const vm_flags_t   vm_flags;             /*    32     8 */
>                 vm_flags_t         __vm_flags;           /*    32     8 */
>         };                                               /*    32     8 */
>         bool                       detached;             /*    40     1 */
>
>         /* XXX 3 bytes hole, try to pack */
>
>         int                        vm_lock_seq;          /*    44     4 */
>         struct vma_lock *          vm_lock;              /*    48     8 */
>         struct {
>                 struct rb_node     rb;                   /*    56    24 */
>                 /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */
>                 long unsigned int  rb_subtree_last;      /*    80     8 */
>         }                                                /*    56    32 */
>         struct list_head           anon_vma_chain;       /*    88    16 */
>         struct anon_vma *          anon_vma;             /*   104     8 */
>         const struct vm_operations_struct  * vm_ops;     /*   112     8 */
>         long unsigned int          vm_pgoff;             /*   120     8 */
>         /* --- cacheline 2 boundary (128 bytes) --- */
>         struct file *              vm_file;              /*   128     8 */
>         void *                     vm_private_data;      /*   136     8 */
>         atomic_long_t              swap_readahead_info;  /*   144     8 */
>         struct mempolicy *         vm_policy;            /*   152     8 */
>         struct vma_numab_state *   numab_state;          /*   160     8 */
>         struct vm_userfaultfd_ctx  vm_userfaultfd_ctx;   /*   168     8 */
>
>         /* size: 176, cachelines: 3, members: 18 */
>         /* sum members: 173, holes: 1, sum holes: 3 */
>         /* forced alignments: 2 */
>         /* last cacheline: 48 bytes */
> } __attribute__((__aligned__(8)));
>
> AFTER
> =====
> struct vm_area_struct {
>         union {
>                 struct {
>                         long unsigned int vm_start;      /*     0     8 */
>                         long unsigned int vm_end;        /*     8     8 */
>                 };                                       /*     0    16 */
>                 struct callback_head vm_rcu;             /*     0    16 */
>         } __attribute__((__aligned__(8)));               /*     0    16 */
>         struct mm_struct *         vm_mm;                /*    16     8 */
>         pgprot_t                   vm_page_prot;         /*    24     8 */
>         union {
>                 const vm_flags_t   vm_flags;             /*    32     8 */
>                 vm_flags_t         __vm_flags;           /*    32     8 */
>         };                                               /*    32     8 */
>         bool                       detached;             /*    40     1 */
>
>         /* XXX 7 bytes hole, try to pack */
>
>         long int                   vm_lock_seq;          /*    48     8 */
>         struct vma_lock *          vm_lock;              /*    56     8 */
>         /* --- cacheline 1 boundary (64 bytes) --- */
>         struct {
>                 struct rb_node     rb;                   /*    64    24 */
>                 long unsigned int  rb_subtree_last;      /*    88     8 */
>         }                                                /*    64    32 */
>         struct list_head           anon_vma_chain;       /*    96    16 */
>         struct anon_vma *          anon_vma;             /*   112     8 */
>         const struct vm_operations_struct  * vm_ops;     /*   120     8 */
>         /* --- cacheline 2 boundary (128 bytes) --- */
>         long unsigned int          vm_pgoff;             /*   128     8 */
>         struct file *              vm_file;              /*   136     8 */
>         void *                     vm_private_data;      /*   144     8 */
>         atomic_long_t              swap_readahead_info;  /*   152     8 */
>         struct mempolicy *         vm_policy;            /*   160     8 */
>         struct vma_numab_state *   numab_state;          /*   168     8 */
>         struct vm_userfaultfd_ctx  vm_userfaultfd_ctx;   /*   176     8 */
>
>         /* size: 184, cachelines: 3, members: 18 */
>         /* sum members: 177, holes: 1, sum holes: 7 */
>         /* forced alignments: 2 */
>         /* last cacheline: 56 bytes */
> } __attribute__((__aligned__(8)));
>
>
> > > >
> > > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but
> > > > if it seems important enough, I have nothing against using __u64.
> > > >
> > > > Suggested-by: Jann Horn <jannh@google.com>
> > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > >
> > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
> >

Re: [PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures

Posted by Andrii Nakryiko 1 year, 3 months ago

On Thu, Oct 17, 2024 at 12:42 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Oct 17, 2024 at 11:55 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Wed, Oct 16, 2024 at 7:02 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > >
> > > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote:
> > > > > To increase mm->mm_lock_seq robustness, switch it from int to long, so
> > > > > that it's a 64-bit counter on 64-bit systems and we can stop worrying
> > > > > about it wrapping around in just ~4 billion iterations. Same goes for
> > > > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq.
> > >
> > > vm_lock_seq does not need to be long but for consistency I guess that
> >
> > How come, we literally assign vm_lock_seq from mm_lock_seq and do
> > direct comparisons. They have to be exactly the same type, no?
>
> Not necessarily. vm_lock_seq is a snapshot of the mm_lock_seq but it
> does not have to be a "complete" snapshot. Just something that has a
> very high probability of identifying a match and a rare false positive
> is not a problem (see comment in
> https://elixir.bootlin.com/linux/v6.11.3/source/include/linux/mm.h#L678).
> So, something like this for taking and comparing a snapshot would do:
>
> vma->vm_lock_seq = (unsigned int)mm->mm_lock_seq;
> if (vma->vm_lock_seq == (unsigned int)mm->mm_lock_seq)

Ah, ok, I see what's the idea behind it, makes sense.

>
> >
> > > makes sense. While at it, can you please change these seq counters to
> > > be unsigned?
> >
> > There is `vma->vm_lock_seq = -1;` in kernel/fork.c, should it be
> > switched to ULONG_MAX then? In general, unless this is critical for
> > correctness, I'd very much like stuff like this to be done in the mm
> > tree afterwards, but it seems trivial enough, so if you insist I'll do
> > it.
>
> Yeah, ULONG_MAX should work fine here. vma->vm_lock_seq is initialized
> to -1 to avoid false initial match with mm->mm_lock_seq which is
> initialized to 0. As I said, a false match is not a problem but if we
> can avoid it, that's better.

ok, ULONG_MAX and unsigned long it is, will update

>
> >
> > > Also, did you check with pahole if the vm_area_struct layout change
> > > pushes some members into a difference cacheline or creates new gaps?
> > >
> >
> > Just did. We had 3 byte hole after `bool detached;`, it now grew to 7
> > bytes (so +4) and then vm_lock_seq itself is now 8 bytes (so +4),
> > which now does push rb and rb_subtree_last into *THE SAME* cache line
> > (which sounds like an improvement to me). vm_lock_seq and vm_lock stay
> > in the same cache line. vm_pgoff and vm_file are now in the same cache
> > line, and given they are probably always accessed together, seems like
> > a good accidental change as well. See below pahole outputs before and
> > after.
>
> Ok, sounds good to me. Looks like keeping both sequence numbers 64bit
> is not an issue. Changing them to unsigned would be nice and trivial
> but I don't insist. You can add:
>
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>

thanks, will switch to unsigned in the next revision (next week,
probably, to let some of the pending patches land)

>
> >
> > That singular detached bool looks like a complete waste, tbh. Maybe it
> > would be better to roll it into vm_flags and save 8 bytes? (not that I
> > want to do those mm changes in this patch set, of course...).
> > vm_area_struct is otherwise nicely tightly packed.
> >
> > tl;dr, seems fine, and detached would be best to get rid of, if
> > possible (but that's a completely separate thing)
>
> Yeah, I'll take a look at that. Thanks!
>
> >
> > BEFORE
> > ======
> > struct vm_area_struct {
> >         union {
> >                 struct {
> >                         long unsigned int vm_start;      /*     0     8 */
> >                         long unsigned int vm_end;        /*     8     8 */
> >                 };                                       /*     0    16 */
> >                 struct callback_head vm_rcu;             /*     0    16 */
> >         } __attribute__((__aligned__(8)));               /*     0    16 */
> >         struct mm_struct *         vm_mm;                /*    16     8 */
> >         pgprot_t                   vm_page_prot;         /*    24     8 */
> >         union {
> >                 const vm_flags_t   vm_flags;             /*    32     8 */
> >                 vm_flags_t         __vm_flags;           /*    32     8 */
> >         };                                               /*    32     8 */
> >         bool                       detached;             /*    40     1 */
> >
> >         /* XXX 3 bytes hole, try to pack */
> >
> >         int                        vm_lock_seq;          /*    44     4 */
> >         struct vma_lock *          vm_lock;              /*    48     8 */
> >         struct {
> >                 struct rb_node     rb;                   /*    56    24 */
> >                 /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */
> >                 long unsigned int  rb_subtree_last;      /*    80     8 */
> >         }                                                /*    56    32 */
> >         struct list_head           anon_vma_chain;       /*    88    16 */
> >         struct anon_vma *          anon_vma;             /*   104     8 */
> >         const struct vm_operations_struct  * vm_ops;     /*   112     8 */
> >         long unsigned int          vm_pgoff;             /*   120     8 */
> >         /* --- cacheline 2 boundary (128 bytes) --- */
> >         struct file *              vm_file;              /*   128     8 */
> >         void *                     vm_private_data;      /*   136     8 */
> >         atomic_long_t              swap_readahead_info;  /*   144     8 */
> >         struct mempolicy *         vm_policy;            /*   152     8 */
> >         struct vma_numab_state *   numab_state;          /*   160     8 */
> >         struct vm_userfaultfd_ctx  vm_userfaultfd_ctx;   /*   168     8 */
> >
> >         /* size: 176, cachelines: 3, members: 18 */
> >         /* sum members: 173, holes: 1, sum holes: 3 */
> >         /* forced alignments: 2 */
> >         /* last cacheline: 48 bytes */
> > } __attribute__((__aligned__(8)));
> >
> > AFTER
> > =====
> > struct vm_area_struct {
> >         union {
> >                 struct {
> >                         long unsigned int vm_start;      /*     0     8 */
> >                         long unsigned int vm_end;        /*     8     8 */
> >                 };                                       /*     0    16 */
> >                 struct callback_head vm_rcu;             /*     0    16 */
> >         } __attribute__((__aligned__(8)));               /*     0    16 */
> >         struct mm_struct *         vm_mm;                /*    16     8 */
> >         pgprot_t                   vm_page_prot;         /*    24     8 */
> >         union {
> >                 const vm_flags_t   vm_flags;             /*    32     8 */
> >                 vm_flags_t         __vm_flags;           /*    32     8 */
> >         };                                               /*    32     8 */
> >         bool                       detached;             /*    40     1 */
> >
> >         /* XXX 7 bytes hole, try to pack */
> >
> >         long int                   vm_lock_seq;          /*    48     8 */
> >         struct vma_lock *          vm_lock;              /*    56     8 */
> >         /* --- cacheline 1 boundary (64 bytes) --- */
> >         struct {
> >                 struct rb_node     rb;                   /*    64    24 */
> >                 long unsigned int  rb_subtree_last;      /*    88     8 */
> >         }                                                /*    64    32 */
> >         struct list_head           anon_vma_chain;       /*    96    16 */
> >         struct anon_vma *          anon_vma;             /*   112     8 */
> >         const struct vm_operations_struct  * vm_ops;     /*   120     8 */
> >         /* --- cacheline 2 boundary (128 bytes) --- */
> >         long unsigned int          vm_pgoff;             /*   128     8 */
> >         struct file *              vm_file;              /*   136     8 */
> >         void *                     vm_private_data;      /*   144     8 */
> >         atomic_long_t              swap_readahead_info;  /*   152     8 */
> >         struct mempolicy *         vm_policy;            /*   160     8 */
> >         struct vma_numab_state *   numab_state;          /*   168     8 */
> >         struct vm_userfaultfd_ctx  vm_userfaultfd_ctx;   /*   176     8 */
> >
> >         /* size: 184, cachelines: 3, members: 18 */
> >         /* sum members: 177, holes: 1, sum holes: 7 */
> >         /* forced alignments: 2 */
> >         /* last cacheline: 56 bytes */
> > } __attribute__((__aligned__(8)));
> >
> >
> > > > >
> > > > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but
> > > > > if it seems important enough, I have nothing against using __u64.
> > > > >
> > > > > Suggested-by: Jann Horn <jannh@google.com>
> > > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
> > > >
> > > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
> > >

[PATCH v3 tip/perf/core 1/4] mm: introduce mmap_lock_speculation_{start|end}
[PATCH v3 tip/perf/core 2/4] mm: switch to 64-bit mm_lock_seq/vm_lock_seq on 64-bit architectures
[PATCH v3 tip/perf/core 3/4] uprobes: simplify find_active_uprobe_rcu() VMA checks
[PATCH v3 tip/perf/core 4/4] uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution