To increase mm->mm_lock_seq robustness, switch it from int to long, so
that it's a 64-bit counter on 64-bit systems and we can stop worrying
about it wrapping around in just ~4 billion iterations. Same goes for
VMA's matching vm_lock_seq, which is derived from mm_lock_seq.
I didn't use __u64 outright to keep 32-bit architectures unaffected, but
if it seems important enough, I have nothing against using __u64.
Suggested-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
include/linux/mm.h | 6 +++---
include/linux/mm_types.h | 4 ++--
include/linux/mmap_lock.h | 8 ++++----
3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecf63d2b0582..97819437832e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -730,7 +730,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
}
/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, long *mm_lock_seq)
{
mmap_assert_write_locked(vma->vm_mm);
@@ -749,7 +749,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
*/
static inline void vma_start_write(struct vm_area_struct *vma)
{
- int mm_lock_seq;
+ long mm_lock_seq;
if (__is_vma_write_locked(vma, &mm_lock_seq))
return;
@@ -767,7 +767,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
- int mm_lock_seq;
+ long mm_lock_seq;
VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5d8cdebd42bc..0dc57d6cfe38 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -715,7 +715,7 @@ struct vm_area_struct {
* counter reuse can only lead to occasional unnecessary use of the
* slowpath.
*/
- int vm_lock_seq;
+ long vm_lock_seq;
/* Unstable RCU readers are allowed to read this. */
struct vma_lock *vm_lock;
#endif
@@ -898,7 +898,7 @@ struct mm_struct {
* Can be read with ACQUIRE semantics if not holding write
* mmap_lock.
*/
- int mm_lock_seq;
+ long mm_lock_seq;
#endif
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 9d23635bc701..f8fd6d879aa9 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -105,7 +105,7 @@ static inline void inc_mm_lock_seq(struct mm_struct *mm, bool acquire)
}
}
-static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq)
+static inline bool mmap_lock_speculation_start(struct mm_struct *mm, long *seq)
{
/* Pairs with RELEASE semantics in inc_mm_lock_seq(). */
*seq = smp_load_acquire(&mm->mm_lock_seq);
@@ -113,7 +113,7 @@ static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq)
return (*seq & 1) == 0;
}
-static inline bool mmap_lock_speculation_end(struct mm_struct *mm, int seq)
+static inline bool mmap_lock_speculation_end(struct mm_struct *mm, long seq)
{
/* Pairs with ACQUIRE semantics in inc_mm_lock_seq(). */
smp_rmb();
@@ -123,8 +123,8 @@ static inline bool mmap_lock_speculation_end(struct mm_struct *mm, int seq)
#else
static inline void init_mm_lock_seq(struct mm_struct *mm) {}
static inline void inc_mm_lock_seq(struct mm_struct *mm, bool acquire) {}
-static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq) { return false; }
-static inline bool mmap_lock_speculation_end(struct mm_struct *mm, int seq) { return false; }
+static inline bool mmap_lock_speculation_start(struct mm_struct *mm, long *seq) { return false; }
+static inline bool mmap_lock_speculation_end(struct mm_struct *mm, long seq) { return false; }
#endif
/*
--
2.43.5
On Thu, Oct 10, 2024 at 01:56:42PM -0700, Andrii Nakryiko wrote: > To increase mm->mm_lock_seq robustness, switch it from int to long, so > that it's a 64-bit counter on 64-bit systems and we can stop worrying > about it wrapping around in just ~4 billion iterations. Same goes for > VMA's matching vm_lock_seq, which is derived from mm_lock_seq. > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but > if it seems important enough, I have nothing against using __u64. (__uXX are the uapi types) > > Suggested-by: Jann Horn <jannh@google.com> Jann, do you see problems with the normal seqcount being unsigned (int)? I suppose especially for preemptible seqcounts it might already be entirely feasible to wrap them? Doing u64 is tricky but not impossible, it would require something like we do for GUP_GET_PXX_LOW_HIGH. OTOH, I don't think we really care about 32bit enough to bother.
On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote: > To increase mm->mm_lock_seq robustness, switch it from int to long, so > that it's a 64-bit counter on 64-bit systems and we can stop worrying > about it wrapping around in just ~4 billion iterations. Same goes for > VMA's matching vm_lock_seq, which is derived from mm_lock_seq. > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but > if it seems important enough, I have nothing against using __u64. > > Suggested-by: Jann Horn <jannh@google.com> > Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote: > > To increase mm->mm_lock_seq robustness, switch it from int to long, so > > that it's a 64-bit counter on 64-bit systems and we can stop worrying > > about it wrapping around in just ~4 billion iterations. Same goes for > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq. vm_lock_seq does not need to be long but for consistency I guess that makes sense. While at it, can you please change these seq counters to be unsigned? Also, did you check with pahole if the vm_area_struct layout change pushes some members into a difference cacheline or creates new gaps? > > > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but > > if it seems important enough, I have nothing against using __u64. > > > > Suggested-by: Jann Horn <jannh@google.com> > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org> > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
On Wed, Oct 16, 2024 at 07:01:59PM -0700, Suren Baghdasaryan wrote: > On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote: > > > To increase mm->mm_lock_seq robustness, switch it from int to long, so > > > that it's a 64-bit counter on 64-bit systems and we can stop worrying > > > about it wrapping around in just ~4 billion iterations. Same goes for > > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq. > > vm_lock_seq does not need to be long but for consistency I guess that > makes sense. While at it, can you please change these seq counters to > be unsigned? Yeah, that. Kees is waging war on signed types that 'overflow'. These sequence counter thingies are designed to wrap and should very much be unsigned.
On Wed, Oct 23, 2024 at 12:02 PM Peter Zijlstra <peterz@infradead.org> wrote: > > On Wed, Oct 16, 2024 at 07:01:59PM -0700, Suren Baghdasaryan wrote: > > On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > > > > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote: > > > > To increase mm->mm_lock_seq robustness, switch it from int to long, so > > > > that it's a 64-bit counter on 64-bit systems and we can stop worrying > > > > about it wrapping around in just ~4 billion iterations. Same goes for > > > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq. > > > > vm_lock_seq does not need to be long but for consistency I guess that > > makes sense. While at it, can you please change these seq counters to > > be unsigned? > > Yeah, that. Kees is waging war on signed types that 'overflow'. These > sequence counter thingies are designed to wrap and should very much be > unsigned. Ah, ok, I already forgot I need to update this. Will do.
On Wed, Oct 16, 2024 at 7:02 PM Suren Baghdasaryan <surenb@google.com> wrote: > > On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote: > > > To increase mm->mm_lock_seq robustness, switch it from int to long, so > > > that it's a 64-bit counter on 64-bit systems and we can stop worrying > > > about it wrapping around in just ~4 billion iterations. Same goes for > > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq. > > vm_lock_seq does not need to be long but for consistency I guess that How come, we literally assign vm_lock_seq from mm_lock_seq and do direct comparisons. They have to be exactly the same type, no? > makes sense. While at it, can you please change these seq counters to > be unsigned? There is `vma->vm_lock_seq = -1;` in kernel/fork.c, should it be switched to ULONG_MAX then? In general, unless this is critical for correctness, I'd very much like stuff like this to be done in the mm tree afterwards, but it seems trivial enough, so if you insist I'll do it. > Also, did you check with pahole if the vm_area_struct layout change > pushes some members into a difference cacheline or creates new gaps? > Just did. We had 3 byte hole after `bool detached;`, it now grew to 7 bytes (so +4) and then vm_lock_seq itself is now 8 bytes (so +4), which now does push rb and rb_subtree_last into *THE SAME* cache line (which sounds like an improvement to me). vm_lock_seq and vm_lock stay in the same cache line. vm_pgoff and vm_file are now in the same cache line, and given they are probably always accessed together, seems like a good accidental change as well. See below pahole outputs before and after. That singular detached bool looks like a complete waste, tbh. Maybe it would be better to roll it into vm_flags and save 8 bytes? (not that I want to do those mm changes in this patch set, of course...). vm_area_struct is otherwise nicely tightly packed. tl;dr, seems fine, and detached would be best to get rid of, if possible (but that's a completely separate thing) BEFORE ====== struct vm_area_struct { union { struct { long unsigned int vm_start; /* 0 8 */ long unsigned int vm_end; /* 8 8 */ }; /* 0 16 */ struct callback_head vm_rcu; /* 0 16 */ } __attribute__((__aligned__(8))); /* 0 16 */ struct mm_struct * vm_mm; /* 16 8 */ pgprot_t vm_page_prot; /* 24 8 */ union { const vm_flags_t vm_flags; /* 32 8 */ vm_flags_t __vm_flags; /* 32 8 */ }; /* 32 8 */ bool detached; /* 40 1 */ /* XXX 3 bytes hole, try to pack */ int vm_lock_seq; /* 44 4 */ struct vma_lock * vm_lock; /* 48 8 */ struct { struct rb_node rb; /* 56 24 */ /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ long unsigned int rb_subtree_last; /* 80 8 */ } /* 56 32 */ struct list_head anon_vma_chain; /* 88 16 */ struct anon_vma * anon_vma; /* 104 8 */ const struct vm_operations_struct * vm_ops; /* 112 8 */ long unsigned int vm_pgoff; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ struct file * vm_file; /* 128 8 */ void * vm_private_data; /* 136 8 */ atomic_long_t swap_readahead_info; /* 144 8 */ struct mempolicy * vm_policy; /* 152 8 */ struct vma_numab_state * numab_state; /* 160 8 */ struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 168 8 */ /* size: 176, cachelines: 3, members: 18 */ /* sum members: 173, holes: 1, sum holes: 3 */ /* forced alignments: 2 */ /* last cacheline: 48 bytes */ } __attribute__((__aligned__(8))); AFTER ===== struct vm_area_struct { union { struct { long unsigned int vm_start; /* 0 8 */ long unsigned int vm_end; /* 8 8 */ }; /* 0 16 */ struct callback_head vm_rcu; /* 0 16 */ } __attribute__((__aligned__(8))); /* 0 16 */ struct mm_struct * vm_mm; /* 16 8 */ pgprot_t vm_page_prot; /* 24 8 */ union { const vm_flags_t vm_flags; /* 32 8 */ vm_flags_t __vm_flags; /* 32 8 */ }; /* 32 8 */ bool detached; /* 40 1 */ /* XXX 7 bytes hole, try to pack */ long int vm_lock_seq; /* 48 8 */ struct vma_lock * vm_lock; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct { struct rb_node rb; /* 64 24 */ long unsigned int rb_subtree_last; /* 88 8 */ } /* 64 32 */ struct list_head anon_vma_chain; /* 96 16 */ struct anon_vma * anon_vma; /* 112 8 */ const struct vm_operations_struct * vm_ops; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ long unsigned int vm_pgoff; /* 128 8 */ struct file * vm_file; /* 136 8 */ void * vm_private_data; /* 144 8 */ atomic_long_t swap_readahead_info; /* 152 8 */ struct mempolicy * vm_policy; /* 160 8 */ struct vma_numab_state * numab_state; /* 168 8 */ struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 176 8 */ /* size: 184, cachelines: 3, members: 18 */ /* sum members: 177, holes: 1, sum holes: 7 */ /* forced alignments: 2 */ /* last cacheline: 56 bytes */ } __attribute__((__aligned__(8))); > > > > > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but > > > if it seems important enough, I have nothing against using __u64. > > > > > > Suggested-by: Jann Horn <jannh@google.com> > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org> > > > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> >
On Thu, Oct 17, 2024 at 11:55 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Wed, Oct 16, 2024 at 7:02 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > > On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > > > > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote: > > > > To increase mm->mm_lock_seq robustness, switch it from int to long, so > > > > that it's a 64-bit counter on 64-bit systems and we can stop worrying > > > > about it wrapping around in just ~4 billion iterations. Same goes for > > > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq. > > > > vm_lock_seq does not need to be long but for consistency I guess that > > How come, we literally assign vm_lock_seq from mm_lock_seq and do > direct comparisons. They have to be exactly the same type, no? Not necessarily. vm_lock_seq is a snapshot of the mm_lock_seq but it does not have to be a "complete" snapshot. Just something that has a very high probability of identifying a match and a rare false positive is not a problem (see comment in https://elixir.bootlin.com/linux/v6.11.3/source/include/linux/mm.h#L678). So, something like this for taking and comparing a snapshot would do: vma->vm_lock_seq = (unsigned int)mm->mm_lock_seq; if (vma->vm_lock_seq == (unsigned int)mm->mm_lock_seq) > > > makes sense. While at it, can you please change these seq counters to > > be unsigned? > > There is `vma->vm_lock_seq = -1;` in kernel/fork.c, should it be > switched to ULONG_MAX then? In general, unless this is critical for > correctness, I'd very much like stuff like this to be done in the mm > tree afterwards, but it seems trivial enough, so if you insist I'll do > it. Yeah, ULONG_MAX should work fine here. vma->vm_lock_seq is initialized to -1 to avoid false initial match with mm->mm_lock_seq which is initialized to 0. As I said, a false match is not a problem but if we can avoid it, that's better. > > > Also, did you check with pahole if the vm_area_struct layout change > > pushes some members into a difference cacheline or creates new gaps? > > > > Just did. We had 3 byte hole after `bool detached;`, it now grew to 7 > bytes (so +4) and then vm_lock_seq itself is now 8 bytes (so +4), > which now does push rb and rb_subtree_last into *THE SAME* cache line > (which sounds like an improvement to me). vm_lock_seq and vm_lock stay > in the same cache line. vm_pgoff and vm_file are now in the same cache > line, and given they are probably always accessed together, seems like > a good accidental change as well. See below pahole outputs before and > after. Ok, sounds good to me. Looks like keeping both sequence numbers 64bit is not an issue. Changing them to unsigned would be nice and trivial but I don't insist. You can add: Reviewed-by: Suren Baghdasaryan <surenb@google.com> > > That singular detached bool looks like a complete waste, tbh. Maybe it > would be better to roll it into vm_flags and save 8 bytes? (not that I > want to do those mm changes in this patch set, of course...). > vm_area_struct is otherwise nicely tightly packed. > > tl;dr, seems fine, and detached would be best to get rid of, if > possible (but that's a completely separate thing) Yeah, I'll take a look at that. Thanks! > > BEFORE > ====== > struct vm_area_struct { > union { > struct { > long unsigned int vm_start; /* 0 8 */ > long unsigned int vm_end; /* 8 8 */ > }; /* 0 16 */ > struct callback_head vm_rcu; /* 0 16 */ > } __attribute__((__aligned__(8))); /* 0 16 */ > struct mm_struct * vm_mm; /* 16 8 */ > pgprot_t vm_page_prot; /* 24 8 */ > union { > const vm_flags_t vm_flags; /* 32 8 */ > vm_flags_t __vm_flags; /* 32 8 */ > }; /* 32 8 */ > bool detached; /* 40 1 */ > > /* XXX 3 bytes hole, try to pack */ > > int vm_lock_seq; /* 44 4 */ > struct vma_lock * vm_lock; /* 48 8 */ > struct { > struct rb_node rb; /* 56 24 */ > /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ > long unsigned int rb_subtree_last; /* 80 8 */ > } /* 56 32 */ > struct list_head anon_vma_chain; /* 88 16 */ > struct anon_vma * anon_vma; /* 104 8 */ > const struct vm_operations_struct * vm_ops; /* 112 8 */ > long unsigned int vm_pgoff; /* 120 8 */ > /* --- cacheline 2 boundary (128 bytes) --- */ > struct file * vm_file; /* 128 8 */ > void * vm_private_data; /* 136 8 */ > atomic_long_t swap_readahead_info; /* 144 8 */ > struct mempolicy * vm_policy; /* 152 8 */ > struct vma_numab_state * numab_state; /* 160 8 */ > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 168 8 */ > > /* size: 176, cachelines: 3, members: 18 */ > /* sum members: 173, holes: 1, sum holes: 3 */ > /* forced alignments: 2 */ > /* last cacheline: 48 bytes */ > } __attribute__((__aligned__(8))); > > AFTER > ===== > struct vm_area_struct { > union { > struct { > long unsigned int vm_start; /* 0 8 */ > long unsigned int vm_end; /* 8 8 */ > }; /* 0 16 */ > struct callback_head vm_rcu; /* 0 16 */ > } __attribute__((__aligned__(8))); /* 0 16 */ > struct mm_struct * vm_mm; /* 16 8 */ > pgprot_t vm_page_prot; /* 24 8 */ > union { > const vm_flags_t vm_flags; /* 32 8 */ > vm_flags_t __vm_flags; /* 32 8 */ > }; /* 32 8 */ > bool detached; /* 40 1 */ > > /* XXX 7 bytes hole, try to pack */ > > long int vm_lock_seq; /* 48 8 */ > struct vma_lock * vm_lock; /* 56 8 */ > /* --- cacheline 1 boundary (64 bytes) --- */ > struct { > struct rb_node rb; /* 64 24 */ > long unsigned int rb_subtree_last; /* 88 8 */ > } /* 64 32 */ > struct list_head anon_vma_chain; /* 96 16 */ > struct anon_vma * anon_vma; /* 112 8 */ > const struct vm_operations_struct * vm_ops; /* 120 8 */ > /* --- cacheline 2 boundary (128 bytes) --- */ > long unsigned int vm_pgoff; /* 128 8 */ > struct file * vm_file; /* 136 8 */ > void * vm_private_data; /* 144 8 */ > atomic_long_t swap_readahead_info; /* 152 8 */ > struct mempolicy * vm_policy; /* 160 8 */ > struct vma_numab_state * numab_state; /* 168 8 */ > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 176 8 */ > > /* size: 184, cachelines: 3, members: 18 */ > /* sum members: 177, holes: 1, sum holes: 7 */ > /* forced alignments: 2 */ > /* last cacheline: 56 bytes */ > } __attribute__((__aligned__(8))); > > > > > > > > > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but > > > > if it seems important enough, I have nothing against using __u64. > > > > > > > > Suggested-by: Jann Horn <jannh@google.com> > > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org> > > > > > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> > >
On Thu, Oct 17, 2024 at 12:42 PM Suren Baghdasaryan <surenb@google.com> wrote: > > On Thu, Oct 17, 2024 at 11:55 AM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Wed, Oct 16, 2024 at 7:02 PM Suren Baghdasaryan <surenb@google.com> wrote: > > > > > > On Sun, Oct 13, 2024 at 12:56 AM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > > > > > > > On Thu, Oct 10, 2024 at 01:56:42PM GMT, Andrii Nakryiko wrote: > > > > > To increase mm->mm_lock_seq robustness, switch it from int to long, so > > > > > that it's a 64-bit counter on 64-bit systems and we can stop worrying > > > > > about it wrapping around in just ~4 billion iterations. Same goes for > > > > > VMA's matching vm_lock_seq, which is derived from mm_lock_seq. > > > > > > vm_lock_seq does not need to be long but for consistency I guess that > > > > How come, we literally assign vm_lock_seq from mm_lock_seq and do > > direct comparisons. They have to be exactly the same type, no? > > Not necessarily. vm_lock_seq is a snapshot of the mm_lock_seq but it > does not have to be a "complete" snapshot. Just something that has a > very high probability of identifying a match and a rare false positive > is not a problem (see comment in > https://elixir.bootlin.com/linux/v6.11.3/source/include/linux/mm.h#L678). > So, something like this for taking and comparing a snapshot would do: > > vma->vm_lock_seq = (unsigned int)mm->mm_lock_seq; > if (vma->vm_lock_seq == (unsigned int)mm->mm_lock_seq) Ah, ok, I see what's the idea behind it, makes sense. > > > > > > makes sense. While at it, can you please change these seq counters to > > > be unsigned? > > > > There is `vma->vm_lock_seq = -1;` in kernel/fork.c, should it be > > switched to ULONG_MAX then? In general, unless this is critical for > > correctness, I'd very much like stuff like this to be done in the mm > > tree afterwards, but it seems trivial enough, so if you insist I'll do > > it. > > Yeah, ULONG_MAX should work fine here. vma->vm_lock_seq is initialized > to -1 to avoid false initial match with mm->mm_lock_seq which is > initialized to 0. As I said, a false match is not a problem but if we > can avoid it, that's better. ok, ULONG_MAX and unsigned long it is, will update > > > > > > Also, did you check with pahole if the vm_area_struct layout change > > > pushes some members into a difference cacheline or creates new gaps? > > > > > > > Just did. We had 3 byte hole after `bool detached;`, it now grew to 7 > > bytes (so +4) and then vm_lock_seq itself is now 8 bytes (so +4), > > which now does push rb and rb_subtree_last into *THE SAME* cache line > > (which sounds like an improvement to me). vm_lock_seq and vm_lock stay > > in the same cache line. vm_pgoff and vm_file are now in the same cache > > line, and given they are probably always accessed together, seems like > > a good accidental change as well. See below pahole outputs before and > > after. > > Ok, sounds good to me. Looks like keeping both sequence numbers 64bit > is not an issue. Changing them to unsigned would be nice and trivial > but I don't insist. You can add: > > Reviewed-by: Suren Baghdasaryan <surenb@google.com> thanks, will switch to unsigned in the next revision (next week, probably, to let some of the pending patches land) > > > > > That singular detached bool looks like a complete waste, tbh. Maybe it > > would be better to roll it into vm_flags and save 8 bytes? (not that I > > want to do those mm changes in this patch set, of course...). > > vm_area_struct is otherwise nicely tightly packed. > > > > tl;dr, seems fine, and detached would be best to get rid of, if > > possible (but that's a completely separate thing) > > Yeah, I'll take a look at that. Thanks! > > > > > BEFORE > > ====== > > struct vm_area_struct { > > union { > > struct { > > long unsigned int vm_start; /* 0 8 */ > > long unsigned int vm_end; /* 8 8 */ > > }; /* 0 16 */ > > struct callback_head vm_rcu; /* 0 16 */ > > } __attribute__((__aligned__(8))); /* 0 16 */ > > struct mm_struct * vm_mm; /* 16 8 */ > > pgprot_t vm_page_prot; /* 24 8 */ > > union { > > const vm_flags_t vm_flags; /* 32 8 */ > > vm_flags_t __vm_flags; /* 32 8 */ > > }; /* 32 8 */ > > bool detached; /* 40 1 */ > > > > /* XXX 3 bytes hole, try to pack */ > > > > int vm_lock_seq; /* 44 4 */ > > struct vma_lock * vm_lock; /* 48 8 */ > > struct { > > struct rb_node rb; /* 56 24 */ > > /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ > > long unsigned int rb_subtree_last; /* 80 8 */ > > } /* 56 32 */ > > struct list_head anon_vma_chain; /* 88 16 */ > > struct anon_vma * anon_vma; /* 104 8 */ > > const struct vm_operations_struct * vm_ops; /* 112 8 */ > > long unsigned int vm_pgoff; /* 120 8 */ > > /* --- cacheline 2 boundary (128 bytes) --- */ > > struct file * vm_file; /* 128 8 */ > > void * vm_private_data; /* 136 8 */ > > atomic_long_t swap_readahead_info; /* 144 8 */ > > struct mempolicy * vm_policy; /* 152 8 */ > > struct vma_numab_state * numab_state; /* 160 8 */ > > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 168 8 */ > > > > /* size: 176, cachelines: 3, members: 18 */ > > /* sum members: 173, holes: 1, sum holes: 3 */ > > /* forced alignments: 2 */ > > /* last cacheline: 48 bytes */ > > } __attribute__((__aligned__(8))); > > > > AFTER > > ===== > > struct vm_area_struct { > > union { > > struct { > > long unsigned int vm_start; /* 0 8 */ > > long unsigned int vm_end; /* 8 8 */ > > }; /* 0 16 */ > > struct callback_head vm_rcu; /* 0 16 */ > > } __attribute__((__aligned__(8))); /* 0 16 */ > > struct mm_struct * vm_mm; /* 16 8 */ > > pgprot_t vm_page_prot; /* 24 8 */ > > union { > > const vm_flags_t vm_flags; /* 32 8 */ > > vm_flags_t __vm_flags; /* 32 8 */ > > }; /* 32 8 */ > > bool detached; /* 40 1 */ > > > > /* XXX 7 bytes hole, try to pack */ > > > > long int vm_lock_seq; /* 48 8 */ > > struct vma_lock * vm_lock; /* 56 8 */ > > /* --- cacheline 1 boundary (64 bytes) --- */ > > struct { > > struct rb_node rb; /* 64 24 */ > > long unsigned int rb_subtree_last; /* 88 8 */ > > } /* 64 32 */ > > struct list_head anon_vma_chain; /* 96 16 */ > > struct anon_vma * anon_vma; /* 112 8 */ > > const struct vm_operations_struct * vm_ops; /* 120 8 */ > > /* --- cacheline 2 boundary (128 bytes) --- */ > > long unsigned int vm_pgoff; /* 128 8 */ > > struct file * vm_file; /* 136 8 */ > > void * vm_private_data; /* 144 8 */ > > atomic_long_t swap_readahead_info; /* 152 8 */ > > struct mempolicy * vm_policy; /* 160 8 */ > > struct vma_numab_state * numab_state; /* 168 8 */ > > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 176 8 */ > > > > /* size: 184, cachelines: 3, members: 18 */ > > /* sum members: 177, holes: 1, sum holes: 7 */ > > /* forced alignments: 2 */ > > /* last cacheline: 56 bytes */ > > } __attribute__((__aligned__(8))); > > > > > > > > > > > > > > I didn't use __u64 outright to keep 32-bit architectures unaffected, but > > > > > if it seems important enough, I have nothing against using __u64. > > > > > > > > > > Suggested-by: Jann Horn <jannh@google.com> > > > > > Signed-off-by: Andrii Nakryiko <andrii@kernel.org> > > > > > > > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> > > >
© 2016 - 2024 Red Hat, Inc.