[v1] move per-vma lock into vm_area_struct

[PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 2 months ago

rw_semaphore is a sizable structure of 40 bytes and consumes
considerable space for each vm_area_struct. However vma_lock has
two important specifics which can be used to replace rw_semaphore
with a simpler structure:
1. Readers never wait. They try to take the vma_lock and fall back to
mmap_lock if that fails.
2. Only one writer at a time will ever try to write-lock a vma_lock
because writers first take mmap_lock in write mode.
Because of these requirements, full rw_semaphore functionality is not
needed and we can replace rw_semaphore with an atomic variable.
When a reader takes read lock, it increments the atomic, unless the
top two bits are set indicating a writer is present.
When writer takes write lock, it sets VMA_LOCK_WR_LOCKED bit if there
are no readers or VMA_LOCK_WR_WAIT bit if readers are holding the lock
and puts itself onto newly introduced mm.vma_writer_wait. Since all
writers take mmap_lock in write mode first, there can be only one writer
at a time. The last reader to release the lock will signal the writer
to wake up.
atomic_t might overflow if there are many competing readers, therefore
vma_start_read() implements an overflow check and if that occurs it
exits with a failure to lock. vma_start_read_locked{_nested} may cause
an overflow but it is later handled by __vma_end_read().

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h        | 142 ++++++++++++++++++++++++++++++++++----
 include/linux/mm_types.h  |  18 ++++-
 include/linux/mmap_lock.h |   3 +
 kernel/fork.c             |   2 +-
 mm/init-mm.c              |   2 +
 5 files changed, 151 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c1c2899464db..27c0e9ba81c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -686,7 +686,41 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
 #ifdef CONFIG_PER_VMA_LOCK
 static inline void vma_lock_init(struct vma_lock *vm_lock)
 {
-	init_rwsem(&vm_lock->lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	static struct lock_class_key lockdep_key;
+
+	lockdep_init_map(&vm_lock->dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+	atomic_set(&vm_lock->count, VMA_LOCK_UNLOCKED);
+}
+
+static inline unsigned int vma_lock_reader_count(unsigned int counter)
+{
+	return counter & VMA_LOCK_RD_COUNT_MASK;
+}
+
+static inline void __vma_end_read(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	unsigned int count, prev, new;
+
+	count = (unsigned int)atomic_read(&vma->vm_lock.count);
+	for (;;) {
+		if (unlikely(vma_lock_reader_count(count) == 0)) {
+			/*
+			 * Overflow was possible in vma_start_read_locked().
+			 * When detected, wrap around preserving writer bits.
+			 */
+			new = count | ~(VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT);
+		} else
+			new = count - 1;
+		prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+		if (prev == count)
+			break;
+		count = prev;
+	}
+	rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+	if (vma_lock_reader_count(new) == 0 && (new & VMA_LOCK_WR_WAIT))
+		wake_up(&mm->vma_writer_wait);
 }
 
 /*
@@ -696,6 +730,9 @@ static inline void vma_lock_init(struct vma_lock *vm_lock)
  */
 static inline bool vma_start_read(struct vm_area_struct *vma)
 {
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned int count, prev, new;
+
 	/*
 	 * Check before locking. A race might cause false locked result.
 	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
@@ -703,11 +740,35 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	 * we don't rely on for anything - the mm_lock_seq read against which we
 	 * need ordering is below.
 	 */
-	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
+	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
-		return false;
+	rwsem_acquire_read(&vma->vm_lock.dep_map, 0, 0, _RET_IP_);
+	count = (unsigned int)atomic_read(&vma->vm_lock.count);
+	for (;;) {
+		/* Is VMA is write-locked or writer is waiting? */
+		if (count & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)) {
+			rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+			return false;
+		}
+
+		new = count + 1;
+		/* If atomic_t overflows, fail to lock. */
+		if (new & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)) {
+			rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+			return false;
+		}
+
+		/*
+		 * Atomic RMW will provide implicit mb on success to pair with smp_wmb in
+		 * vma_start_write, on failure we retry.
+		 */
+		prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+		if (prev == count)
+			break;
+		count = prev;
+	}
+	lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
 
 	/*
 	 * Overflow might produce false locked result.
@@ -720,8 +781,8 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	 * after it has been unlocked.
 	 * This pairs with RELEASE semantics in vma_end_write_all().
 	 */
-	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->vm_lock.lock);
+	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+		__vma_end_read(mm, vma);
 		return false;
 	}
 	return true;
@@ -733,8 +794,30 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
  */
 static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
 {
-	mmap_assert_locked(vma->vm_mm);
-	down_read_nested(&vma->vm_lock.lock, subclass);
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned int count, prev, new;
+
+	mmap_assert_locked(mm);
+
+	rwsem_acquire_read(&vma->vm_lock.dep_map, subclass, 0, _RET_IP_);
+	count = (unsigned int)atomic_read(&vma->vm_lock.count);
+	for (;;) {
+		/* We are holding mmap_lock, no active or waiting writers are possible. */
+		VM_BUG_ON_VMA(count & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT), vma);
+		new = count + 1;
+		/* Unlikely but if atomic_t overflows, wrap around to. */
+		if (WARN_ON(new & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)))
+			new = 0;
+		/*
+		 * Atomic RMW will provide implicit mb on success to pair with smp_wmb in
+		 * vma_start_write, on failure we retry.
+		 */
+		prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+		if (prev == count)
+			break;
+		count = prev;
+	}
+	lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
 }
 
 /*
@@ -743,14 +826,15 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int
  */
 static inline void vma_start_read_locked(struct vm_area_struct *vma)
 {
-	mmap_assert_locked(vma->vm_mm);
-	down_read(&vma->vm_lock.lock);
+	vma_start_read_locked_nested(vma, 0);
 }
 
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
+	struct mm_struct *mm = vma->vm_mm;
+
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->vm_lock.lock);
+	__vma_end_read(mm, vma);
 	rcu_read_unlock();
 }
 
@@ -774,12 +858,34 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l
  */
 static inline void vma_start_write(struct vm_area_struct *vma)
 {
+	unsigned int count, prev, new;
 	unsigned int mm_lock_seq;
 
+	might_sleep();
 	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
 
-	down_write(&vma->vm_lock.lock);
+	rwsem_acquire(&vma->vm_lock.dep_map, 0, 0, _RET_IP_);
+	count = (unsigned int)atomic_read(&vma->vm_lock.count);
+	for (;;) {
+		if (vma_lock_reader_count(count) > 0)
+			new = count | VMA_LOCK_WR_WAIT;
+		else
+			new = count | VMA_LOCK_WR_LOCKED;
+		prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+		if (prev == count)
+			break;
+		count = prev;
+	}
+	if (new & VMA_LOCK_WR_WAIT) {
+		lock_contended(&vma->vm_lock.dep_map, _RET_IP_);
+		wait_event(vma->vm_mm->vma_writer_wait,
+			   atomic_cmpxchg(&vma->vm_lock.count, VMA_LOCK_WR_WAIT,
+					  VMA_LOCK_WR_LOCKED) == VMA_LOCK_WR_WAIT);
+
+	}
+	lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
+
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
 	 * from the early lockless pessimistic check in vma_start_read().
@@ -787,7 +893,10 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
 	 */
 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-	up_write(&vma->vm_lock.lock);
+	/* Write barrier to ensure vm_lock_seq change is visible before count */
+	smp_wmb();
+	rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+	atomic_set(&vma->vm_lock.count, VMA_LOCK_UNLOCKED);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -797,9 +906,14 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
 }
 
+static inline bool is_vma_read_locked(struct vm_area_struct *vma)
+{
+	return vma_lock_reader_count((unsigned int)atomic_read(&vma->vm_lock.count)) > 0;
+}
+
 static inline void vma_assert_locked(struct vm_area_struct *vma)
 {
-	if (!rwsem_is_locked(&vma->vm_lock.lock))
+	if (!is_vma_read_locked(vma))
 		vma_assert_write_locked(vma);
 }
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5c4bfdcfac72..789bccc05520 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -615,8 +615,23 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
 }
 #endif
 
+#define VMA_LOCK_UNLOCKED		0
+#define VMA_LOCK_WR_LOCKED		(1 << 31)
+#define VMA_LOCK_WR_WAIT		(1 << 30)
+
+#define VMA_LOCK_RD_COUNT_MASK		(VMA_LOCK_WR_WAIT - 1)
+
 struct vma_lock {
-	struct rw_semaphore lock;
+	/*
+	 * count & VMA_LOCK_RD_COUNT_MASK > 0 ==> read-locked with 'count' number of readers
+	 * count & VMA_LOCK_WR_LOCKED != 0 ==> write-locked
+	 * count & VMA_LOCK_WR_WAIT != 0 ==> writer is waiting
+	 * count = 0 ==> unlocked
+	 */
+	atomic_t count;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
 };
 
 struct vma_numab_state {
@@ -883,6 +898,7 @@ struct mm_struct {
 					  * by mmlist_lock
 					  */
 #ifdef CONFIG_PER_VMA_LOCK
+		struct wait_queue_head vma_writer_wait;
 		/*
 		 * This field has lock-like semantics, meaning it is sometimes
 		 * accessed with ACQUIRE/RELEASE semantics.
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 58dde2e35f7e..769ab97fff3e 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -121,6 +121,9 @@ static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
 	mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+	init_waitqueue_head(&mm->vma_writer_wait);
+#endif
 }
 
 static inline void mmap_write_lock(struct mm_struct *mm)
diff --git a/kernel/fork.c b/kernel/fork.c
index 9e504105f24f..726050c557e2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -486,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
+	VM_BUG_ON_VMA(is_vma_read_locked(vma), vma);
 	__vm_area_free(vma);
 }
 #endif
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 6af3ad675930..db058873ba18 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,6 +40,8 @@ struct mm_struct init_mm = {
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 #ifdef CONFIG_PER_VMA_LOCK
+	.vma_writer_wait =
+		__WAIT_QUEUE_HEAD_INITIALIZER(init_mm.vma_writer_wait),
 	.mm_lock_seq	= SEQCNT_ZERO(init_mm.mm_lock_seq),
 #endif
 	.user_ns	= &init_user_ns,
-- 
2.47.0.277.g8800431eea-goog

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Matthew Wilcox 1 year, 2 months ago

On Mon, Nov 11, 2024 at 12:55:05PM -0800, Suren Baghdasaryan wrote:
> When a reader takes read lock, it increments the atomic, unless the
> top two bits are set indicating a writer is present.
> When writer takes write lock, it sets VMA_LOCK_WR_LOCKED bit if there
> are no readers or VMA_LOCK_WR_WAIT bit if readers are holding the lock
> and puts itself onto newly introduced mm.vma_writer_wait. Since all
> writers take mmap_lock in write mode first, there can be only one writer
> at a time. The last reader to release the lock will signal the writer
> to wake up.

I don't think you need two bits.  You can do it this way:

0x8000'0000 - No readers, no writers
0x1-7fff'ffff - Some number of readers
0x0 - Writer held
0x8000'0001-0xffff'ffff - Reader held, writer waiting

A prospective writer subtracts 0x8000'0000.  If the result is 0, it got
the lock, otherwise it sleeps until it is 0.

A writer unlocks by adding 0x8000'0000 (not by setting the value to
0x8000'0000).

A reader unlocks by adding 1.  If the result is 0, it wakes the writer.

A prospective reader subtracts 1.  If the result is positive, it got the
lock, otherwise it does the unlock above (this might be the one which
wakes the writer).

And ... that's it.  See how we use the CPU arithmetic flags to tell us
everything we need to know without doing arithmetic separately?

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 2 months ago

On Mon, Nov 11, 2024 at 8:58 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Nov 11, 2024 at 12:55:05PM -0800, Suren Baghdasaryan wrote:
> > When a reader takes read lock, it increments the atomic, unless the
> > top two bits are set indicating a writer is present.
> > When writer takes write lock, it sets VMA_LOCK_WR_LOCKED bit if there
> > are no readers or VMA_LOCK_WR_WAIT bit if readers are holding the lock
> > and puts itself onto newly introduced mm.vma_writer_wait. Since all
> > writers take mmap_lock in write mode first, there can be only one writer
> > at a time. The last reader to release the lock will signal the writer
> > to wake up.
>
> I don't think you need two bits.  You can do it this way:
>
> 0x8000'0000 - No readers, no writers
> 0x1-7fff'ffff - Some number of readers
> 0x0 - Writer held
> 0x8000'0001-0xffff'ffff - Reader held, writer waiting
>
> A prospective writer subtracts 0x8000'0000.  If the result is 0, it got
> the lock, otherwise it sleeps until it is 0.
>
> A writer unlocks by adding 0x8000'0000 (not by setting the value to
> 0x8000'0000).
>
> A reader unlocks by adding 1.  If the result is 0, it wakes the writer.
>
> A prospective reader subtracts 1.  If the result is positive, it got the
> lock, otherwise it does the unlock above (this might be the one which
> wakes the writer).
>
> And ... that's it.  See how we use the CPU arithmetic flags to tell us
> everything we need to know without doing arithmetic separately?

Yes, this is neat! You are using the fact that write-locked == no
readers to eliminate unnecessary state. I'll give that a try. Thanks!

>

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Peter Zijlstra 1 year, 1 month ago

On Tue, Nov 12, 2024 at 07:18:45AM -0800, Suren Baghdasaryan wrote:
> On Mon, Nov 11, 2024 at 8:58 PM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Mon, Nov 11, 2024 at 12:55:05PM -0800, Suren Baghdasaryan wrote:
> > > When a reader takes read lock, it increments the atomic, unless the
> > > top two bits are set indicating a writer is present.
> > > When writer takes write lock, it sets VMA_LOCK_WR_LOCKED bit if there
> > > are no readers or VMA_LOCK_WR_WAIT bit if readers are holding the lock
> > > and puts itself onto newly introduced mm.vma_writer_wait. Since all
> > > writers take mmap_lock in write mode first, there can be only one writer
> > > at a time. The last reader to release the lock will signal the writer
> > > to wake up.
> >
> > I don't think you need two bits.  You can do it this way:
> >
> > 0x8000'0000 - No readers, no writers
> > 0x1-7fff'ffff - Some number of readers
> > 0x0 - Writer held
> > 0x8000'0001-0xffff'ffff - Reader held, writer waiting
> >
> > A prospective writer subtracts 0x8000'0000.  If the result is 0, it got
> > the lock, otherwise it sleeps until it is 0.
> >
> > A writer unlocks by adding 0x8000'0000 (not by setting the value to
> > 0x8000'0000).
> >
> > A reader unlocks by adding 1.  If the result is 0, it wakes the writer.
> >
> > A prospective reader subtracts 1.  If the result is positive, it got the
> > lock, otherwise it does the unlock above (this might be the one which
> > wakes the writer).
> >
> > And ... that's it.  See how we use the CPU arithmetic flags to tell us
> > everything we need to know without doing arithmetic separately?
> 
> Yes, this is neat! You are using the fact that write-locked == no
> readers to eliminate unnecessary state. I'll give that a try. Thanks!

The reason I got here is that Vlastimil poked me about the whole
TYPESAFE_BY_RCU thing.

So the normal way those things work is with a refcount, if the refcount
is non-zero, the identifying fields should be stable and you can
determine if you have the right object, otherwise tough luck.

And I was thinking that since you abuse this rwsem you have, you might
as well turn that into a refcount with some extra.

So I would propose a slightly different solution.

Replace vm_lock with vm_refcnt. Replace vm_detached with vm_refcnt == 0
-- that is, attach sets refcount to 1 to indicate it is part of the mas,
detached is the final 'put'.

RCU lookup does the inc_not_zero thing, when increment succeeds, compare
mm/addr to validate.

vma_start_write() already relies on mmap_lock being held for writing,
and thus does not have to worry about writer-vs-writer contention, that
is fully resolved by mmap_sem. This means we only need to wait for
readers to drop out.

vma_start_write()
	add(0x8000'0001); // could fetch_add and double check the high
			  // bit wasn't already set.
	wait-until(refcnt == 0x8000'0002); // mas + writer ref
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
	sub(0x8000'0000);

vma_end_write()
	put();

vma_start_read() then becomes something like:

	if (vm_lock_seq == mm_lock_seq)
	  return false;

	cnt = fetch_inc(1);
	if (cnt & msb || vm_lock_seq == mm_lock_seq) {
	  put();
	  return false;
	}

	return true;
	
vma_end_read() then becomes:
	put();


and the down_read() from uffffffd requires mmap_read_lock() and thus
does not have to worry about writers, it can simpy be inc() and put(),
no?

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Tue, Dec 10, 2024 at 2:39 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Nov 12, 2024 at 07:18:45AM -0800, Suren Baghdasaryan wrote:
> > On Mon, Nov 11, 2024 at 8:58 PM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Mon, Nov 11, 2024 at 12:55:05PM -0800, Suren Baghdasaryan wrote:
> > > > When a reader takes read lock, it increments the atomic, unless the
> > > > top two bits are set indicating a writer is present.
> > > > When writer takes write lock, it sets VMA_LOCK_WR_LOCKED bit if there
> > > > are no readers or VMA_LOCK_WR_WAIT bit if readers are holding the lock
> > > > and puts itself onto newly introduced mm.vma_writer_wait. Since all
> > > > writers take mmap_lock in write mode first, there can be only one writer
> > > > at a time. The last reader to release the lock will signal the writer
> > > > to wake up.
> > >
> > > I don't think you need two bits.  You can do it this way:
> > >
> > > 0x8000'0000 - No readers, no writers
> > > 0x1-7fff'ffff - Some number of readers
> > > 0x0 - Writer held
> > > 0x8000'0001-0xffff'ffff - Reader held, writer waiting
> > >
> > > A prospective writer subtracts 0x8000'0000.  If the result is 0, it got
> > > the lock, otherwise it sleeps until it is 0.
> > >
> > > A writer unlocks by adding 0x8000'0000 (not by setting the value to
> > > 0x8000'0000).
> > >
> > > A reader unlocks by adding 1.  If the result is 0, it wakes the writer.
> > >
> > > A prospective reader subtracts 1.  If the result is positive, it got the
> > > lock, otherwise it does the unlock above (this might be the one which
> > > wakes the writer).
> > >
> > > And ... that's it.  See how we use the CPU arithmetic flags to tell us
> > > everything we need to know without doing arithmetic separately?
> >
> > Yes, this is neat! You are using the fact that write-locked == no
> > readers to eliminate unnecessary state. I'll give that a try. Thanks!
>
> The reason I got here is that Vlastimil poked me about the whole
> TYPESAFE_BY_RCU thing.
>
> So the normal way those things work is with a refcount, if the refcount
> is non-zero, the identifying fields should be stable and you can
> determine if you have the right object, otherwise tough luck.
>
> And I was thinking that since you abuse this rwsem you have, you might
> as well turn that into a refcount with some extra.
>
> So I would propose a slightly different solution.
>
> Replace vm_lock with vm_refcnt. Replace vm_detached with vm_refcnt == 0
> -- that is, attach sets refcount to 1 to indicate it is part of the mas,
> detached is the final 'put'.

I need to double-check if we ever write-lock a detached vma. I don't
think we do but better be safe. If we do then that wait-until() should
accept 0x8000'0001 as well.

>
> RCU lookup does the inc_not_zero thing, when increment succeeds, compare
> mm/addr to validate.
>
> vma_start_write() already relies on mmap_lock being held for writing,
> and thus does not have to worry about writer-vs-writer contention, that
> is fully resolved by mmap_sem. This means we only need to wait for
> readers to drop out.
>
> vma_start_write()
>         add(0x8000'0001); // could fetch_add and double check the high
>                           // bit wasn't already set.
>         wait-until(refcnt == 0x8000'0002); // mas + writer ref
>         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
>         sub(0x8000'0000);
>
> vma_end_write()
>         put();

We don't really have vma_end_write(). Instead it's vma_end_write_all()
which increments mm_lock_seq unlocking all write-locked VMAs.
Therefore in vma_start_write() I think we can sub(0x8000'0001) at the
end.

>
> vma_start_read() then becomes something like:
>
>         if (vm_lock_seq == mm_lock_seq)
>           return false;
>
>         cnt = fetch_inc(1);
>         if (cnt & msb || vm_lock_seq == mm_lock_seq) {
>           put();
>           return false;
>         }
>
>         return true;
>
> vma_end_read() then becomes:
>         put();
>
>
> and the down_read() from uffffffd requires mmap_read_lock() and thus
> does not have to worry about writers, it can simpy be inc() and put(),
> no?

I think your proposal should work. Let me try to code it and see if
something breaks.
Thanks Peter!

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Peter Zijlstra 1 year, 1 month ago

On Tue, Dec 10, 2024 at 03:37:50PM -0800, Suren Baghdasaryan wrote:

> > Replace vm_lock with vm_refcnt. Replace vm_detached with vm_refcnt == 0
> > -- that is, attach sets refcount to 1 to indicate it is part of the mas,
> > detached is the final 'put'.
> 
> I need to double-check if we ever write-lock a detached vma. I don't
> think we do but better be safe. If we do then that wait-until() should
> accept 0x8000'0001 as well.

vma_start_write()
  __is_vma_write_locked()
    mmap_assert_write_locked(vma->vm_mm);

So this really should hold afaict.

> > RCU lookup does the inc_not_zero thing, when increment succeeds, compare
> > mm/addr to validate.
> >
> > vma_start_write() already relies on mmap_lock being held for writing,
> > and thus does not have to worry about writer-vs-writer contention, that
> > is fully resolved by mmap_sem. This means we only need to wait for
> > readers to drop out.
> >
> > vma_start_write()
> >         add(0x8000'0001); // could fetch_add and double check the high
> >                           // bit wasn't already set.
> >         wait-until(refcnt == 0x8000'0002); // mas + writer ref
> >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> >         sub(0x8000'0000);
> >
> > vma_end_write()
> >         put();
> 
> We don't really have vma_end_write(). Instead it's vma_end_write_all()
> which increments mm_lock_seq unlocking all write-locked VMAs.
> Therefore in vma_start_write() I think we can sub(0x8000'0001) at the
> end.

Right, I know you don't, but you should :-), I've suggested adding this
before.

> > vma_start_read() then becomes something like:
> >
> >         if (vm_lock_seq == mm_lock_seq)
> >           return false;
> >
> >         cnt = fetch_inc(1);
> >         if (cnt & msb || vm_lock_seq == mm_lock_seq) {
> >           put();
> >           return false;
> >         }
> >
> >         return true;
> >
> > vma_end_read() then becomes:
> >         put();
> >
> >
> > and the down_read() from uffffffd requires mmap_read_lock() and thus
> > does not have to worry about writers, it can simpy be inc() and put(),
> > no?
> 
> I think your proposal should work. Let me try to code it and see if
> something breaks.

Btw, for the wait-until() and put() you can use rcuwait; that is the
simplest wait form we have. It's suitable because we only ever have the
one waiter.

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Wed, Dec 11, 2024 at 12:25 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Dec 10, 2024 at 03:37:50PM -0800, Suren Baghdasaryan wrote:
>
> > > Replace vm_lock with vm_refcnt. Replace vm_detached with vm_refcnt == 0
> > > -- that is, attach sets refcount to 1 to indicate it is part of the mas,
> > > detached is the final 'put'.
> >
> > I need to double-check if we ever write-lock a detached vma. I don't
> > think we do but better be safe. If we do then that wait-until() should
> > accept 0x8000'0001 as well.
>
> vma_start_write()
>   __is_vma_write_locked()
>     mmap_assert_write_locked(vma->vm_mm);
>
> So this really should hold afaict.
>
> > > RCU lookup does the inc_not_zero thing, when increment succeeds, compare
> > > mm/addr to validate.
> > >
> > > vma_start_write() already relies on mmap_lock being held for writing,
> > > and thus does not have to worry about writer-vs-writer contention, that
> > > is fully resolved by mmap_sem. This means we only need to wait for
> > > readers to drop out.
> > >
> > > vma_start_write()
> > >         add(0x8000'0001); // could fetch_add and double check the high
> > >                           // bit wasn't already set.
> > >         wait-until(refcnt == 0x8000'0002); // mas + writer ref
> > >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> > >         sub(0x8000'0000);
> > >
> > > vma_end_write()
> > >         put();
> >
> > We don't really have vma_end_write(). Instead it's vma_end_write_all()
> > which increments mm_lock_seq unlocking all write-locked VMAs.
> > Therefore in vma_start_write() I think we can sub(0x8000'0001) at the
> > end.
>
> Right, I know you don't, but you should :-), I've suggested adding this
> before.

I'll look into adding it. IIRC there was some issue with that but I
can't recall...

>
> > > vma_start_read() then becomes something like:
> > >
> > >         if (vm_lock_seq == mm_lock_seq)
> > >           return false;
> > >
> > >         cnt = fetch_inc(1);
> > >         if (cnt & msb || vm_lock_seq == mm_lock_seq) {
> > >           put();
> > >           return false;
> > >         }
> > >
> > >         return true;
> > >
> > > vma_end_read() then becomes:
> > >         put();
> > >
> > >
> > > and the down_read() from uffffffd requires mmap_read_lock() and thus
> > > does not have to worry about writers, it can simpy be inc() and put(),
> > > no?
> >
> > I think your proposal should work. Let me try to code it and see if
> > something breaks.
>
> Btw, for the wait-until() and put() you can use rcuwait; that is the
> simplest wait form we have. It's suitable because we only ever have the
> one waiter.

Yes, Davidlohr mentioned that before. I'll do that. Thanks!

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Wed, Dec 11, 2024 at 7:20 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Dec 11, 2024 at 12:25 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Dec 10, 2024 at 03:37:50PM -0800, Suren Baghdasaryan wrote:
> >
> > > > Replace vm_lock with vm_refcnt. Replace vm_detached with vm_refcnt == 0
> > > > -- that is, attach sets refcount to 1 to indicate it is part of the mas,
> > > > detached is the final 'put'.
> > >
> > > I need to double-check if we ever write-lock a detached vma. I don't
> > > think we do but better be safe. If we do then that wait-until() should
> > > accept 0x8000'0001 as well.
> >
> > vma_start_write()
> >   __is_vma_write_locked()
> >     mmap_assert_write_locked(vma->vm_mm);
> >
> > So this really should hold afaict.
> >
> > > > RCU lookup does the inc_not_zero thing, when increment succeeds, compare
> > > > mm/addr to validate.
> > > >
> > > > vma_start_write() already relies on mmap_lock being held for writing,
> > > > and thus does not have to worry about writer-vs-writer contention, that
> > > > is fully resolved by mmap_sem. This means we only need to wait for
> > > > readers to drop out.
> > > >
> > > > vma_start_write()
> > > >         add(0x8000'0001); // could fetch_add and double check the high
> > > >                           // bit wasn't already set.
> > > >         wait-until(refcnt == 0x8000'0002); // mas + writer ref
> > > >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> > > >         sub(0x8000'0000);
> > > >
> > > > vma_end_write()
> > > >         put();
> > >
> > > We don't really have vma_end_write(). Instead it's vma_end_write_all()
> > > which increments mm_lock_seq unlocking all write-locked VMAs.
> > > Therefore in vma_start_write() I think we can sub(0x8000'0001) at the
> > > end.
> >
> > Right, I know you don't, but you should :-), I've suggested adding this
> > before.
>
> I'll look into adding it. IIRC there was some issue with that but I
> can't recall...
>
> >
> > > > vma_start_read() then becomes something like:
> > > >
> > > >         if (vm_lock_seq == mm_lock_seq)
> > > >           return false;
> > > >
> > > >         cnt = fetch_inc(1);
> > > >         if (cnt & msb || vm_lock_seq == mm_lock_seq) {
> > > >           put();
> > > >           return false;
> > > >         }
> > > >
> > > >         return true;
> > > >
> > > > vma_end_read() then becomes:
> > > >         put();
> > > >
> > > >
> > > > and the down_read() from uffffffd requires mmap_read_lock() and thus
> > > > does not have to worry about writers, it can simpy be inc() and put(),
> > > > no?
> > >
> > > I think your proposal should work. Let me try to code it and see if
> > > something breaks.

Ok, I tried it out and things are a bit more complex:
1. We should allow write-locking a detached VMA, IOW vma_start_write()
can be called when vm_refcnt is 0. In that situation add(0x8000'0001)
leads to "addition on 0; use-after-free". Maybe I can introduce a new
refcnt function which does not complain when adding to 0?
2. Adding 0x80000000 saturates refcnt, so I have to use a lower bit
0x40000000 to denote writers.
3. Currently vma_mark_attached() can be called on an already attached
VMA. With vma->detached being a separate attribute that works fine but
when we combine it with the vm_lock things break (extra attach would
leak into lock count). I'll see if I can catch all the cases when we
do this and clean them up (not call vma_mark_attached() when not
necessary).

> >
> > Btw, for the wait-until() and put() you can use rcuwait; that is the
> > simplest wait form we have. It's suitable because we only ever have the
> > one waiter.
>
> Yes, Davidlohr mentioned that before. I'll do that. Thanks!

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Peter Zijlstra 1 year, 1 month ago

On Wed, Dec 11, 2024 at 07:01:16PM -0800, Suren Baghdasaryan wrote:

> > > > I think your proposal should work. Let me try to code it and see if
> > > > something breaks.
> 
> Ok, I tried it out and things are a bit more complex:
> 1. We should allow write-locking a detached VMA, IOW vma_start_write()
> can be called when vm_refcnt is 0. 

This sounds dodgy, refcnt being zero basically means the object is dead
and you shouldn't be touching it no more. Where does this happen and
why?

Notably, it being 0 means it is no longer in the mas tree and can't be
found anymore.

> 2. Adding 0x80000000 saturates refcnt, so I have to use a lower bit
> 0x40000000 to denote writers.

I'm confused, what? We're talking about atomic_t, right?

> 3. Currently vma_mark_attached() can be called on an already attached
> VMA. With vma->detached being a separate attribute that works fine but
> when we combine it with the vm_lock things break (extra attach would
> leak into lock count). I'll see if I can catch all the cases when we
> do this and clean them up (not call vma_mark_attached() when not
> necessary).

Right, I hadn't looked at that thing in detail, that sounds like it
needs a wee cleanup like you suggest.

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Thu, Dec 12, 2024 at 1:17 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Dec 11, 2024 at 07:01:16PM -0800, Suren Baghdasaryan wrote:
>
> > > > > I think your proposal should work. Let me try to code it and see if
> > > > > something breaks.
> >
> > Ok, I tried it out and things are a bit more complex:
> > 1. We should allow write-locking a detached VMA, IOW vma_start_write()
> > can be called when vm_refcnt is 0.
>
> This sounds dodgy, refcnt being zero basically means the object is dead
> and you shouldn't be touching it no more. Where does this happen and
> why?
>
> Notably, it being 0 means it is no longer in the mas tree and can't be
> found anymore.

It happens when a newly created vma that was not yet attached
(vma->vm_refcnt = 0) is write-locked before being added into the vma
tree. For example:
mmap()
  mmap_write_lock()
  vma = vm_area_alloc() // vma->vm_refcnt = 0 (detached)
  //vma attributes are initialized
  vma_start_write() // write 0x8000 0001 into vma->vm_refcnt
  mas_store_gfp()
  vma_mark_attached()
  mmap_write_lock() // vma_end_write_all()

In this scenario, we write-lock the VMA before adding it into the tree
to prevent readers (pagefaults) from using it until we drop the
mmap_write_lock(). In your proposal, the first thing vma_start_write()
does is add(0x8000'0001) and that will trigger a warning.
For now instead of add(0x8000'0001) I can play this game to avoid the warning:

if (refcount_inc_not_zero(&vma->vm_refcnt))
    refcount_add(0x80000000, &vma->vm_refcnt);
else
    refcount_set(&vma->vm_refcnt, 0x80000001);

this refcount_set() works because vma with vm_refcnt==0 could not be
found by readers. I'm not sure this will still work when we add
TYPESAFE_BY_RCU and introduce vma reuse possibility.

>
> > 2. Adding 0x80000000 saturates refcnt, so I have to use a lower bit
> > 0x40000000 to denote writers.
>
> I'm confused, what? We're talking about atomic_t, right?

I thought you suggested using refcount_t. According to
https://elixir.bootlin.com/linux/v6.13-rc2/source/include/linux/refcount.h#L22
valid values would be [0..0x7fff_ffff] and 0x80000000 is outside of
that range. What am I missing?

>
> > 3. Currently vma_mark_attached() can be called on an already attached
> > VMA. With vma->detached being a separate attribute that works fine but
> > when we combine it with the vm_lock things break (extra attach would
> > leak into lock count). I'll see if I can catch all the cases when we
> > do this and clean them up (not call vma_mark_attached() when not
> > necessary).
>
> Right, I hadn't looked at that thing in detail, that sounds like it
> needs a wee cleanup like you suggest.

Yes, I'll embark on that today. Will see how much of a problem that is.

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Peter Zijlstra 1 year, 1 month ago

On Thu, Dec 12, 2024 at 06:17:44AM -0800, Suren Baghdasaryan wrote:
> On Thu, Dec 12, 2024 at 1:17 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Dec 11, 2024 at 07:01:16PM -0800, Suren Baghdasaryan wrote:
> >
> > > > > > I think your proposal should work. Let me try to code it and see if
> > > > > > something breaks.
> > >
> > > Ok, I tried it out and things are a bit more complex:
> > > 1. We should allow write-locking a detached VMA, IOW vma_start_write()
> > > can be called when vm_refcnt is 0.
> >
> > This sounds dodgy, refcnt being zero basically means the object is dead
> > and you shouldn't be touching it no more. Where does this happen and
> > why?
> >
> > Notably, it being 0 means it is no longer in the mas tree and can't be
> > found anymore.
> 
> It happens when a newly created vma that was not yet attached
> (vma->vm_refcnt = 0) is write-locked before being added into the vma
> tree. For example:
> mmap()
>   mmap_write_lock()
>   vma = vm_area_alloc() // vma->vm_refcnt = 0 (detached)
>   //vma attributes are initialized
>   vma_start_write() // write 0x8000 0001 into vma->vm_refcnt
>   mas_store_gfp()
>   vma_mark_attached()
>   mmap_write_lock() // vma_end_write_all()
> 
> In this scenario, we write-lock the VMA before adding it into the tree
> to prevent readers (pagefaults) from using it until we drop the
> mmap_write_lock().

Ah, but you can do that by setting vma->vm_lock_seq and setting the ref
to 1 before adding it (its not visible before adding anyway, so nobody
cares).

You'll note that the read thing checks both the msb (or other high bit
depending on the actual type you're going with) *and* the seq. That is
needed because we must not set the sequence number before all existing
readers are drained, but since this is pre-add that is not a concern.

> > > 2. Adding 0x80000000 saturates refcnt, so I have to use a lower bit
> > > 0x40000000 to denote writers.
> >
> > I'm confused, what? We're talking about atomic_t, right?
> 
> I thought you suggested using refcount_t. According to
> https://elixir.bootlin.com/linux/v6.13-rc2/source/include/linux/refcount.h#L22
> valid values would be [0..0x7fff_ffff] and 0x80000000 is outside of
> that range. What am I missing?

I was talking about atomic_t :-), but yeah, maybe we can use refcount_t,
but I hadn't initially considered that.

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Fri, Dec 13, 2024 at 1:22 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Dec 12, 2024 at 06:17:44AM -0800, Suren Baghdasaryan wrote:
> > On Thu, Dec 12, 2024 at 1:17 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Dec 11, 2024 at 07:01:16PM -0800, Suren Baghdasaryan wrote:
> > >
> > > > > > > I think your proposal should work. Let me try to code it and see if
> > > > > > > something breaks.
> > > >
> > > > Ok, I tried it out and things are a bit more complex:
> > > > 1. We should allow write-locking a detached VMA, IOW vma_start_write()
> > > > can be called when vm_refcnt is 0.
> > >
> > > This sounds dodgy, refcnt being zero basically means the object is dead
> > > and you shouldn't be touching it no more. Where does this happen and
> > > why?
> > >
> > > Notably, it being 0 means it is no longer in the mas tree and can't be
> > > found anymore.
> >
> > It happens when a newly created vma that was not yet attached
> > (vma->vm_refcnt = 0) is write-locked before being added into the vma
> > tree. For example:
> > mmap()
> >   mmap_write_lock()
> >   vma = vm_area_alloc() // vma->vm_refcnt = 0 (detached)
> >   //vma attributes are initialized
> >   vma_start_write() // write 0x8000 0001 into vma->vm_refcnt
> >   mas_store_gfp()
> >   vma_mark_attached()
> >   mmap_write_lock() // vma_end_write_all()
> >
> > In this scenario, we write-lock the VMA before adding it into the tree
> > to prevent readers (pagefaults) from using it until we drop the
> > mmap_write_lock().
>
> Ah, but you can do that by setting vma->vm_lock_seq and setting the ref
> to 1 before adding it (its not visible before adding anyway, so nobody
> cares).
>
> You'll note that the read thing checks both the msb (or other high bit
> depending on the actual type you're going with) *and* the seq. That is
> needed because we must not set the sequence number before all existing
> readers are drained, but since this is pre-add that is not a concern.

Yes, I realized that there is an interesting rule that help in this
case: vma_mark_attached() is called only on a newly created vma which
can't be found by anyone else or the vma is already locked, therefore
vma_start_write() can never race with vma_mark_attached(). Considering
that vma_mark_attached() is the only one that can raise vma->vm_refcnt
from 0, vma_start_write() can set vma->vm_lock_seq without modifying
vma->vm_refcnt at all if vma->vm_refcnt==0.

>
> > > > 2. Adding 0x80000000 saturates refcnt, so I have to use a lower bit
> > > > 0x40000000 to denote writers.
> > >
> > > I'm confused, what? We're talking about atomic_t, right?
> >
> > I thought you suggested using refcount_t. According to
> > https://elixir.bootlin.com/linux/v6.13-rc2/source/include/linux/refcount.h#L22
> > valid values would be [0..0x7fff_ffff] and 0x80000000 is outside of
> > that range. What am I missing?
>
> I was talking about atomic_t :-), but yeah, maybe we can use refcount_t,
> but I hadn't initially considered that.

My current implementation is using refcount_t but I might have to
change that to atomic_t to handle vm_refcnt overflows in
vma_start_read().

>

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Thu, Dec 12, 2024 at 6:17 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Dec 12, 2024 at 1:17 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Dec 11, 2024 at 07:01:16PM -0800, Suren Baghdasaryan wrote:
> >
> > > > > > I think your proposal should work. Let me try to code it and see if
> > > > > > something breaks.
> > >
> > > Ok, I tried it out and things are a bit more complex:
> > > 1. We should allow write-locking a detached VMA, IOW vma_start_write()
> > > can be called when vm_refcnt is 0.
> >
> > This sounds dodgy, refcnt being zero basically means the object is dead
> > and you shouldn't be touching it no more. Where does this happen and
> > why?
> >
> > Notably, it being 0 means it is no longer in the mas tree and can't be
> > found anymore.
>
> It happens when a newly created vma that was not yet attached
> (vma->vm_refcnt = 0) is write-locked before being added into the vma
> tree. For example:
> mmap()
>   mmap_write_lock()
>   vma = vm_area_alloc() // vma->vm_refcnt = 0 (detached)
>   //vma attributes are initialized
>   vma_start_write() // write 0x8000 0001 into vma->vm_refcnt
>   mas_store_gfp()
>   vma_mark_attached()
>   mmap_write_lock() // vma_end_write_all()

s/mmap_write_lock()/mmap_write_unlock()
>
> In this scenario, we write-lock the VMA before adding it into the tree
> to prevent readers (pagefaults) from using it until we drop the
> mmap_write_lock(). In your proposal, the first thing vma_start_write()
> does is add(0x8000'0001) and that will trigger a warning.
> For now instead of add(0x8000'0001) I can play this game to avoid the warning:
>
> if (refcount_inc_not_zero(&vma->vm_refcnt))
>     refcount_add(0x80000000, &vma->vm_refcnt);
> else
>     refcount_set(&vma->vm_refcnt, 0x80000001);
>
> this refcount_set() works because vma with vm_refcnt==0 could not be
> found by readers. I'm not sure this will still work when we add
> TYPESAFE_BY_RCU and introduce vma reuse possibility.
>
> >
> > > 2. Adding 0x80000000 saturates refcnt, so I have to use a lower bit
> > > 0x40000000 to denote writers.
> >
> > I'm confused, what? We're talking about atomic_t, right?
>
> I thought you suggested using refcount_t. According to
> https://elixir.bootlin.com/linux/v6.13-rc2/source/include/linux/refcount.h#L22
> valid values would be [0..0x7fff_ffff] and 0x80000000 is outside of
> that range. What am I missing?
>
> >
> > > 3. Currently vma_mark_attached() can be called on an already attached
> > > VMA. With vma->detached being a separate attribute that works fine but
> > > when we combine it with the vm_lock things break (extra attach would
> > > leak into lock count). I'll see if I can catch all the cases when we
> > > do this and clean them up (not call vma_mark_attached() when not
> > > necessary).
> >
> > Right, I hadn't looked at that thing in detail, that sounds like it
> > needs a wee cleanup like you suggest.
>
> Yes, I'll embark on that today. Will see how much of a problem that is.

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Thu, Dec 12, 2024 at 6:19 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Dec 12, 2024 at 6:17 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Thu, Dec 12, 2024 at 1:17 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Dec 11, 2024 at 07:01:16PM -0800, Suren Baghdasaryan wrote:
> > >
> > > > > > > I think your proposal should work. Let me try to code it and see if
> > > > > > > something breaks.
> > > >
> > > > Ok, I tried it out and things are a bit more complex:
> > > > 1. We should allow write-locking a detached VMA, IOW vma_start_write()
> > > > can be called when vm_refcnt is 0.
> > >
> > > This sounds dodgy, refcnt being zero basically means the object is dead
> > > and you shouldn't be touching it no more. Where does this happen and
> > > why?
> > >
> > > Notably, it being 0 means it is no longer in the mas tree and can't be
> > > found anymore.
> >
> > It happens when a newly created vma that was not yet attached
> > (vma->vm_refcnt = 0) is write-locked before being added into the vma
> > tree. For example:
> > mmap()
> >   mmap_write_lock()
> >   vma = vm_area_alloc() // vma->vm_refcnt = 0 (detached)
> >   //vma attributes are initialized
> >   vma_start_write() // write 0x8000 0001 into vma->vm_refcnt
> >   mas_store_gfp()
> >   vma_mark_attached()
> >   mmap_write_lock() // vma_end_write_all()
>
> s/mmap_write_lock()/mmap_write_unlock()
> >
> > In this scenario, we write-lock the VMA before adding it into the tree
> > to prevent readers (pagefaults) from using it until we drop the
> > mmap_write_lock(). In your proposal, the first thing vma_start_write()
> > does is add(0x8000'0001) and that will trigger a warning.
> > For now instead of add(0x8000'0001) I can play this game to avoid the warning:
> >
> > if (refcount_inc_not_zero(&vma->vm_refcnt))
> >     refcount_add(0x80000000, &vma->vm_refcnt);
> > else
> >     refcount_set(&vma->vm_refcnt, 0x80000001);
> >
> > this refcount_set() works because vma with vm_refcnt==0 could not be
> > found by readers. I'm not sure this will still work when we add
> > TYPESAFE_BY_RCU and introduce vma reuse possibility.
> >
> > >
> > > > 2. Adding 0x80000000 saturates refcnt, so I have to use a lower bit
> > > > 0x40000000 to denote writers.
> > >
> > > I'm confused, what? We're talking about atomic_t, right?
> >
> > I thought you suggested using refcount_t. According to
> > https://elixir.bootlin.com/linux/v6.13-rc2/source/include/linux/refcount.h#L22
> > valid values would be [0..0x7fff_ffff] and 0x80000000 is outside of
> > that range. What am I missing?
> >
> > >
> > > > 3. Currently vma_mark_attached() can be called on an already attached
> > > > VMA. With vma->detached being a separate attribute that works fine but
> > > > when we combine it with the vm_lock things break (extra attach would
> > > > leak into lock count). I'll see if I can catch all the cases when we
> > > > do this and clean them up (not call vma_mark_attached() when not
> > > > necessary).
> > >
> > > Right, I hadn't looked at that thing in detail, that sounds like it
> > > needs a wee cleanup like you suggest.
> >
> > Yes, I'll embark on that today. Will see how much of a problem that is.

Ok, I think I was able to implement this in a way that ignores
duplicate attach/detach calls. One issue that I hit and don't know a
good way to fix is a circular dependency in the header files once I
try adding rcuwait into mm_struct. Once I include rcuwait.h into
mm_types.h leads to the following cycle:

In file included from ./arch/x86/include/asm/uaccess.h:12,
                 from ./include/linux/uaccess.h:12,
                 from ./include/linux/sched/task.h:13,
                 from ./include/linux/sched/signal.h:9,
                 from ./include/linux/rcuwait.h:6,
                 from ./include/linux/mm_types.h:22,

./arch/x86/include/asm/uaccess.h includes mm_types.h. But in fact
there is a shorter cycle:

rcuwait.h needs signal.h since it uses uses inlined signal_pending_state()
signal.h needs mm_types.h since it uses vm_fault_t

The way I worked around it for now is by removing signal.h include
from rcuwait.h and wrapping signal_pending_state() into a non-inlined
function so I can forward-declare it. That requires adding
linux/sched/signal.h or linux/sched/task.h into many other places:

 arch/x86/coco/sev/core.c                                     | 1 +
 arch/x86/kernel/fpu/xstate.c                                 | 1 +
 block/blk-lib.c                                              | 1 +
 block/ioctl.c                                                | 1 +
 drivers/accel/qaic/qaic_control.c                            | 1 +
 drivers/base/firmware_loader/main.c                          | 1 +
 drivers/block/ublk_drv.c                                     | 1 +
 drivers/crypto/ccp/sev-dev.c                                 | 1 +
 drivers/dma-buf/heaps/cma_heap.c                             | 1 +
 drivers/dma-buf/heaps/system_heap.c                          | 1 +
 drivers/gpio/gpio-sloppy-logic-analyzer.c                    | 1 +
 drivers/gpu/drm/etnaviv/etnaviv_gpu.c                        | 1 +
 drivers/iommu/iommufd/ioas.c                                 | 1 +
 drivers/iommu/iommufd/pages.c                                | 1 +
 .../staging/vc04_services/interface/vchiq_arm/vchiq_dev.c    | 1 +
 drivers/tty/n_tty.c                                          | 1 +
 fs/bcachefs/fs-io-buffered.c                                 | 1 +
 fs/bcachefs/journal_reclaim.c                                | 1 +
 fs/bcachefs/thread_with_file.c                               | 1 +
 fs/bcachefs/util.c                                           | 1 +
 fs/btrfs/defrag.h                                            | 1 +
 fs/btrfs/fiemap.c                                            | 2 ++
 fs/btrfs/free-space-cache.h                                  | 1 +
 fs/btrfs/reflink.c                                           | 1 +
 fs/exfat/balloc.c                                            | 1 +
 fs/gfs2/ops_fstype.c                                         | 1 +
 fs/kernel_read_file.c                                        | 1 +
 fs/netfs/buffered_write.c                                    | 1 +
 fs/zonefs/file.c                                             | 1 +
 include/linux/fs.h                                           | 2 +-
 include/linux/rcuwait.h                                      | 4 ++--
 io_uring/io_uring.h                                          | 1 +
 kernel/dma/map_benchmark.c                                   | 1 +
 kernel/futex/core.c                                          | 1 +
 kernel/futex/pi.c                                            | 1 +
 kernel/rcu/update.c                                          | 5 +++++
 kernel/task_work.c                                           | 1 +
 lib/kunit/user_alloc.c                                       | 1 +
 mm/damon/vaddr.c                                             | 1 +
 mm/memcontrol-v1.c                                           | 1 +
 mm/shrinker_debug.c                                          | 1 +
 net/dns_resolver/dns_key.c                                   | 1 +
 42 files changed, 48 insertions(+), 3 deletions(-)

I'm not sure if this is the best way to deal with this circular
dependency. Any other ideas?

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Peter Zijlstra 1 year, 1 month ago

On Thu, Dec 12, 2024 at 08:48:52PM -0800, Suren Baghdasaryan wrote:

> I'm not sure if this is the best way to deal with this circular
> dependency. Any other ideas?

Move the waiting into an out-of-line slow-path?

  if (atomic_read(&vma->refcnt) != 2)
    __vma_write_start_wait(mm, vma);

or somesuch..

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Fri, Dec 13, 2024 at 1:57 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Dec 12, 2024 at 08:48:52PM -0800, Suren Baghdasaryan wrote:
>
> > I'm not sure if this is the best way to deal with this circular
> > dependency. Any other ideas?
>
> Move the waiting into an out-of-line slow-path?
>
>   if (atomic_read(&vma->refcnt) != 2)
>     __vma_write_start_wait(mm, vma);

The problem is not a function but the addition of struct rcuwait into
struct mm_struct. The type should be known and I don't want to add it
via a pointer to avoid extra dereferences. After thinking it over, I
think the simplest way would be to move the definition of struct
rcuwait into a separate header and include that header in mm_types.h.
That, with some uninlining like you suggested, should do the trick.

>
> or somesuch..

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Peter Zijlstra 1 year, 1 month ago

On Fri, Dec 13, 2024 at 09:45:33AM -0800, Suren Baghdasaryan wrote:
> On Fri, Dec 13, 2024 at 1:57 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Thu, Dec 12, 2024 at 08:48:52PM -0800, Suren Baghdasaryan wrote:
> >
> > > I'm not sure if this is the best way to deal with this circular
> > > dependency. Any other ideas?
> >
> > Move the waiting into an out-of-line slow-path?
> >
> >   if (atomic_read(&vma->refcnt) != 2)
> >     __vma_write_start_wait(mm, vma);
> 
> The problem is not a function but the addition of struct rcuwait into

Durr, in my brain that was a struct task_struct pointer, totally forgot
we had a type there. Yeah, as Willy says, move it to compiler_types.h or
somesuch.

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Fri, Dec 13, 2024 at 10:35 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Dec 13, 2024 at 09:45:33AM -0800, Suren Baghdasaryan wrote:
> > On Fri, Dec 13, 2024 at 1:57 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Thu, Dec 12, 2024 at 08:48:52PM -0800, Suren Baghdasaryan wrote:
> > >
> > > > I'm not sure if this is the best way to deal with this circular
> > > > dependency. Any other ideas?
> > >
> > > Move the waiting into an out-of-line slow-path?
> > >
> > >   if (atomic_read(&vma->refcnt) != 2)
> > >     __vma_write_start_wait(mm, vma);
> >
> > The problem is not a function but the addition of struct rcuwait into
>
> Durr, in my brain that was a struct task_struct pointer, totally forgot
> we had a type there. Yeah, as Willy says, move it to compiler_types.h or
> somesuch.

Got it. Thank you both!

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 1 month ago

On Fri, Dec 13, 2024 at 10:37 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Dec 13, 2024 at 10:35 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Fri, Dec 13, 2024 at 09:45:33AM -0800, Suren Baghdasaryan wrote:
> > > On Fri, Dec 13, 2024 at 1:57 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Thu, Dec 12, 2024 at 08:48:52PM -0800, Suren Baghdasaryan wrote:
> > > >
> > > > > I'm not sure if this is the best way to deal with this circular
> > > > > dependency. Any other ideas?
> > > >
> > > > Move the waiting into an out-of-line slow-path?
> > > >
> > > >   if (atomic_read(&vma->refcnt) != 2)
> > > >     __vma_write_start_wait(mm, vma);
> > >
> > > The problem is not a function but the addition of struct rcuwait into
> >
> > Durr, in my brain that was a struct task_struct pointer, totally forgot
> > we had a type there. Yeah, as Willy says, move it to compiler_types.h or
> > somesuch.
>
> Got it. Thank you both!

I posted the implementation of the mechanism we discussed here at:
https://lore.kernel.org/all/20241216192419.2970941-11-surenb@google.com/
It ended up becoming a large patchset but most patches are very small
logical units.
Feedback is greatly appreciated!

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Matthew Wilcox 1 year, 1 month ago

On Fri, Dec 13, 2024 at 09:45:33AM -0800, Suren Baghdasaryan wrote:
> think the simplest way would be to move the definition of struct
> rcuwait into a separate header and include that header in mm_types.h.
> That, with some uninlining like you suggested, should do the trick.

I'd suggest struct rcuwait should live in types.h alongside list_head,
rcuref and so on.

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Davidlohr Bueso 1 year, 2 months ago

On Mon, 11 Nov 2024, Suren Baghdasaryan wrote:

>@@ -787,7 +893,10 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
>	 */
>	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
>-	up_write(&vma->vm_lock.lock);
>+	/* Write barrier to ensure vm_lock_seq change is visible before count */
>+	smp_wmb();
>+	rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
>+	atomic_set(&vma->vm_lock.count, VMA_LOCK_UNLOCKED);

Too many barriers here. Just do atomic_set_release and remove that
smp_wmb(). And what you are doing is really ensuring nothing leaks out
of the critical region, so that comment should also be more generic.

I would expect regression testing to catch this sort of thing.

...

> #ifdef CONFIG_PER_VMA_LOCK
>+		struct wait_queue_head vma_writer_wait;

You might want to use rcuwait here instead, which is much more
optimized for the single waiter requirement vmas have.

Thanks,
Davidlohr

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 2 months ago

On Mon, Nov 11, 2024 at 4:35 PM Davidlohr Bueso <dave@stgolabs.net> wrote:
>
> On Mon, 11 Nov 2024, Suren Baghdasaryan wrote:
>
> >@@ -787,7 +893,10 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> >        * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> >        */
> >       WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> >-      up_write(&vma->vm_lock.lock);
> >+      /* Write barrier to ensure vm_lock_seq change is visible before count */
> >+      smp_wmb();
> >+      rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
> >+      atomic_set(&vma->vm_lock.count, VMA_LOCK_UNLOCKED);
>
> Too many barriers here. Just do atomic_set_release and remove that
> smp_wmb(). And what you are doing is really ensuring nothing leaks out
> of the critical region, so that comment should also be more generic.

Uh, yes. I missed that.

>
> I would expect regression testing to catch this sort of thing.

Well, it's in vma_start_write(), which is in the write-locking path.
Maybe that's why it's not very visible.

>
> ...
>
> > #ifdef CONFIG_PER_VMA_LOCK
> >+              struct wait_queue_head vma_writer_wait;
>
> You might want to use rcuwait here instead, which is much more
> optimized for the single waiter requirement vmas have.

Thanks for the suggestion! I'll give it a try.

>
> Thanks,
> Davidlohr

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Andrew Morton 1 year, 2 months ago

On Mon, 11 Nov 2024 12:55:05 -0800 Suren Baghdasaryan <surenb@google.com> wrote:

>  include/linux/mm.h        | 142 ++++++++++++++++++++++++++++++++++----

There's soooo much inlining happening here.  Perhaps we should have a
"default to uninlined, unless a benefit is demonstrated" guideline?

Re: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock

Posted by Suren Baghdasaryan 1 year, 2 months ago

On Mon, Nov 11, 2024 at 4:07 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Mon, 11 Nov 2024 12:55:05 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
>
> >  include/linux/mm.h        | 142 ++++++++++++++++++++++++++++++++++----
>
> There's soooo much inlining happening here.  Perhaps we should have a
> "default to uninlined, unless a benefit is demonstrated" guideline?

Implementing this lock as a separate type as you suggested should help
with the amount of inlining here. I'll try to keep it sane once I
finalize that.