[PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count

Suren Baghdasaryan posted 16 patches 11 months, 1 week ago
[PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Suren Baghdasaryan 11 months, 1 week ago
rw_semaphore is a sizable structure of 40 bytes and consumes
considerable space for each vm_area_struct. However vma_lock has
two important specifics which can be used to replace rw_semaphore
with a simpler structure:
1. Readers never wait. They try to take the vma_lock and fall back to
mmap_lock if that fails.
2. Only one writer at a time will ever try to write-lock a vma_lock
because writers first take mmap_lock in write mode.
Because of these requirements, full rw_semaphore functionality is not
needed and we can replace rw_semaphore and the vma->detached flag with
a refcount (vm_refcnt).
When vma is in detached state, vm_refcnt is 0 and only a call to
vma_mark_attached() can take it out of this state. Note that unlike
before, now we enforce both vma_mark_attached() and vma_mark_detached()
to be done only after vma has been write-locked. vma_mark_attached()
changes vm_refcnt to 1 to indicate that it has been attached to the vma
tree. When a reader takes read lock, it increments vm_refcnt, unless the
top usable bit of vm_refcnt (0x40000000) is set, indicating presence of
a writer. When writer takes write lock, it sets the top usable bit to
indicate its presence. If there are readers, writer will wait using newly
introduced mm->vma_writer_wait. Since all writers take mmap_lock in write
mode first, there can be only one writer at a time. The last reader to
release the lock will signal the writer to wake up.
refcount might overflow if there are many competing readers, in which case
read-locking will fail. Readers are expected to handle such failures.
In summary:
1. all readers increment the vm_refcnt;
2. writer sets top usable (writer) bit of vm_refcnt;
3. readers cannot increment the vm_refcnt if the writer bit is set;
4. in the presence of readers, writer must wait for the vm_refcnt to drop
to 1 (ignoring the writer bit), indicating an attached vma with no readers;
5. vm_refcnt overflow is handled by the readers.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h               | 98 ++++++++++++++++++++++----------
 include/linux/mm_types.h         | 22 ++++---
 kernel/fork.c                    | 13 ++---
 mm/init-mm.c                     |  1 +
 mm/memory.c                      | 77 +++++++++++++++++++++----
 tools/testing/vma/linux/atomic.h |  5 ++
 tools/testing/vma/vma_internal.h | 66 +++++++++++----------
 7 files changed, 193 insertions(+), 89 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc8067de41c5..ec7c064792ff 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -32,6 +32,7 @@
 #include <linux/memremap.h>
 #include <linux/slab.h>
 #include <linux/cacheinfo.h>
+#include <linux/rcuwait.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -697,12 +698,41 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_lock_init(struct vm_area_struct *vma)
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
 {
-	init_rwsem(&vma->vm_lock.lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	static struct lock_class_key lockdep_key;
+
+	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+	if (reset_refcnt)
+		refcount_set(&vma->vm_refcnt, 0);
 	vma->vm_lock_seq = UINT_MAX;
 }
 
+static inline bool is_vma_writer_only(int refcnt)
+{
+	/*
+	 * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
+	 * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
+	 * a detached vma happens only in vma_mark_detached() and is a rare
+	 * case, therefore most of the time there will be no unnecessary wakeup.
+	 */
+	return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
+}
+
+static inline void vma_refcount_put(struct vm_area_struct *vma)
+{
+	int oldcnt;
+
+	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
+		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+
+		if (is_vma_writer_only(oldcnt - 1))
+			rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
+	}
+}
+
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -710,6 +740,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma)
  */
 static inline bool vma_start_read(struct vm_area_struct *vma)
 {
+	int oldcnt;
+
 	/*
 	 * Check before locking. A race might cause false locked result.
 	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
@@ -720,13 +752,19 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
+	/*
+	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited() will fail
+	 * because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+	 */
+	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
+						      VMA_REF_LIMIT)))
 		return false;
 
+	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
 	/*
-	 * Overflow might produce false locked result.
+	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
 	 * False unlocked result is impossible because we modify and check
-	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
 	 * modification invalidates all existing locks.
 	 *
 	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
@@ -735,9 +773,10 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
 	 * This pairs with RELEASE semantics in vma_end_write_all().
 	 */
 	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->vm_lock.lock);
+		vma_refcount_put(vma);
 		return false;
 	}
+
 	return true;
 }
 
@@ -749,8 +788,14 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
  */
 static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
 {
+	int oldcnt;
+
 	mmap_assert_locked(vma->vm_mm);
-	down_read_nested(&vma->vm_lock.lock, subclass);
+	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
+						      VMA_REF_LIMIT)))
+		return false;
+
+	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
 	return true;
 }
 
@@ -762,15 +807,13 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int
  */
 static inline bool vma_start_read_locked(struct vm_area_struct *vma)
 {
-	mmap_assert_locked(vma->vm_mm);
-	down_read(&vma->vm_lock.lock);
-	return true;
+	return vma_start_read_locked_nested(vma, 0);
 }
 
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->vm_lock.lock);
+	vma_refcount_put(vma);
 	rcu_read_unlock();
 }
 
@@ -813,36 +856,33 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 
 static inline void vma_assert_locked(struct vm_area_struct *vma)
 {
-	if (!rwsem_is_locked(&vma->vm_lock.lock))
+	if (refcount_read(&vma->vm_refcnt) <= 1)
 		vma_assert_write_locked(vma);
 }
 
+/*
+ * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
+ * assertions should be made either under mmap_write_lock or when the object
+ * has been isolated under mmap_write_lock, ensuring no competing writers.
+ */
 static inline void vma_assert_attached(struct vm_area_struct *vma)
 {
-	VM_BUG_ON_VMA(vma->detached, vma);
+	VM_BUG_ON_VMA(!refcount_read(&vma->vm_refcnt), vma);
 }
 
 static inline void vma_assert_detached(struct vm_area_struct *vma)
 {
-	VM_BUG_ON_VMA(!vma->detached, vma);
+	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt), vma);
 }
 
 static inline void vma_mark_attached(struct vm_area_struct *vma)
 {
-	vma->detached = false;
-}
-
-static inline void vma_mark_detached(struct vm_area_struct *vma)
-{
-	/* When detaching vma should be write-locked */
 	vma_assert_write_locked(vma);
-	vma->detached = true;
+	vma_assert_detached(vma);
+	refcount_set(&vma->vm_refcnt, 1);
 }
 
-static inline bool is_vma_detached(struct vm_area_struct *vma)
-{
-	return vma->detached;
-}
+void vma_mark_detached(struct vm_area_struct *vma);
 
 static inline void release_fault_lock(struct vm_fault *vmf)
 {
@@ -865,7 +905,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 
 #else /* CONFIG_PER_VMA_LOCK */
 
-static inline void vma_lock_init(struct vm_area_struct *vma) {}
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
 static inline bool vma_start_read(struct vm_area_struct *vma)
 		{ return false; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
@@ -908,12 +948,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &vma_dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
-#ifdef CONFIG_PER_VMA_LOCK
-	/* vma is not locked, can't use vma_mark_detached() */
-	vma->detached = true;
-#endif
 	vma_numab_state_init(vma);
-	vma_lock_init(vma);
+	vma_lock_init(vma, false);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0ca63dee1902..2d83d79d1899 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -19,6 +19,7 @@
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
 #include <linux/percpu_counter.h>
+#include <linux/types.h>
 
 #include <asm/mmu.h>
 
@@ -637,9 +638,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
 }
 #endif
 
-struct vma_lock {
-	struct rw_semaphore lock;
-};
+#define VMA_LOCK_OFFSET	0x40000000
+#define VMA_REF_LIMIT	(VMA_LOCK_OFFSET - 1)
 
 struct vma_numab_state {
 	/*
@@ -717,19 +717,13 @@ struct vm_area_struct {
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
-	/*
-	 * Flag to indicate areas detached from the mm->mm_mt tree.
-	 * Unstable RCU readers are allowed to read this.
-	 */
-	bool detached;
-
 	/*
 	 * Can only be written (using WRITE_ONCE()) while holding both:
 	 *  - mmap_lock (in write mode)
-	 *  - vm_lock->lock (in write mode)
+	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
 	 * Can be read reliably while holding one of:
 	 *  - mmap_lock (in read or write mode)
-	 *  - vm_lock->lock (in read or write mode)
+	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
 	 * while holding nothing (except RCU to keep the VMA struct allocated).
 	 *
@@ -792,7 +786,10 @@ struct vm_area_struct {
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 #ifdef CONFIG_PER_VMA_LOCK
 	/* Unstable RCU readers are allowed to read this. */
-	struct vma_lock vm_lock ____cacheline_aligned_in_smp;
+	refcount_t vm_refcnt ____cacheline_aligned_in_smp;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map vmlock_dep_map;
+#endif
 #endif
 } __randomize_layout;
 
@@ -927,6 +924,7 @@ struct mm_struct {
 					  * by mmlist_lock
 					  */
 #ifdef CONFIG_PER_VMA_LOCK
+		struct rcuwait vma_writer_wait;
 		/*
 		 * This field has lock-like semantics, meaning it is sometimes
 		 * accessed with ACQUIRE/RELEASE semantics.
diff --git a/kernel/fork.c b/kernel/fork.c
index d4c75428ccaf..9d9275783cf8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -463,12 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	 * will be reinitialized.
 	 */
 	data_race(memcpy(new, orig, sizeof(*new)));
-	vma_lock_init(new);
+	vma_lock_init(new, true);
 	INIT_LIST_HEAD(&new->anon_vma_chain);
-#ifdef CONFIG_PER_VMA_LOCK
-	/* vma is not locked, can't use vma_mark_detached() */
-	new->detached = true;
-#endif
 	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
 
@@ -477,6 +473,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
+	/* The vma should be detached while being destroyed. */
+	vma_assert_detached(vma);
 	vma_numab_state_free(vma);
 	free_anon_vma_name(vma);
 	kmem_cache_free(vm_area_cachep, vma);
@@ -488,8 +486,6 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
 						  vm_rcu);
 
-	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -1223,6 +1219,9 @@ static inline void mmap_init_lock(struct mm_struct *mm)
 {
 	init_rwsem(&mm->mmap_lock);
 	mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+	rcuwait_init(&mm->vma_writer_wait);
+#endif
 }
 
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 6af3ad675930..4600e7605cab 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,6 +40,7 @@ struct mm_struct init_mm = {
 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 #ifdef CONFIG_PER_VMA_LOCK
+	.vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
 	.mm_lock_seq	= SEQCNT_ZERO(init_mm.mm_lock_seq),
 #endif
 	.user_ns	= &init_user_ns,
diff --git a/mm/memory.c b/mm/memory.c
index 26569a44fb5c..fe1b47c34052 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6370,9 +6370,41 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
 #endif
 
 #ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, unsigned int tgt_refcnt)
+{
+	/*
+	 * If vma is detached then only vma_mark_attached() can raise the
+	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+	 */
+	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+		return false;
+
+	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+		   TASK_UNINTERRUPTIBLE);
+	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+	return true;
+}
+
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
+
 void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
 {
-	down_write(&vma->vm_lock.lock);
+	bool locked;
+
+	/*
+	 * __vma_enter_locked() returns false immediately if the vma is not
+	 * attached, otherwise it waits until refcnt is (VMA_LOCK_OFFSET + 1)
+	 * indicating that vma is attached with no readers.
+	 */
+	locked = __vma_enter_locked(vma, VMA_LOCK_OFFSET + 1);
+
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
 	 * from the early lockless pessimistic check in vma_start_read().
@@ -6380,10 +6412,43 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
 	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
 	 */
 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-	up_write(&vma->vm_lock.lock);
+
+	if (locked) {
+		bool detached;
+
+		__vma_exit_locked(vma, &detached);
+		VM_BUG_ON_VMA(detached, vma); /* vma should remain attached */
+	}
 }
 EXPORT_SYMBOL_GPL(__vma_start_write);
 
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+	vma_assert_attached(vma);
+
+	/*
+	 * We are the only writer, so no need to use vma_refcount_put().
+	 * The condition below is unlikely because the vma has been already
+	 * write-locked and readers can increment vm_refcnt only temporarily
+	 * before they check vm_lock_seq, realize the vma is locked and drop
+	 * back the vm_refcnt. That is a narrow window for observing a raised
+	 * vm_refcnt.
+	 */
+	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+		/*
+		 * Wait until refcnt is VMA_LOCK_OFFSET => detached with no
+		 * readers.
+		 */
+		if (__vma_enter_locked(vma, VMA_LOCK_OFFSET)) {
+			bool detached;
+
+			__vma_exit_locked(vma, &detached);
+			VM_BUG_ON_VMA(!detached, vma);
+		}
+	}
+}
+
 /*
  * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
  * stable and not isolated. If the VMA is not found or is being modified the
@@ -6396,7 +6461,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 	struct vm_area_struct *vma;
 
 	rcu_read_lock();
-retry:
 	vma = mas_walk(&mas);
 	if (!vma)
 		goto inval;
@@ -6404,13 +6468,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 	if (!vma_start_read(vma))
 		goto inval;
 
-	/* Check if the VMA got isolated after we found it */
-	if (is_vma_detached(vma)) {
-		vma_end_read(vma);
-		count_vm_vma_lock_event(VMA_LOCK_MISS);
-		/* The area was replaced with another one */
-		goto retry;
-	}
 	/*
 	 * At this point, we have a stable reference to a VMA: The VMA is
 	 * locked and we know it hasn't already been isolated.
diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h
index 3e1b6adc027b..788c597c4fde 100644
--- a/tools/testing/vma/linux/atomic.h
+++ b/tools/testing/vma/linux/atomic.h
@@ -9,4 +9,9 @@
 #define atomic_set(x, y) uatomic_set(x, y)
 #define U8_MAX UCHAR_MAX
 
+#ifndef atomic_cmpxchg_relaxed
+#define  atomic_cmpxchg_relaxed		uatomic_cmpxchg
+#define  atomic_cmpxchg_release         uatomic_cmpxchg
+#endif /* atomic_cmpxchg_relaxed */
+
 #endif	/* _LINUX_ATOMIC_H */
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 47c8b03ffbbd..2ce032943861 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -25,7 +25,7 @@
 #include <linux/maple_tree.h>
 #include <linux/mm.h>
 #include <linux/rbtree.h>
-#include <linux/rwsem.h>
+#include <linux/refcount.h>
 
 extern unsigned long stack_guard_gap;
 #ifdef CONFIG_MMU
@@ -134,10 +134,6 @@ typedef __bitwise unsigned int vm_fault_t;
  */
 #define pr_warn_once pr_err
 
-typedef struct refcount_struct {
-	atomic_t refs;
-} refcount_t;
-
 struct kref {
 	refcount_t refcount;
 };
@@ -232,15 +228,12 @@ struct mm_struct {
 	unsigned long flags; /* Must use atomic bitops to access */
 };
 
-struct vma_lock {
-	struct rw_semaphore lock;
-};
-
-
 struct file {
 	struct address_space	*f_mapping;
 };
 
+#define VMA_LOCK_OFFSET	0x40000000
+
 struct vm_area_struct {
 	/* The first cache line has the info for VMA tree walking. */
 
@@ -268,16 +261,13 @@ struct vm_area_struct {
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
-	/* Flag to indicate areas detached from the mm->mm_mt tree */
-	bool detached;
-
 	/*
 	 * Can only be written (using WRITE_ONCE()) while holding both:
 	 *  - mmap_lock (in write mode)
-	 *  - vm_lock.lock (in write mode)
+	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
 	 * Can be read reliably while holding one of:
 	 *  - mmap_lock (in read or write mode)
-	 *  - vm_lock.lock (in read or write mode)
+	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
 	 * while holding nothing (except RCU to keep the VMA struct allocated).
 	 *
@@ -286,7 +276,6 @@ struct vm_area_struct {
 	 * slowpath.
 	 */
 	unsigned int vm_lock_seq;
-	struct vma_lock vm_lock;
 #endif
 
 	/*
@@ -339,6 +328,10 @@ struct vm_area_struct {
 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_PER_VMA_LOCK
+	/* Unstable RCU readers are allowed to read this. */
+	refcount_t vm_refcnt;
+#endif
 } __randomize_layout;
 
 struct vm_fault {};
@@ -463,23 +456,41 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
 	return mas_find(&vmi->mas, ULONG_MAX);
 }
 
-static inline void vma_lock_init(struct vm_area_struct *vma)
+/*
+ * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
+ * assertions should be made either under mmap_write_lock or when the object
+ * has been isolated under mmap_write_lock, ensuring no competing writers.
+ */
+static inline void vma_assert_attached(struct vm_area_struct *vma)
 {
-	init_rwsem(&vma->vm_lock.lock);
-	vma->vm_lock_seq = UINT_MAX;
+	VM_BUG_ON_VMA(!refcount_read(&vma->vm_refcnt), vma);
 }
 
-static inline void vma_mark_attached(struct vm_area_struct *vma)
+static inline void vma_assert_detached(struct vm_area_struct *vma)
 {
-	vma->detached = false;
+	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt), vma);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *);
+static inline void vma_mark_attached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+	vma_assert_detached(vma);
+	refcount_set(&vma->vm_refcnt, 1);
+}
+
 static inline void vma_mark_detached(struct vm_area_struct *vma)
 {
-	/* When detaching vma should be write-locked */
 	vma_assert_write_locked(vma);
-	vma->detached = true;
+	vma_assert_attached(vma);
+
+	/* We are the only writer, so no need to use vma_refcount_put(). */
+	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+		/*
+		 * Reader must have temporarily raised vm_refcnt but it will
+		 * drop it without using the vma since vma is write-locked.
+		 */
+	}
 }
 
 extern const struct vm_operations_struct vma_dummy_vm_ops;
@@ -492,9 +503,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &vma_dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
-	/* vma is not locked, can't use vma_mark_detached() */
-	vma->detached = true;
-	vma_lock_init(vma);
+	vma->vm_lock_seq = UINT_MAX;
 }
 
 static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
@@ -517,10 +526,9 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		return NULL;
 
 	memcpy(new, orig, sizeof(*new));
-	vma_lock_init(new);
+	refcount_set(&new->vm_refcnt, 0);
+	new->vm_lock_seq = UINT_MAX;
 	INIT_LIST_HEAD(&new->anon_vma_chain);
-	/* vma is not locked, can't use vma_mark_detached() */
-	new->detached = true;
 
 	return new;
 }
-- 
2.47.1.613.gc27f4b7a9f-goog
Re: [PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Vlastimil Babka 11 months, 1 week ago
On 1/9/25 3:30 AM, Suren Baghdasaryan wrote:
> rw_semaphore is a sizable structure of 40 bytes and consumes
> considerable space for each vm_area_struct. However vma_lock has
> two important specifics which can be used to replace rw_semaphore
> with a simpler structure:
> 1. Readers never wait. They try to take the vma_lock and fall back to
> mmap_lock if that fails.
> 2. Only one writer at a time will ever try to write-lock a vma_lock
> because writers first take mmap_lock in write mode.
> Because of these requirements, full rw_semaphore functionality is not
> needed and we can replace rw_semaphore and the vma->detached flag with
> a refcount (vm_refcnt).
> When vma is in detached state, vm_refcnt is 0 and only a call to
> vma_mark_attached() can take it out of this state. Note that unlike
> before, now we enforce both vma_mark_attached() and vma_mark_detached()
> to be done only after vma has been write-locked. vma_mark_attached()
> changes vm_refcnt to 1 to indicate that it has been attached to the vma
> tree. When a reader takes read lock, it increments vm_refcnt, unless the
> top usable bit of vm_refcnt (0x40000000) is set, indicating presence of
> a writer. When writer takes write lock, it sets the top usable bit to
> indicate its presence. If there are readers, writer will wait using newly
> introduced mm->vma_writer_wait. Since all writers take mmap_lock in write
> mode first, there can be only one writer at a time. The last reader to
> release the lock will signal the writer to wake up.
> refcount might overflow if there are many competing readers, in which case
> read-locking will fail. Readers are expected to handle such failures.
> In summary:
> 1. all readers increment the vm_refcnt;
> 2. writer sets top usable (writer) bit of vm_refcnt;
> 3. readers cannot increment the vm_refcnt if the writer bit is set;
> 4. in the presence of readers, writer must wait for the vm_refcnt to drop
> to 1 (ignoring the writer bit), indicating an attached vma with no readers;
> 5. vm_refcnt overflow is handled by the readers.
> 
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Suggested-by: Matthew Wilcox <willy@infradead.org>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

But think there's a problem that will manifest after patch 15.
Also I don't feel qualified enough about the lockdep parts though
(although I think I spotted another issue with those, below) so best if
PeterZ can review those.
Some nits below too.

> +
> +static inline void vma_refcount_put(struct vm_area_struct *vma)
> +{
> +	int oldcnt;
> +
> +	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
> +		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);

Shouldn't we rwsem_release always? And also shouldn't it precede the
refcount operation itself?

> +		if (is_vma_writer_only(oldcnt - 1))
> +			rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);

Hmm hmm we should maybe read the vm_mm pointer before dropping the
refcount? In case this races in a way that is_vma_writer_only tests true
but the writer meanwhile finishes and frees the vma. It's safe now but
not after making the cache SLAB_TYPESAFE_BY_RCU ?

> +	}
> +}
> +

>  static inline void vma_end_read(struct vm_area_struct *vma)
>  {
>  	rcu_read_lock(); /* keeps vma alive till the end of up_read */

This should refer to vma_refcount_put(). But after fixing it I think we
could stop doing this altogether? It will no longer keep vma "alive"
with SLAB_TYPESAFE_BY_RCU.

> -	up_read(&vma->vm_lock.lock);
> +	vma_refcount_put(vma);
>  	rcu_read_unlock();
>  }
>  

<snip>

> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -6370,9 +6370,41 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
>  #endif
>  
>  #ifdef CONFIG_PER_VMA_LOCK
> +static inline bool __vma_enter_locked(struct vm_area_struct *vma, unsigned int tgt_refcnt)
> +{
> +	/*
> +	 * If vma is detached then only vma_mark_attached() can raise the
> +	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
> +	 */
> +	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> +		return false;
> +
> +	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> +	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> +		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> +		   TASK_UNINTERRUPTIBLE);
> +	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> +
> +	return true;
> +}
> +
> +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> +{
> +	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
> +	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> +}
> +
>  void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
>  {
> -	down_write(&vma->vm_lock.lock);
> +	bool locked;
> +
> +	/*
> +	 * __vma_enter_locked() returns false immediately if the vma is not
> +	 * attached, otherwise it waits until refcnt is (VMA_LOCK_OFFSET + 1)
> +	 * indicating that vma is attached with no readers.
> +	 */
> +	locked = __vma_enter_locked(vma, VMA_LOCK_OFFSET + 1);

Wonder if it would be slightly better if tgt_refcount was just 1 (or 0
below in vma_mark_detached()) and the VMA_LOCK_OFFSET added to it in
__vma_enter_locked() itself as it's the one adding it in the first place.
Re: [PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Suren Baghdasaryan 11 months, 1 week ago
On Fri, Jan 10, 2025 at 6:33 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/9/25 3:30 AM, Suren Baghdasaryan wrote:
> > rw_semaphore is a sizable structure of 40 bytes and consumes
> > considerable space for each vm_area_struct. However vma_lock has
> > two important specifics which can be used to replace rw_semaphore
> > with a simpler structure:
> > 1. Readers never wait. They try to take the vma_lock and fall back to
> > mmap_lock if that fails.
> > 2. Only one writer at a time will ever try to write-lock a vma_lock
> > because writers first take mmap_lock in write mode.
> > Because of these requirements, full rw_semaphore functionality is not
> > needed and we can replace rw_semaphore and the vma->detached flag with
> > a refcount (vm_refcnt).
> > When vma is in detached state, vm_refcnt is 0 and only a call to
> > vma_mark_attached() can take it out of this state. Note that unlike
> > before, now we enforce both vma_mark_attached() and vma_mark_detached()
> > to be done only after vma has been write-locked. vma_mark_attached()
> > changes vm_refcnt to 1 to indicate that it has been attached to the vma
> > tree. When a reader takes read lock, it increments vm_refcnt, unless the
> > top usable bit of vm_refcnt (0x40000000) is set, indicating presence of
> > a writer. When writer takes write lock, it sets the top usable bit to
> > indicate its presence. If there are readers, writer will wait using newly
> > introduced mm->vma_writer_wait. Since all writers take mmap_lock in write
> > mode first, there can be only one writer at a time. The last reader to
> > release the lock will signal the writer to wake up.
> > refcount might overflow if there are many competing readers, in which case
> > read-locking will fail. Readers are expected to handle such failures.
> > In summary:
> > 1. all readers increment the vm_refcnt;
> > 2. writer sets top usable (writer) bit of vm_refcnt;
> > 3. readers cannot increment the vm_refcnt if the writer bit is set;
> > 4. in the presence of readers, writer must wait for the vm_refcnt to drop
> > to 1 (ignoring the writer bit), indicating an attached vma with no readers;
> > 5. vm_refcnt overflow is handled by the readers.
> >
> > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > Suggested-by: Matthew Wilcox <willy@infradead.org>
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
>
> But think there's a problem that will manifest after patch 15.
> Also I don't feel qualified enough about the lockdep parts though
> (although I think I spotted another issue with those, below) so best if
> PeterZ can review those.
> Some nits below too.
>
> > +
> > +static inline void vma_refcount_put(struct vm_area_struct *vma)
> > +{
> > +     int oldcnt;
> > +
> > +     if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
> > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
>
> Shouldn't we rwsem_release always? And also shouldn't it precede the
> refcount operation itself?

Yes. Hillf pointed to the same issue. It will be fixed in the next version.

>
> > +             if (is_vma_writer_only(oldcnt - 1))
> > +                     rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
>
> Hmm hmm we should maybe read the vm_mm pointer before dropping the
> refcount? In case this races in a way that is_vma_writer_only tests true
> but the writer meanwhile finishes and frees the vma. It's safe now but
> not after making the cache SLAB_TYPESAFE_BY_RCU ?

Hmm. But if is_vma_writer_only() is true that means the writed is
blocked and is waiting for the reader to drop the vm_refcnt. IOW, it
won't proceed and free the vma until the reader calls
rcuwait_wake_up(). Your suggested change is trivial and I can do it
but I want to make sure I'm not missing something. Am I?

>
> > +     }
> > +}
> > +
>
> >  static inline void vma_end_read(struct vm_area_struct *vma)
> >  {
> >       rcu_read_lock(); /* keeps vma alive till the end of up_read */
>
> This should refer to vma_refcount_put(). But after fixing it I think we
> could stop doing this altogether? It will no longer keep vma "alive"
> with SLAB_TYPESAFE_BY_RCU.

Yeah, I think the comment along with rcu_read_lock()/rcu_read_unlock()
here can be safely removed.

>
> > -     up_read(&vma->vm_lock.lock);
> > +     vma_refcount_put(vma);
> >       rcu_read_unlock();
> >  }
> >
>
> <snip>
>
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -6370,9 +6370,41 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
> >  #endif
> >
> >  #ifdef CONFIG_PER_VMA_LOCK
> > +static inline bool __vma_enter_locked(struct vm_area_struct *vma, unsigned int tgt_refcnt)
> > +{
> > +     /*
> > +      * If vma is detached then only vma_mark_attached() can raise the
> > +      * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
> > +      */
> > +     if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> > +             return false;
> > +
> > +     rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> > +     rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> > +                refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> > +                TASK_UNINTERRUPTIBLE);
> > +     lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> > +
> > +     return true;
> > +}
> > +
> > +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> > +{
> > +     *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
> > +     rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > +}
> > +
> >  void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
> >  {
> > -     down_write(&vma->vm_lock.lock);
> > +     bool locked;
> > +
> > +     /*
> > +      * __vma_enter_locked() returns false immediately if the vma is not
> > +      * attached, otherwise it waits until refcnt is (VMA_LOCK_OFFSET + 1)
> > +      * indicating that vma is attached with no readers.
> > +      */
> > +     locked = __vma_enter_locked(vma, VMA_LOCK_OFFSET + 1);
>
> Wonder if it would be slightly better if tgt_refcount was just 1 (or 0
> below in vma_mark_detached()) and the VMA_LOCK_OFFSET added to it in
> __vma_enter_locked() itself as it's the one adding it in the first place.

Well, it won't be called tgt_refcount then. Maybe "bool vma_attached"
and inside __vma_enter_locked() we do:

unsigned int tgt_refcnt = VMA_LOCK_OFFSET + vma_attached ? 1 : 0;

Is that better?

>
Re: [PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Vlastimil Babka 11 months, 1 week ago
On 1/10/25 16:56, Suren Baghdasaryan wrote:
>> > --- a/mm/memory.c
>> > +++ b/mm/memory.c
>> > @@ -6370,9 +6370,41 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
>> >  #endif
>> >
>> >  #ifdef CONFIG_PER_VMA_LOCK
>> > +static inline bool __vma_enter_locked(struct vm_area_struct *vma, unsigned int tgt_refcnt)
>> > +{
>> > +     /*
>> > +      * If vma is detached then only vma_mark_attached() can raise the
>> > +      * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
>> > +      */
>> > +     if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
>> > +             return false;
>> > +
>> > +     rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
>> > +     rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
>> > +                refcount_read(&vma->vm_refcnt) == tgt_refcnt,
>> > +                TASK_UNINTERRUPTIBLE);
>> > +     lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
>> > +
>> > +     return true;
>> > +}
>> > +
>> > +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
>> > +{
>> > +     *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
>> > +     rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
>> > +}
>> > +
>> >  void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
>> >  {
>> > -     down_write(&vma->vm_lock.lock);
>> > +     bool locked;
>> > +
>> > +     /*
>> > +      * __vma_enter_locked() returns false immediately if the vma is not
>> > +      * attached, otherwise it waits until refcnt is (VMA_LOCK_OFFSET + 1)
>> > +      * indicating that vma is attached with no readers.
>> > +      */
>> > +     locked = __vma_enter_locked(vma, VMA_LOCK_OFFSET + 1);
>>
>> Wonder if it would be slightly better if tgt_refcount was just 1 (or 0
>> below in vma_mark_detached()) and the VMA_LOCK_OFFSET added to it in
>> __vma_enter_locked() itself as it's the one adding it in the first place.
> 
> Well, it won't be called tgt_refcount then. Maybe "bool vma_attached"
> and inside __vma_enter_locked() we do:
> 
> unsigned int tgt_refcnt = VMA_LOCK_OFFSET + vma_attached ? 1 : 0;
> 
> Is that better?

Yeah I think so as it centralizes the target refcount logic into a single
place __vma_enter_locked().
Hm but then it's weird that __vma_start_write() would set vma_attached to
true and yet it handles also a case where it's not attached.
Maybe call the parameter "detaching" and switch the 0 and 1?
Re: [PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Suren Baghdasaryan 11 months, 1 week ago
On Fri, Jan 10, 2025 at 2:26 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/10/25 16:56, Suren Baghdasaryan wrote:
> >> > --- a/mm/memory.c
> >> > +++ b/mm/memory.c
> >> > @@ -6370,9 +6370,41 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
> >> >  #endif
> >> >
> >> >  #ifdef CONFIG_PER_VMA_LOCK
> >> > +static inline bool __vma_enter_locked(struct vm_area_struct *vma, unsigned int tgt_refcnt)
> >> > +{
> >> > +     /*
> >> > +      * If vma is detached then only vma_mark_attached() can raise the
> >> > +      * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
> >> > +      */
> >> > +     if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> >> > +             return false;
> >> > +
> >> > +     rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> >> > +     rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> >> > +                refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> >> > +                TASK_UNINTERRUPTIBLE);
> >> > +     lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> >> > +
> >> > +     return true;
> >> > +}
> >> > +
> >> > +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> >> > +{
> >> > +     *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
> >> > +     rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> >> > +}
> >> > +
> >> >  void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
> >> >  {
> >> > -     down_write(&vma->vm_lock.lock);
> >> > +     bool locked;
> >> > +
> >> > +     /*
> >> > +      * __vma_enter_locked() returns false immediately if the vma is not
> >> > +      * attached, otherwise it waits until refcnt is (VMA_LOCK_OFFSET + 1)
> >> > +      * indicating that vma is attached with no readers.
> >> > +      */
> >> > +     locked = __vma_enter_locked(vma, VMA_LOCK_OFFSET + 1);
> >>
> >> Wonder if it would be slightly better if tgt_refcount was just 1 (or 0
> >> below in vma_mark_detached()) and the VMA_LOCK_OFFSET added to it in
> >> __vma_enter_locked() itself as it's the one adding it in the first place.
> >
> > Well, it won't be called tgt_refcount then. Maybe "bool vma_attached"
> > and inside __vma_enter_locked() we do:
> >
> > unsigned int tgt_refcnt = VMA_LOCK_OFFSET + vma_attached ? 1 : 0;
> >
> > Is that better?
>
> Yeah I think so as it centralizes the target refcount logic into a single
> place __vma_enter_locked().
> Hm but then it's weird that __vma_start_write() would set vma_attached to
> true and yet it handles also a case where it's not attached.

Ah, good point.

> Maybe call the parameter "detaching" and switch the 0 and 1?

Yes, that would be less confusing. Thanks for the suggestion, I'll use it.
Re: [PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Suren Baghdasaryan 11 months, 1 week ago
On Fri, Jan 10, 2025 at 7:56 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Jan 10, 2025 at 6:33 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > On 1/9/25 3:30 AM, Suren Baghdasaryan wrote:
> > > rw_semaphore is a sizable structure of 40 bytes and consumes
> > > considerable space for each vm_area_struct. However vma_lock has
> > > two important specifics which can be used to replace rw_semaphore
> > > with a simpler structure:
> > > 1. Readers never wait. They try to take the vma_lock and fall back to
> > > mmap_lock if that fails.
> > > 2. Only one writer at a time will ever try to write-lock a vma_lock
> > > because writers first take mmap_lock in write mode.
> > > Because of these requirements, full rw_semaphore functionality is not
> > > needed and we can replace rw_semaphore and the vma->detached flag with
> > > a refcount (vm_refcnt).
> > > When vma is in detached state, vm_refcnt is 0 and only a call to
> > > vma_mark_attached() can take it out of this state. Note that unlike
> > > before, now we enforce both vma_mark_attached() and vma_mark_detached()
> > > to be done only after vma has been write-locked. vma_mark_attached()
> > > changes vm_refcnt to 1 to indicate that it has been attached to the vma
> > > tree. When a reader takes read lock, it increments vm_refcnt, unless the
> > > top usable bit of vm_refcnt (0x40000000) is set, indicating presence of
> > > a writer. When writer takes write lock, it sets the top usable bit to
> > > indicate its presence. If there are readers, writer will wait using newly
> > > introduced mm->vma_writer_wait. Since all writers take mmap_lock in write
> > > mode first, there can be only one writer at a time. The last reader to
> > > release the lock will signal the writer to wake up.
> > > refcount might overflow if there are many competing readers, in which case
> > > read-locking will fail. Readers are expected to handle such failures.
> > > In summary:
> > > 1. all readers increment the vm_refcnt;
> > > 2. writer sets top usable (writer) bit of vm_refcnt;
> > > 3. readers cannot increment the vm_refcnt if the writer bit is set;
> > > 4. in the presence of readers, writer must wait for the vm_refcnt to drop
> > > to 1 (ignoring the writer bit), indicating an attached vma with no readers;
> > > 5. vm_refcnt overflow is handled by the readers.
> > >
> > > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > > Suggested-by: Matthew Wilcox <willy@infradead.org>
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> >
> > Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
> >
> > But think there's a problem that will manifest after patch 15.
> > Also I don't feel qualified enough about the lockdep parts though
> > (although I think I spotted another issue with those, below) so best if
> > PeterZ can review those.
> > Some nits below too.
> >
> > > +
> > > +static inline void vma_refcount_put(struct vm_area_struct *vma)
> > > +{
> > > +     int oldcnt;
> > > +
> > > +     if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
> > > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> >
> > Shouldn't we rwsem_release always? And also shouldn't it precede the
> > refcount operation itself?
>
> Yes. Hillf pointed to the same issue. It will be fixed in the next version.
>
> >
> > > +             if (is_vma_writer_only(oldcnt - 1))
> > > +                     rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> >
> > Hmm hmm we should maybe read the vm_mm pointer before dropping the
> > refcount? In case this races in a way that is_vma_writer_only tests true
> > but the writer meanwhile finishes and frees the vma. It's safe now but
> > not after making the cache SLAB_TYPESAFE_BY_RCU ?
>
> Hmm. But if is_vma_writer_only() is true that means the writed is
> blocked and is waiting for the reader to drop the vm_refcnt. IOW, it
> won't proceed and free the vma until the reader calls
> rcuwait_wake_up(). Your suggested change is trivial and I can do it
> but I want to make sure I'm not missing something. Am I?

Ok, after thinking some more, I think the race you might be referring
to is this:

writer                                           reader

    __vma_enter_locked
        refcount_add_not_zero(VMA_LOCK_OFFSET, ...)
                                                    vma_refcount_put
                                                       __refcount_dec_and_test()
                                                           if
(is_vma_writer_only())
        rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, ...)
    __vma_exit_locked
        refcount_sub_and_test(VMA_LOCK_OFFSET, ...)
    free the vma

rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);

I think it's possible and your suggestion of storing the mm before
doing __refcount_dec_and_test() should work. Thanks for pointing this
out! I'll fix it in the next version.

>
> >
> > > +     }
> > > +}
> > > +
> >
> > >  static inline void vma_end_read(struct vm_area_struct *vma)
> > >  {
> > >       rcu_read_lock(); /* keeps vma alive till the end of up_read */
> >
> > This should refer to vma_refcount_put(). But after fixing it I think we
> > could stop doing this altogether? It will no longer keep vma "alive"
> > with SLAB_TYPESAFE_BY_RCU.
>
> Yeah, I think the comment along with rcu_read_lock()/rcu_read_unlock()
> here can be safely removed.
>
> >
> > > -     up_read(&vma->vm_lock.lock);
> > > +     vma_refcount_put(vma);
> > >       rcu_read_unlock();
> > >  }
> > >
> >
> > <snip>
> >
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -6370,9 +6370,41 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
> > >  #endif
> > >
> > >  #ifdef CONFIG_PER_VMA_LOCK
> > > +static inline bool __vma_enter_locked(struct vm_area_struct *vma, unsigned int tgt_refcnt)
> > > +{
> > > +     /*
> > > +      * If vma is detached then only vma_mark_attached() can raise the
> > > +      * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
> > > +      */
> > > +     if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> > > +             return false;
> > > +
> > > +     rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> > > +     rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> > > +                refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> > > +                TASK_UNINTERRUPTIBLE);
> > > +     lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> > > +
> > > +     return true;
> > > +}
> > > +
> > > +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> > > +{
> > > +     *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
> > > +     rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > > +}
> > > +
> > >  void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
> > >  {
> > > -     down_write(&vma->vm_lock.lock);
> > > +     bool locked;
> > > +
> > > +     /*
> > > +      * __vma_enter_locked() returns false immediately if the vma is not
> > > +      * attached, otherwise it waits until refcnt is (VMA_LOCK_OFFSET + 1)
> > > +      * indicating that vma is attached with no readers.
> > > +      */
> > > +     locked = __vma_enter_locked(vma, VMA_LOCK_OFFSET + 1);
> >
> > Wonder if it would be slightly better if tgt_refcount was just 1 (or 0
> > below in vma_mark_detached()) and the VMA_LOCK_OFFSET added to it in
> > __vma_enter_locked() itself as it's the one adding it in the first place.
>
> Well, it won't be called tgt_refcount then. Maybe "bool vma_attached"
> and inside __vma_enter_locked() we do:
>
> unsigned int tgt_refcnt = VMA_LOCK_OFFSET + vma_attached ? 1 : 0;
>
> Is that better?
>
> >
Re: [PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Suren Baghdasaryan 11 months, 1 week ago
On Fri, Jan 10, 2025 at 8:47 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Jan 10, 2025 at 7:56 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Fri, Jan 10, 2025 at 6:33 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > >
> > > On 1/9/25 3:30 AM, Suren Baghdasaryan wrote:
> > > > rw_semaphore is a sizable structure of 40 bytes and consumes
> > > > considerable space for each vm_area_struct. However vma_lock has
> > > > two important specifics which can be used to replace rw_semaphore
> > > > with a simpler structure:
> > > > 1. Readers never wait. They try to take the vma_lock and fall back to
> > > > mmap_lock if that fails.
> > > > 2. Only one writer at a time will ever try to write-lock a vma_lock
> > > > because writers first take mmap_lock in write mode.
> > > > Because of these requirements, full rw_semaphore functionality is not
> > > > needed and we can replace rw_semaphore and the vma->detached flag with
> > > > a refcount (vm_refcnt).
> > > > When vma is in detached state, vm_refcnt is 0 and only a call to
> > > > vma_mark_attached() can take it out of this state. Note that unlike
> > > > before, now we enforce both vma_mark_attached() and vma_mark_detached()
> > > > to be done only after vma has been write-locked. vma_mark_attached()
> > > > changes vm_refcnt to 1 to indicate that it has been attached to the vma
> > > > tree. When a reader takes read lock, it increments vm_refcnt, unless the
> > > > top usable bit of vm_refcnt (0x40000000) is set, indicating presence of
> > > > a writer. When writer takes write lock, it sets the top usable bit to
> > > > indicate its presence. If there are readers, writer will wait using newly
> > > > introduced mm->vma_writer_wait. Since all writers take mmap_lock in write
> > > > mode first, there can be only one writer at a time. The last reader to
> > > > release the lock will signal the writer to wake up.
> > > > refcount might overflow if there are many competing readers, in which case
> > > > read-locking will fail. Readers are expected to handle such failures.
> > > > In summary:
> > > > 1. all readers increment the vm_refcnt;
> > > > 2. writer sets top usable (writer) bit of vm_refcnt;
> > > > 3. readers cannot increment the vm_refcnt if the writer bit is set;
> > > > 4. in the presence of readers, writer must wait for the vm_refcnt to drop
> > > > to 1 (ignoring the writer bit), indicating an attached vma with no readers;
> > > > 5. vm_refcnt overflow is handled by the readers.
> > > >
> > > > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > > > Suggested-by: Matthew Wilcox <willy@infradead.org>
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > >
> > > Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
> > >
> > > But think there's a problem that will manifest after patch 15.
> > > Also I don't feel qualified enough about the lockdep parts though
> > > (although I think I spotted another issue with those, below) so best if
> > > PeterZ can review those.
> > > Some nits below too.
> > >
> > > > +
> > > > +static inline void vma_refcount_put(struct vm_area_struct *vma)
> > > > +{
> > > > +     int oldcnt;
> > > > +
> > > > +     if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
> > > > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > >
> > > Shouldn't we rwsem_release always? And also shouldn't it precede the
> > > refcount operation itself?
> >
> > Yes. Hillf pointed to the same issue. It will be fixed in the next version.
> >
> > >
> > > > +             if (is_vma_writer_only(oldcnt - 1))
> > > > +                     rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> > >
> > > Hmm hmm we should maybe read the vm_mm pointer before dropping the
> > > refcount? In case this races in a way that is_vma_writer_only tests true
> > > but the writer meanwhile finishes and frees the vma. It's safe now but
> > > not after making the cache SLAB_TYPESAFE_BY_RCU ?
> >
> > Hmm. But if is_vma_writer_only() is true that means the writed is
> > blocked and is waiting for the reader to drop the vm_refcnt. IOW, it
> > won't proceed and free the vma until the reader calls
> > rcuwait_wake_up(). Your suggested change is trivial and I can do it
> > but I want to make sure I'm not missing something. Am I?
>
> Ok, after thinking some more, I think the race you might be referring
> to is this:
>
> writer                                           reader
>
>     __vma_enter_locked
>         refcount_add_not_zero(VMA_LOCK_OFFSET, ...)
>                                                     vma_refcount_put
>                                                        __refcount_dec_and_test()
>                                                            if
> (is_vma_writer_only())
>         rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, ...)
>     __vma_exit_locked
>         refcount_sub_and_test(VMA_LOCK_OFFSET, ...)
>     free the vma
>
> rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);

Sorry, this should be more readable:

writer             reader

__vma_enter_locked
    refcount_add_not_zero(VMA_LOCK_OFFSET, ...)
                   vma_refcount_put
                       __refcount_dec_and_test()
                           if (is_vma_writer_only())

    rcuwait_wait_event()
__vma_exit_locked
    refcount_sub_and_test(VMA_LOCK_OFFSET, ...)
free the vma

                               rcuwait_wake_up(); <-- access to vma->vm_mm

>
> I think it's possible and your suggestion of storing the mm before
> doing __refcount_dec_and_test() should work. Thanks for pointing this
> out! I'll fix it in the next version.
>
> >
> > >
> > > > +     }
> > > > +}
> > > > +
> > >
> > > >  static inline void vma_end_read(struct vm_area_struct *vma)
> > > >  {
> > > >       rcu_read_lock(); /* keeps vma alive till the end of up_read */
> > >
> > > This should refer to vma_refcount_put(). But after fixing it I think we
> > > could stop doing this altogether? It will no longer keep vma "alive"
> > > with SLAB_TYPESAFE_BY_RCU.
> >
> > Yeah, I think the comment along with rcu_read_lock()/rcu_read_unlock()
> > here can be safely removed.
> >
> > >
> > > > -     up_read(&vma->vm_lock.lock);
> > > > +     vma_refcount_put(vma);
> > > >       rcu_read_unlock();
> > > >  }
> > > >
> > >
> > > <snip>
> > >
> > > > --- a/mm/memory.c
> > > > +++ b/mm/memory.c
> > > > @@ -6370,9 +6370,41 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
> > > >  #endif
> > > >
> > > >  #ifdef CONFIG_PER_VMA_LOCK
> > > > +static inline bool __vma_enter_locked(struct vm_area_struct *vma, unsigned int tgt_refcnt)
> > > > +{
> > > > +     /*
> > > > +      * If vma is detached then only vma_mark_attached() can raise the
> > > > +      * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
> > > > +      */
> > > > +     if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> > > > +             return false;
> > > > +
> > > > +     rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> > > > +     rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> > > > +                refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> > > > +                TASK_UNINTERRUPTIBLE);
> > > > +     lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> > > > +
> > > > +     return true;
> > > > +}
> > > > +
> > > > +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> > > > +{
> > > > +     *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
> > > > +     rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> > > > +}
> > > > +
> > > >  void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
> > > >  {
> > > > -     down_write(&vma->vm_lock.lock);
> > > > +     bool locked;
> > > > +
> > > > +     /*
> > > > +      * __vma_enter_locked() returns false immediately if the vma is not
> > > > +      * attached, otherwise it waits until refcnt is (VMA_LOCK_OFFSET + 1)
> > > > +      * indicating that vma is attached with no readers.
> > > > +      */
> > > > +     locked = __vma_enter_locked(vma, VMA_LOCK_OFFSET + 1);
> > >
> > > Wonder if it would be slightly better if tgt_refcount was just 1 (or 0
> > > below in vma_mark_detached()) and the VMA_LOCK_OFFSET added to it in
> > > __vma_enter_locked() itself as it's the one adding it in the first place.
> >
> > Well, it won't be called tgt_refcount then. Maybe "bool vma_attached"
> > and inside __vma_enter_locked() we do:
> >
> > unsigned int tgt_refcnt = VMA_LOCK_OFFSET + vma_attached ? 1 : 0;
> >
> > Is that better?
> >
> > >
Re: [PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Hillf Danton 11 months, 1 week ago
On Wed,  8 Jan 2025 18:30:20 -0800 Suren Baghdasaryan <surenb@google.com>
> +
> +static inline void vma_refcount_put(struct vm_area_struct *vma)
> +{
> +	int oldcnt;
> +
> +	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
> +		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);

In up_read() rwsem is released reguardless wakeup, which is different
than what is added here. Nit.

> +
> +		if (is_vma_writer_only(oldcnt - 1))
> +			rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> +	}
> +}
...
> @@ -735,9 +773,10 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	 * This pairs with RELEASE semantics in vma_end_write_all().
>  	 */
>  	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> -		up_read(&vma->vm_lock.lock);
> +		vma_refcount_put(vma);
>  		return false;
>  	}

void up_read(struct rw_semaphore *sem)
{
	rwsem_release(&sem->dep_map, _RET_IP_);
	__up_read(sem);
}
Re: [PATCH v8 11/16] mm: replace vm_lock and detached flag with a reference count
Posted by Suren Baghdasaryan 11 months, 1 week ago
On Thu, Jan 9, 2025 at 2:36 AM Hillf Danton <hdanton@sina.com> wrote:
>
> On Wed,  8 Jan 2025 18:30:20 -0800 Suren Baghdasaryan <surenb@google.com>
> > +
> > +static inline void vma_refcount_put(struct vm_area_struct *vma)
> > +{
> > +     int oldcnt;
> > +
> > +     if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
> > +             rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
>
> In up_read() rwsem is released reguardless wakeup, which is different
> than what is added here. Nit.

Good point. I'll send a fixup since it's a small change. Thanks!

>
> > +
> > +             if (is_vma_writer_only(oldcnt - 1))
> > +                     rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> > +     }
> > +}
> ...
> > @@ -735,9 +773,10 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >        * This pairs with RELEASE semantics in vma_end_write_all().
> >        */
> >       if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> > -             up_read(&vma->vm_lock.lock);
> > +             vma_refcount_put(vma);
> >               return false;
> >       }
>
> void up_read(struct rw_semaphore *sem)
> {
>         rwsem_release(&sem->dep_map, _RET_IP_);
>         __up_read(sem);
> }
>