With maple_tree supporting vma tree traversal under RCU and per-vma
locks, /proc/pid/maps can be read while holding individual vma locks
instead of locking the entire address space.
A completely lockless approach (walking the vma tree under RCU) would be
quite complex, with the main issue being that get_vma_name() uses
callbacks which might not work correctly with a stable vma copy and
require the original (unstable) vma - see special_mapping_name() for an
example.
When per-vma lock acquisition fails, we take the mmap_lock for reading,
lock the vma, release the mmap_lock and continue. This fallback to the
mmap read lock guarantees that the reader makes forward progress even
during lock contention. It will interfere with the writer, but only for
the very short time while we are acquiring the per-vma lock, and only
when there was contention on the vma the reader is interested in.

We shouldn't see repeated fallbacks to mmap read locks in practice, as
that requires a very unlikely series of lock contentions (for instance
due to repeated vma split operations). However, even if this did somehow
happen, we would still make progress.
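
In outline, the fallback path (implemented by lock_next_vma() and
lock_vma_under_mmap_lock() in mm/mmap_lock.c below) looks roughly as
follows; this is a simplified sketch, not the literal patch code:

  vma = vma_next(iter);                     /* next vma, found under RCU */
  vma = vma_start_read(mm, vma);            /* try to take the per-vma lock */
  if (IS_ERR_OR_NULL(vma)) {
          if (mmap_read_lock_killable(mm))  /* fall back to mmap read lock */
                  return ERR_PTR(-EINTR);
          vma_iter_init(iter, mm, address); /* re-find at the last position */
          vma = vma_next(iter);
          if (vma)
                  vma_start_read_locked(vma); /* cannot be contended now */
          mmap_read_unlock(mm);             /* writer blocked only for this window */
  }
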
One case requiring special handling is when a vma changes between the
time it was found and the time it got locked. A problematic case would
be if the vma got shrunk so that its vm_start moved higher in the
address space and a new vma was installed at the beginning:

reader found:               |--------VMA A--------|
VMA is modified:            |-VMA B-|----VMA A----|
reader locks modified VMA A
reader reports VMA A:       |  gap  |----VMA A----|

This would result in reporting a gap in the address space that does not
exist. To prevent this we retry the lookup after locking the vma;
however, we do that only when we identify a gap and detect that the
address space was changed after we found the vma.
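
The corresponding check in lock_next_vma() (see the mm/mmap_lock.c hunk
below) boils down to the following excerpt, where 'address' is the end
of the previously reported vma:

  if (address < vma->vm_start) {
          /* apparent gap; re-check only if the mm might have changed */
          if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
                  vma_iter_init(iter, mm, address);
                  if (vma != vma_next(iter))
                          goto out_unlock;  /* fall back and redo the lookup */
          }
  }
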
This change is designed to reduce mmap_lock contention and prevent a
process reading /proc/pid/maps files (often a low-priority task, such
as a monitoring/data collection service) from blocking address space
updates. Note that this change has a userspace-visible disadvantage:
it allows for sub-page data tearing as opposed to the previous mechanism
where data tearing could happen only between pages of generated output
data. Since current userspace considers data tearing between pages to be
acceptable, we assume it will be able to handle sub-page data tearing
as well.
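
For reference, a typical consumer reads the file in fixed-size chunks;
an illustrative userspace snippet (not part of this patch):

  #include <fcntl.h>
  #include <unistd.h>

  int main(void)
  {
          char buf[4096];
          ssize_t n;
          int fd = open("/proc/self/maps", O_RDONLY);

          if (fd < 0)
                  return 1;
          /*
           * Previously the output could become inconsistent only where
           * the kernel refilled its output buffer (roughly at page-sized
           * boundaries); with per-vma locking, lines generated from
           * different vmas within one chunk may also be mutually
           * inconsistent.
           */
          while ((n = read(fd, buf, sizeof(buf))) > 0)
                  write(STDOUT_FILENO, buf, n);
          close(fd);
          return 0;
  }
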
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
fs/proc/internal.h | 5 ++
fs/proc/task_mmu.c | 118 ++++++++++++++++++++++++++++++++++----
include/linux/mmap_lock.h | 11 ++++
mm/madvise.c | 3 +-
mm/mmap_lock.c | 88 ++++++++++++++++++++++++++++
5 files changed, 214 insertions(+), 11 deletions(-)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3d48ffe72583..7c235451c5ea 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -384,6 +384,11 @@ struct proc_maps_private {
struct task_struct *task;
struct mm_struct *mm;
struct vma_iterator iter;
+ loff_t last_pos;
+#ifdef CONFIG_PER_VMA_LOCK
+ bool mmap_locked;
+ struct vm_area_struct *locked_vma;
+#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b8bc06d05a72..ff3fe488ce51 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -127,15 +127,107 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
- loff_t *ppos)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static void unlock_vma(struct proc_maps_private *priv)
+{
+ if (priv->locked_vma) {
+ vma_end_read(priv->locked_vma);
+ priv->locked_vma = NULL;
+ }
+}
+
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_private *priv)
+{
+ /*
+ * smaps and numa_maps perform page table walk, therefore require
+ * mmap_lock but maps can be read with locking just the vma.
+ */
+ if (m->op != &proc_pid_maps_op) {
+ if (mmap_read_lock_killable(priv->mm))
+ return false;
+
+ priv->mmap_locked = true;
+ } else {
+ rcu_read_lock();
+ priv->locked_vma = NULL;
+ priv->mmap_locked = false;
+ }
+
+ return true;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+ if (priv->mmap_locked) {
+ mmap_read_unlock(priv->mm);
+ } else {
+ unlock_vma(priv);
+ rcu_read_unlock();
+ }
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ struct vm_area_struct *vma;
+
+ if (priv->mmap_locked)
+ return vma_next(&priv->iter);
+
+ unlock_vma(priv);
+ vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
+ if (!IS_ERR_OR_NULL(vma))
+ priv->locked_vma = vma;
+
+ return vma;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_private *priv)
{
- struct vm_area_struct *vma = vma_next(&priv->iter);
+ return mmap_read_lock_killable(priv->mm) == 0;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+ mmap_read_unlock(priv->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ return vma_next(&priv->iter);
+}
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma;
+
+ vma = get_next_vma(priv, *ppos);
+ /* EINTR is possible */
+ if (IS_ERR(vma))
+ return vma;
+
+ /* Store previous position to be able to restart if needed */
+ priv->last_pos = *ppos;
if (vma) {
- *ppos = vma->vm_start;
+ /*
+ * Track the end of the reported vma to ensure position changes
+ * even if previous vma was merged with the next vma and we
+ * found the extended vma with the same vm_start.
+ */
+ *ppos = vma->vm_end;
} else {
- *ppos = -2;
+ *ppos = -2; /* -2 indicates gate vma */
vma = get_gate_vma(priv->mm);
}
@@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return NULL;
}
- if (mmap_read_lock_killable(mm)) {
+ if (!lock_vma_range(m, priv)) {
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
return ERR_PTR(-EINTR);
}
+ /*
+ * Reset current position if last_addr was set before
+ * and it's not a sentinel.
+ */
+ if (last_addr > 0)
+ *ppos = last_addr = priv->last_pos;
vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
hold_task_mempolicy(priv);
if (last_addr == -2)
return get_gate_vma(mm);
- return proc_get_vma(priv, ppos);
+ return proc_get_vma(m, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
if (*ppos == -2) {
- *ppos = -1;
+ *ppos = -1; /* -1 indicates no more vmas */
return NULL;
}
- return proc_get_vma(m->private, ppos);
+ return proc_get_vma(m, ppos);
}
static void m_stop(struct seq_file *m, void *v)
@@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v)
return;
release_task_mempolicy(priv);
- mmap_read_unlock(mm);
+ unlock_vma_range(priv);
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 5da384bd0a26..1f4f44951abe 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma);
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
unsigned long address);
+/*
+ * Locks next vma pointed by the iterator. Confirms the locked vma has not
+ * been modified and will retry under mmap_lock protection if modification
+ * was detected. Should be called from read RCU section.
+ * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the
+ * process was interrupted.
+ */
+struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
+ struct vma_iterator *iter,
+ unsigned long address);
+
#else /* CONFIG_PER_VMA_LOCK */
static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
diff --git a/mm/madvise.c b/mm/madvise.c
index a34c2c89a53b..e61e32b2cd91 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref)
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
- mmap_assert_locked(vma->vm_mm);
+ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ vma_assert_locked(vma);
return vma->anon_name;
}
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 5f725cc67334..ed0e5e2171cd 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
count_vm_vma_lock_event(VMA_LOCK_ABORT);
return NULL;
}
+
+static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm,
+ struct vma_iterator *iter,
+ unsigned long address)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Lookup the vma at the last position again under mmap_read_lock */
+ vma_iter_init(iter, mm, address);
+ vma = vma_next(iter);
+ if (vma)
+ vma_start_read_locked(vma);
+
+ mmap_read_unlock(mm);
+
+ return vma;
+}
+
+struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
+ struct vma_iterator *iter,
+ unsigned long address)
+{
+ struct vm_area_struct *vma;
+ unsigned int mm_wr_seq;
+ bool mmap_unlocked;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
+retry:
+ /* Start mmap_lock speculation in case we need to verify the vma later */
+ mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
+ vma = vma_next(iter);
+ if (!vma)
+ return NULL;
+
+ vma = vma_start_read(mm, vma);
+
+ if (IS_ERR_OR_NULL(vma)) {
+ /*
+ * Retry immediately if the vma gets detached from under us.
+ * Infinite loop should not happen because the vma we find will
+ * have to be constantly knocked out from under us.
+ */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ vma_iter_init(iter, mm, address);
+ goto retry;
+ }
+
+ goto out;
+ }
+
+ /*
+ * Verify the vma we locked belongs to the same address space and it's
+ * not behind of the last search position.
+ */
+ if (unlikely(vma->vm_mm != mm || address >= vma->vm_end))
+ goto out_unlock;
+
+ /*
+ * vma can be ahead of the last search position but we need to verify
+ * it was not shrunk after we found it and another vma has not been
+ * installed ahead of it. Otherwise we might observe a gap that should
+ * not be there.
+ */
+ if (address < vma->vm_start) {
+ /* Verify only if the address space might have changed since vma lookup. */
+ if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
+ vma_iter_init(iter, mm, address);
+ if (vma != vma_next(iter))
+ goto out_unlock;
+ }
+ }
+
+ return vma;
+
+out_unlock:
+ vma_end_read(vma);
+out:
+ rcu_read_unlock();
+ vma = lock_vma_under_mmap_lock(mm, iter, address);
+ rcu_read_lock();
+
+ return vma;
+}
#endif /* CONFIG_PER_VMA_LOCK */
#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
--
2.50.0.727.gbf7dc18ff4-goog
On 7/4/25 08:07, Suren Baghdasaryan wrote:
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> count_vm_vma_lock_event(VMA_LOCK_ABORT);
> return NULL;
> }
> +
> +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm,
> + struct vma_iterator *iter,
> + unsigned long address)
> +{
> + struct vm_area_struct *vma;
> + int ret;
> +
> + ret = mmap_read_lock_killable(mm);
> + if (ret)
> + return ERR_PTR(ret);
> +
> + /* Lookup the vma at the last position again under mmap_read_lock */
> + vma_iter_init(iter, mm, address);
> + vma = vma_next(iter);
> + if (vma)
> + vma_start_read_locked(vma);

This can in theory return false (refcount overflow?) so it should be handled?

> +
> + mmap_read_unlock(mm);
> +
> + return vma;
> +}
> +
On Wed, Jul 9, 2025 at 3:03 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 7/4/25 08:07, Suren Baghdasaryan wrote:
> > --- a/mm/mmap_lock.c
> > +++ b/mm/mmap_lock.c
> > @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> > count_vm_vma_lock_event(VMA_LOCK_ABORT);
> > return NULL;
> > }
> > +
> > +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm,
> > + struct vma_iterator *iter,
> > + unsigned long address)
> > +{
> > + struct vm_area_struct *vma;
> > + int ret;
> > +
> > + ret = mmap_read_lock_killable(mm);
> > + if (ret)
> > + return ERR_PTR(ret);
> > +
> > + /* Lookup the vma at the last position again under mmap_read_lock */
> > + vma_iter_init(iter, mm, address);
> > + vma = vma_next(iter);
> > + if (vma)
> > + vma_start_read_locked(vma);
>
> This can in theory return false (refcount overflow?) so it should be handled?

Yes, I should handle it by falling back to mmap_lock. Thanks!

>
> > +
> > + mmap_read_unlock(mm);
> > +
> > + return vma;
> > +}
> > +
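
A minimal sketch of checking that return value in lock_vma_under_mmap_lock()
(hypothetical illustration only - how the failure is then recovered from,
e.g. by continuing the walk under the mmap_lock, is left to the respin):

  /* hypothetical sketch; iter, mm and address as in the patch above */
  vma_iter_init(iter, mm, address);
  vma = vma_next(iter);
  if (vma && !vma_start_read_locked(vma))
          vma = ERR_PTR(-EAGAIN);   /* e.g. refcount overflow; recovery TBD */

  mmap_read_unlock(mm);

  return vma;
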
* Suren Baghdasaryan <surenb@google.com> [250704 02:07]: > With maple_tree supporting vma tree traversal under RCU and per-vma > locks, /proc/pid/maps can be read while holding individual vma locks > instead of locking the entire address space. > Completely lockless approach (walking vma tree under RCU) would be quite > complex with the main issue being get_vma_name() using callbacks which > might not work correctly with a stable vma copy, requiring original > (unstable) vma - see special_mapping_name() for an example. > When per-vma lock acquisition fails, we take the mmap_lock for reading, > lock the vma, release the mmap_lock and continue. This fallback to mmap > read lock guarantees the reader to make forward progress even during > lock contention. This will interfere with the writer but for a very > short time while we are acquiring the per-vma lock and only when there > was contention on the vma reader is interested in. We shouldn't see a > repeated fallback to mmap read locks in practice, as this require a > very unlikely series of lock contentions (for instance due to repeated > vma split operations). However even if this did somehow happen, we would > still progress. > One case requiring special handling is when vma changes between the > time it was found and the time it got locked. A problematic case would > be if vma got shrunk so that it's start moved higher in the address > space and a new vma was installed at the beginning: > > reader found: |--------VMA A--------| > VMA is modified: |-VMA B-|----VMA A----| > reader locks modified VMA A > reader reports VMA A: | gap |----VMA A----| > > This would result in reporting a gap in the address space that does not > exist. To prevent this we retry the lookup after locking the vma, however > we do that only when we identify a gap and detect that the address space > was changed after we found the vma. > This change is designed to reduce mmap_lock contention and prevent a > process reading /proc/pid/maps files (often a low priority task, such > as monitoring/data collection services) from blocking address space > updates. Note that this change has a userspace visible disadvantage: > it allows for sub-page data tearing as opposed to the previous mechanism > where data tearing could happen only between pages of generated output > data. Since current userspace considers data tearing between pages to be > acceptable, we assume is will be able to handle sub-page data tearing > as well. > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> Reviewed-by: Liam R. 
Howlett <Liam.Howlett@oracle.com> > --- > fs/proc/internal.h | 5 ++ > fs/proc/task_mmu.c | 118 ++++++++++++++++++++++++++++++++++---- > include/linux/mmap_lock.h | 11 ++++ > mm/madvise.c | 3 +- > mm/mmap_lock.c | 88 ++++++++++++++++++++++++++++ > 5 files changed, 214 insertions(+), 11 deletions(-) > > diff --git a/fs/proc/internal.h b/fs/proc/internal.h > index 3d48ffe72583..7c235451c5ea 100644 > --- a/fs/proc/internal.h > +++ b/fs/proc/internal.h > @@ -384,6 +384,11 @@ struct proc_maps_private { > struct task_struct *task; > struct mm_struct *mm; > struct vma_iterator iter; > + loff_t last_pos; > +#ifdef CONFIG_PER_VMA_LOCK > + bool mmap_locked; > + struct vm_area_struct *locked_vma; > +#endif > #ifdef CONFIG_NUMA > struct mempolicy *task_mempolicy; > #endif > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > index b8bc06d05a72..ff3fe488ce51 100644 > --- a/fs/proc/task_mmu.c > +++ b/fs/proc/task_mmu.c > @@ -127,15 +127,107 @@ static void release_task_mempolicy(struct proc_maps_private *priv) > } > #endif > > -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, > - loff_t *ppos) > +#ifdef CONFIG_PER_VMA_LOCK > + > +static void unlock_vma(struct proc_maps_private *priv) > +{ > + if (priv->locked_vma) { > + vma_end_read(priv->locked_vma); > + priv->locked_vma = NULL; > + } > +} > + > +static const struct seq_operations proc_pid_maps_op; > + > +static inline bool lock_vma_range(struct seq_file *m, > + struct proc_maps_private *priv) > +{ > + /* > + * smaps and numa_maps perform page table walk, therefore require > + * mmap_lock but maps can be read with locking just the vma. > + */ > + if (m->op != &proc_pid_maps_op) { > + if (mmap_read_lock_killable(priv->mm)) > + return false; > + > + priv->mmap_locked = true; > + } else { > + rcu_read_lock(); > + priv->locked_vma = NULL; > + priv->mmap_locked = false; > + } > + > + return true; > +} > + > +static inline void unlock_vma_range(struct proc_maps_private *priv) > +{ > + if (priv->mmap_locked) { > + mmap_read_unlock(priv->mm); > + } else { > + unlock_vma(priv); > + rcu_read_unlock(); > + } > +} > + > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > + loff_t last_pos) > +{ > + struct vm_area_struct *vma; > + > + if (priv->mmap_locked) > + return vma_next(&priv->iter); > + > + unlock_vma(priv); > + vma = lock_next_vma(priv->mm, &priv->iter, last_pos); > + if (!IS_ERR_OR_NULL(vma)) > + priv->locked_vma = vma; > + > + return vma; > +} > + > +#else /* CONFIG_PER_VMA_LOCK */ > + > +static inline bool lock_vma_range(struct seq_file *m, > + struct proc_maps_private *priv) > { > - struct vm_area_struct *vma = vma_next(&priv->iter); > + return mmap_read_lock_killable(priv->mm) == 0; > +} > + > +static inline void unlock_vma_range(struct proc_maps_private *priv) > +{ > + mmap_read_unlock(priv->mm); > +} > + > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > + loff_t last_pos) > +{ > + return vma_next(&priv->iter); > +} > > +#endif /* CONFIG_PER_VMA_LOCK */ > + > +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) > +{ > + struct proc_maps_private *priv = m->private; > + struct vm_area_struct *vma; > + > + vma = get_next_vma(priv, *ppos); > + /* EINTR is possible */ > + if (IS_ERR(vma)) > + return vma; > + > + /* Store previous position to be able to restart if needed */ > + priv->last_pos = *ppos; > if (vma) { > - *ppos = vma->vm_start; > + /* > + * Track the end of the reported vma to ensure position changes > + * even if previous vma was merged 
with the next vma and we > + * found the extended vma with the same vm_start. > + */ > + *ppos = vma->vm_end; > } else { > - *ppos = -2; > + *ppos = -2; /* -2 indicates gate vma */ > vma = get_gate_vma(priv->mm); > } > > @@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos) > return NULL; > } > > - if (mmap_read_lock_killable(mm)) { > + if (!lock_vma_range(m, priv)) { > mmput(mm); > put_task_struct(priv->task); > priv->task = NULL; > return ERR_PTR(-EINTR); > } > > + /* > + * Reset current position if last_addr was set before > + * and it's not a sentinel. > + */ > + if (last_addr > 0) > + *ppos = last_addr = priv->last_pos; > vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); > hold_task_mempolicy(priv); > if (last_addr == -2) > return get_gate_vma(mm); > > - return proc_get_vma(priv, ppos); > + return proc_get_vma(m, ppos); > } > > static void *m_next(struct seq_file *m, void *v, loff_t *ppos) > { > if (*ppos == -2) { > - *ppos = -1; > + *ppos = -1; /* -1 indicates no more vmas */ > return NULL; > } > - return proc_get_vma(m->private, ppos); > + return proc_get_vma(m, ppos); > } > > static void m_stop(struct seq_file *m, void *v) > @@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v) > return; > > release_task_mempolicy(priv); > - mmap_read_unlock(mm); > + unlock_vma_range(priv); > mmput(mm); > put_task_struct(priv->task); > priv->task = NULL; > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > index 5da384bd0a26..1f4f44951abe 100644 > --- a/include/linux/mmap_lock.h > +++ b/include/linux/mmap_lock.h > @@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma); > struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > unsigned long address); > > +/* > + * Locks next vma pointed by the iterator. Confirms the locked vma has not > + * been modified and will retry under mmap_lock protection if modification > + * was detected. Should be called from read RCU section. > + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the > + * process was interrupted. 
> + */ > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, > + struct vma_iterator *iter, > + unsigned long address); > + > #else /* CONFIG_PER_VMA_LOCK */ > > static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} > diff --git a/mm/madvise.c b/mm/madvise.c > index a34c2c89a53b..e61e32b2cd91 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref) > > struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) > { > - mmap_assert_locked(vma->vm_mm); > + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) > + vma_assert_locked(vma); > > return vma->anon_name; > } > diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c > index 5f725cc67334..ed0e5e2171cd 100644 > --- a/mm/mmap_lock.c > +++ b/mm/mmap_lock.c > @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > count_vm_vma_lock_event(VMA_LOCK_ABORT); > return NULL; > } > + > +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm, > + struct vma_iterator *iter, > + unsigned long address) > +{ > + struct vm_area_struct *vma; > + int ret; > + > + ret = mmap_read_lock_killable(mm); > + if (ret) > + return ERR_PTR(ret); > + > + /* Lookup the vma at the last position again under mmap_read_lock */ > + vma_iter_init(iter, mm, address); > + vma = vma_next(iter); > + if (vma) > + vma_start_read_locked(vma); > + > + mmap_read_unlock(mm); > + > + return vma; > +} > + > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, > + struct vma_iterator *iter, > + unsigned long address) > +{ > + struct vm_area_struct *vma; > + unsigned int mm_wr_seq; > + bool mmap_unlocked; > + > + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); > +retry: > + /* Start mmap_lock speculation in case we need to verify the vma later */ > + mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); > + vma = vma_next(iter); > + if (!vma) > + return NULL; > + > + vma = vma_start_read(mm, vma); > + > + if (IS_ERR_OR_NULL(vma)) { > + /* > + * Retry immediately if the vma gets detached from under us. > + * Infinite loop should not happen because the vma we find will > + * have to be constantly knocked out from under us. > + */ > + if (PTR_ERR(vma) == -EAGAIN) { > + vma_iter_init(iter, mm, address); > + goto retry; > + } > + > + goto out; > + } > + > + /* > + * Verify the vma we locked belongs to the same address space and it's > + * not behind of the last search position. > + */ > + if (unlikely(vma->vm_mm != mm || address >= vma->vm_end)) > + goto out_unlock; > + > + /* > + * vma can be ahead of the last search position but we need to verify > + * it was not shrunk after we found it and another vma has not been > + * installed ahead of it. Otherwise we might observe a gap that should > + * not be there. > + */ > + if (address < vma->vm_start) { > + /* Verify only if the address space might have changed since vma lookup. */ > + if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { > + vma_iter_init(iter, mm, address); > + if (vma != vma_next(iter)) > + goto out_unlock; > + } > + } > + > + return vma; > + > +out_unlock: > + vma_end_read(vma); > +out: > + rcu_read_unlock(); > + vma = lock_vma_under_mmap_lock(mm, iter, address); > + rcu_read_lock(); > + > + return vma; > +} > #endif /* CONFIG_PER_VMA_LOCK */ > > #ifdef CONFIG_LOCK_MM_AND_FIND_VMA > -- > 2.50.0.727.gbf7dc18ff4-goog >
On Mon, Jul 7, 2025 at 11:21 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > * Suren Baghdasaryan <surenb@google.com> [250704 02:07]: > > With maple_tree supporting vma tree traversal under RCU and per-vma > > locks, /proc/pid/maps can be read while holding individual vma locks > > instead of locking the entire address space. > > Completely lockless approach (walking vma tree under RCU) would be quite > > complex with the main issue being get_vma_name() using callbacks which > > might not work correctly with a stable vma copy, requiring original > > (unstable) vma - see special_mapping_name() for an example. > > When per-vma lock acquisition fails, we take the mmap_lock for reading, > > lock the vma, release the mmap_lock and continue. This fallback to mmap > > read lock guarantees the reader to make forward progress even during > > lock contention. This will interfere with the writer but for a very > > short time while we are acquiring the per-vma lock and only when there > > was contention on the vma reader is interested in. We shouldn't see a > > repeated fallback to mmap read locks in practice, as this require a > > very unlikely series of lock contentions (for instance due to repeated > > vma split operations). However even if this did somehow happen, we would > > still progress. > > One case requiring special handling is when vma changes between the > > time it was found and the time it got locked. A problematic case would > > be if vma got shrunk so that it's start moved higher in the address > > space and a new vma was installed at the beginning: > > > > reader found: |--------VMA A--------| > > VMA is modified: |-VMA B-|----VMA A----| > > reader locks modified VMA A > > reader reports VMA A: | gap |----VMA A----| > > > > This would result in reporting a gap in the address space that does not > > exist. To prevent this we retry the lookup after locking the vma, however > > we do that only when we identify a gap and detect that the address space > > was changed after we found the vma. > > This change is designed to reduce mmap_lock contention and prevent a > > process reading /proc/pid/maps files (often a low priority task, such > > as monitoring/data collection services) from blocking address space > > updates. Note that this change has a userspace visible disadvantage: > > it allows for sub-page data tearing as opposed to the previous mechanism > > where data tearing could happen only between pages of generated output > > data. Since current userspace considers data tearing between pages to be > > acceptable, we assume is will be able to handle sub-page data tearing > > as well. > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com> Thanks! I'll update addressing Lorenzo's nits and will repost in a couple days. Hopefully by then I can get some reviews for the tests in the series. 
> > > --- > > fs/proc/internal.h | 5 ++ > > fs/proc/task_mmu.c | 118 ++++++++++++++++++++++++++++++++++---- > > include/linux/mmap_lock.h | 11 ++++ > > mm/madvise.c | 3 +- > > mm/mmap_lock.c | 88 ++++++++++++++++++++++++++++ > > 5 files changed, 214 insertions(+), 11 deletions(-) > > > > diff --git a/fs/proc/internal.h b/fs/proc/internal.h > > index 3d48ffe72583..7c235451c5ea 100644 > > --- a/fs/proc/internal.h > > +++ b/fs/proc/internal.h > > @@ -384,6 +384,11 @@ struct proc_maps_private { > > struct task_struct *task; > > struct mm_struct *mm; > > struct vma_iterator iter; > > + loff_t last_pos; > > +#ifdef CONFIG_PER_VMA_LOCK > > + bool mmap_locked; > > + struct vm_area_struct *locked_vma; > > +#endif > > #ifdef CONFIG_NUMA > > struct mempolicy *task_mempolicy; > > #endif > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > > index b8bc06d05a72..ff3fe488ce51 100644 > > --- a/fs/proc/task_mmu.c > > +++ b/fs/proc/task_mmu.c > > @@ -127,15 +127,107 @@ static void release_task_mempolicy(struct proc_maps_private *priv) > > } > > #endif > > > > -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, > > - loff_t *ppos) > > +#ifdef CONFIG_PER_VMA_LOCK > > + > > +static void unlock_vma(struct proc_maps_private *priv) > > +{ > > + if (priv->locked_vma) { > > + vma_end_read(priv->locked_vma); > > + priv->locked_vma = NULL; > > + } > > +} > > + > > +static const struct seq_operations proc_pid_maps_op; > > + > > +static inline bool lock_vma_range(struct seq_file *m, > > + struct proc_maps_private *priv) > > +{ > > + /* > > + * smaps and numa_maps perform page table walk, therefore require > > + * mmap_lock but maps can be read with locking just the vma. > > + */ > > + if (m->op != &proc_pid_maps_op) { > > + if (mmap_read_lock_killable(priv->mm)) > > + return false; > > + > > + priv->mmap_locked = true; > > + } else { > > + rcu_read_lock(); > > + priv->locked_vma = NULL; > > + priv->mmap_locked = false; > > + } > > + > > + return true; > > +} > > + > > +static inline void unlock_vma_range(struct proc_maps_private *priv) > > +{ > > + if (priv->mmap_locked) { > > + mmap_read_unlock(priv->mm); > > + } else { > > + unlock_vma(priv); > > + rcu_read_unlock(); > > + } > > +} > > + > > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > > + loff_t last_pos) > > +{ > > + struct vm_area_struct *vma; > > + > > + if (priv->mmap_locked) > > + return vma_next(&priv->iter); > > + > > + unlock_vma(priv); > > + vma = lock_next_vma(priv->mm, &priv->iter, last_pos); > > + if (!IS_ERR_OR_NULL(vma)) > > + priv->locked_vma = vma; > > + > > + return vma; > > +} > > + > > +#else /* CONFIG_PER_VMA_LOCK */ > > + > > +static inline bool lock_vma_range(struct seq_file *m, > > + struct proc_maps_private *priv) > > { > > - struct vm_area_struct *vma = vma_next(&priv->iter); > > + return mmap_read_lock_killable(priv->mm) == 0; > > +} > > + > > +static inline void unlock_vma_range(struct proc_maps_private *priv) > > +{ > > + mmap_read_unlock(priv->mm); > > +} > > + > > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > > + loff_t last_pos) > > +{ > > + return vma_next(&priv->iter); > > +} > > > > +#endif /* CONFIG_PER_VMA_LOCK */ > > + > > +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) > > +{ > > + struct proc_maps_private *priv = m->private; > > + struct vm_area_struct *vma; > > + > > + vma = get_next_vma(priv, *ppos); > > + /* EINTR is possible */ > > + if (IS_ERR(vma)) > > + return vma; > > + > > + /* Store previous 
position to be able to restart if needed */ > > + priv->last_pos = *ppos; > > if (vma) { > > - *ppos = vma->vm_start; > > + /* > > + * Track the end of the reported vma to ensure position changes > > + * even if previous vma was merged with the next vma and we > > + * found the extended vma with the same vm_start. > > + */ > > + *ppos = vma->vm_end; > > } else { > > - *ppos = -2; > > + *ppos = -2; /* -2 indicates gate vma */ > > vma = get_gate_vma(priv->mm); > > } > > > > @@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos) > > return NULL; > > } > > > > - if (mmap_read_lock_killable(mm)) { > > + if (!lock_vma_range(m, priv)) { > > mmput(mm); > > put_task_struct(priv->task); > > priv->task = NULL; > > return ERR_PTR(-EINTR); > > } > > > > + /* > > + * Reset current position if last_addr was set before > > + * and it's not a sentinel. > > + */ > > + if (last_addr > 0) > > + *ppos = last_addr = priv->last_pos; > > vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); > > hold_task_mempolicy(priv); > > if (last_addr == -2) > > return get_gate_vma(mm); > > > > - return proc_get_vma(priv, ppos); > > + return proc_get_vma(m, ppos); > > } > > > > static void *m_next(struct seq_file *m, void *v, loff_t *ppos) > > { > > if (*ppos == -2) { > > - *ppos = -1; > > + *ppos = -1; /* -1 indicates no more vmas */ > > return NULL; > > } > > - return proc_get_vma(m->private, ppos); > > + return proc_get_vma(m, ppos); > > } > > > > static void m_stop(struct seq_file *m, void *v) > > @@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v) > > return; > > > > release_task_mempolicy(priv); > > - mmap_read_unlock(mm); > > + unlock_vma_range(priv); > > mmput(mm); > > put_task_struct(priv->task); > > priv->task = NULL; > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > > index 5da384bd0a26..1f4f44951abe 100644 > > --- a/include/linux/mmap_lock.h > > +++ b/include/linux/mmap_lock.h > > @@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma); > > struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > unsigned long address); > > > > +/* > > + * Locks next vma pointed by the iterator. Confirms the locked vma has not > > + * been modified and will retry under mmap_lock protection if modification > > + * was detected. Should be called from read RCU section. > > + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the > > + * process was interrupted. 
> > + */ > > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, > > + struct vma_iterator *iter, > > + unsigned long address); > > + > > #else /* CONFIG_PER_VMA_LOCK */ > > > > static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} > > diff --git a/mm/madvise.c b/mm/madvise.c > > index a34c2c89a53b..e61e32b2cd91 100644 > > --- a/mm/madvise.c > > +++ b/mm/madvise.c > > @@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref) > > > > struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) > > { > > - mmap_assert_locked(vma->vm_mm); > > + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) > > + vma_assert_locked(vma); > > > > return vma->anon_name; > > } > > diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c > > index 5f725cc67334..ed0e5e2171cd 100644 > > --- a/mm/mmap_lock.c > > +++ b/mm/mmap_lock.c > > @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > count_vm_vma_lock_event(VMA_LOCK_ABORT); > > return NULL; > > } > > + > > +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm, > > + struct vma_iterator *iter, > > + unsigned long address) > > +{ > > + struct vm_area_struct *vma; > > + int ret; > > + > > + ret = mmap_read_lock_killable(mm); > > + if (ret) > > + return ERR_PTR(ret); > > + > > + /* Lookup the vma at the last position again under mmap_read_lock */ > > + vma_iter_init(iter, mm, address); > > + vma = vma_next(iter); > > + if (vma) > > + vma_start_read_locked(vma); > > + > > + mmap_read_unlock(mm); > > + > > + return vma; > > +} > > + > > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, > > + struct vma_iterator *iter, > > + unsigned long address) > > +{ > > + struct vm_area_struct *vma; > > + unsigned int mm_wr_seq; > > + bool mmap_unlocked; > > + > > + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); > > +retry: > > + /* Start mmap_lock speculation in case we need to verify the vma later */ > > + mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); > > + vma = vma_next(iter); > > + if (!vma) > > + return NULL; > > + > > + vma = vma_start_read(mm, vma); > > + > > + if (IS_ERR_OR_NULL(vma)) { > > + /* > > + * Retry immediately if the vma gets detached from under us. > > + * Infinite loop should not happen because the vma we find will > > + * have to be constantly knocked out from under us. > > + */ > > + if (PTR_ERR(vma) == -EAGAIN) { > > + vma_iter_init(iter, mm, address); > > + goto retry; > > + } > > + > > + goto out; > > + } > > + > > + /* > > + * Verify the vma we locked belongs to the same address space and it's > > + * not behind of the last search position. > > + */ > > + if (unlikely(vma->vm_mm != mm || address >= vma->vm_end)) > > + goto out_unlock; > > + > > + /* > > + * vma can be ahead of the last search position but we need to verify > > + * it was not shrunk after we found it and another vma has not been > > + * installed ahead of it. Otherwise we might observe a gap that should > > + * not be there. > > + */ > > + if (address < vma->vm_start) { > > + /* Verify only if the address space might have changed since vma lookup. 
*/ > > + if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { > > + vma_iter_init(iter, mm, address); > > + if (vma != vma_next(iter)) > > + goto out_unlock; > > + } > > + } > > + > > + return vma; > > + > > +out_unlock: > > + vma_end_read(vma); > > +out: > > + rcu_read_unlock(); > > + vma = lock_vma_under_mmap_lock(mm, iter, address); > > + rcu_read_lock(); > > + > > + return vma; > > +} > > #endif /* CONFIG_PER_VMA_LOCK */ > > > > #ifdef CONFIG_LOCK_MM_AND_FIND_VMA > > -- > > 2.50.0.727.gbf7dc18ff4-goog > >
Sorry I know it's annoying, but some petty commit msg nits: On Thu, Jul 03, 2025 at 11:07:25PM -0700, Suren Baghdasaryan wrote: > With maple_tree supporting vma tree traversal under RCU and per-vma > locks, /proc/pid/maps can be read while holding individual vma locks > instead of locking the entire address space. > Completely lockless approach (walking vma tree under RCU) would be quite Completely lockless approach -> A completely lockless approach > complex with the main issue being get_vma_name() using callbacks which > might not work correctly with a stable vma copy, requiring original > (unstable) vma - see special_mapping_name() for an example. NIT: for an example -> for example > When per-vma lock acquisition fails, we take the mmap_lock for reading, > lock the vma, release the mmap_lock and continue. This fallback to mmap > read lock guarantees the reader to make forward progress even during > lock contention. This will interfere with the writer but for a very > short time while we are acquiring the per-vma lock and only when there > was contention on the vma reader is interested in. We shouldn't see a Can we separate out into a new paragraph? > repeated fallback to mmap read locks in practice, as this require a > very unlikely series of lock contentions (for instance due to repeated > vma split operations). However even if this did somehow happen, we would > still progress. > One case requiring special handling is when vma changes between the when vma changes -> when a vma chnages > time it was found and the time it got locked. A problematic case would > be if vma got shrunk so that it's start moved higher in the address vma -> a vma it's start moved higher -> its vm_start moved higher > space and a new vma was installed at the beginning: > > reader found: |--------VMA A--------| > VMA is modified: |-VMA B-|----VMA A----| > reader locks modified VMA A > reader reports VMA A: | gap |----VMA A----| > > This would result in reporting a gap in the address space that does not > exist. To prevent this we retry the lookup after locking the vma, however > we do that only when we identify a gap and detect that the address space > was changed after we found the vma. Can we separate out into a new paragraph? > This change is designed to reduce mmap_lock contention and prevent a > process reading /proc/pid/maps files (often a low priority task, such > as monitoring/data collection services) from blocking address space > updates. Note that this change has a userspace visible disadvantage: > it allows for sub-page data tearing as opposed to the previous mechanism > where data tearing could happen only between pages of generated output > data. Since current userspace considers data tearing between pages to be > acceptable, we assume is will be able to handle sub-page data tearing > as well. 
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com> OK this is looking pretty great now, I make a bunch of points below, but I don't think anything is holding this up from being OK, so with those addressed: Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > fs/proc/internal.h | 5 ++ > fs/proc/task_mmu.c | 118 ++++++++++++++++++++++++++++++++++---- > include/linux/mmap_lock.h | 11 ++++ > mm/madvise.c | 3 +- > mm/mmap_lock.c | 88 ++++++++++++++++++++++++++++ > 5 files changed, 214 insertions(+), 11 deletions(-) > > diff --git a/fs/proc/internal.h b/fs/proc/internal.h > index 3d48ffe72583..7c235451c5ea 100644 > --- a/fs/proc/internal.h > +++ b/fs/proc/internal.h > @@ -384,6 +384,11 @@ struct proc_maps_private { > struct task_struct *task; > struct mm_struct *mm; > struct vma_iterator iter; > + loff_t last_pos; > +#ifdef CONFIG_PER_VMA_LOCK > + bool mmap_locked; > + struct vm_area_struct *locked_vma; > +#endif > #ifdef CONFIG_NUMA > struct mempolicy *task_mempolicy; > #endif > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > index b8bc06d05a72..ff3fe488ce51 100644 > --- a/fs/proc/task_mmu.c > +++ b/fs/proc/task_mmu.c > @@ -127,15 +127,107 @@ static void release_task_mempolicy(struct proc_maps_private *priv) > } > #endif > > -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, > - loff_t *ppos) > +#ifdef CONFIG_PER_VMA_LOCK > + > +static void unlock_vma(struct proc_maps_private *priv) > +{ > + if (priv->locked_vma) { > + vma_end_read(priv->locked_vma); > + priv->locked_vma = NULL; > + } > +} > + > +static const struct seq_operations proc_pid_maps_op; > + > +static inline bool lock_vma_range(struct seq_file *m, > + struct proc_maps_private *priv) OK this is a nice abstraction. > +{ > + /* > + * smaps and numa_maps perform page table walk, therefore require > + * mmap_lock but maps can be read with locking just the vma. Probably worth mentioning that you hold the RCU read lock for the operation also. > + */ > + if (m->op != &proc_pid_maps_op) { > + if (mmap_read_lock_killable(priv->mm)) > + return false; > + > + priv->mmap_locked = true; > + } else { > + rcu_read_lock(); > + priv->locked_vma = NULL; > + priv->mmap_locked = false; > + } > + > + return true; > +} > + > +static inline void unlock_vma_range(struct proc_maps_private *priv) I guess the 'range' is either - the whole thing in case of mmap read locked, or single VMA in case of per-VMA locks. 
> +{ > + if (priv->mmap_locked) { > + mmap_read_unlock(priv->mm); > + } else { > + unlock_vma(priv); > + rcu_read_unlock(); > + } > +} > + > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > + loff_t last_pos) > +{ > + struct vm_area_struct *vma; > + > + if (priv->mmap_locked) > + return vma_next(&priv->iter); > + > + unlock_vma(priv); > + vma = lock_next_vma(priv->mm, &priv->iter, last_pos); > + if (!IS_ERR_OR_NULL(vma)) > + priv->locked_vma = vma; > + > + return vma; > +} > + > +#else /* CONFIG_PER_VMA_LOCK */ > + > +static inline bool lock_vma_range(struct seq_file *m, > + struct proc_maps_private *priv) > { > - struct vm_area_struct *vma = vma_next(&priv->iter); > + return mmap_read_lock_killable(priv->mm) == 0; > +} > + > +static inline void unlock_vma_range(struct proc_maps_private *priv) > +{ > + mmap_read_unlock(priv->mm); > +} > + > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > + loff_t last_pos) > +{ > + return vma_next(&priv->iter); > +} > > +#endif /* CONFIG_PER_VMA_LOCK */ > + > +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) > +{ > + struct proc_maps_private *priv = m->private; > + struct vm_area_struct *vma; > + > + vma = get_next_vma(priv, *ppos); > + /* EINTR is possible */ > + if (IS_ERR(vma)) > + return vma; > + > + /* Store previous position to be able to restart if needed */ > + priv->last_pos = *ppos; > if (vma) { > - *ppos = vma->vm_start; > + /* > + * Track the end of the reported vma to ensure position changes > + * even if previous vma was merged with the next vma and we > + * found the extended vma with the same vm_start. > + */ > + *ppos = vma->vm_end; > } else { > - *ppos = -2; > + *ppos = -2; /* -2 indicates gate vma */ > vma = get_gate_vma(priv->mm); > } > > @@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos) > return NULL; > } > > - if (mmap_read_lock_killable(mm)) { > + if (!lock_vma_range(m, priv)) { > mmput(mm); > put_task_struct(priv->task); > priv->task = NULL; > return ERR_PTR(-EINTR); > } > > + /* > + * Reset current position if last_addr was set before > + * and it's not a sentinel. > + */ > + if (last_addr > 0) > + *ppos = last_addr = priv->last_pos; > vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); > hold_task_mempolicy(priv); > if (last_addr == -2) > return get_gate_vma(mm); > > - return proc_get_vma(priv, ppos); > + return proc_get_vma(m, ppos); > } > > static void *m_next(struct seq_file *m, void *v, loff_t *ppos) > { > if (*ppos == -2) { > - *ppos = -1; > + *ppos = -1; /* -1 indicates no more vmas */ > return NULL; > } > - return proc_get_vma(m->private, ppos); > + return proc_get_vma(m, ppos); > } > > static void m_stop(struct seq_file *m, void *v) > @@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v) > return; > > release_task_mempolicy(priv); > - mmap_read_unlock(mm); > + unlock_vma_range(priv); > mmput(mm); > put_task_struct(priv->task); > priv->task = NULL; > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > index 5da384bd0a26..1f4f44951abe 100644 > --- a/include/linux/mmap_lock.h > +++ b/include/linux/mmap_lock.h > @@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma); > struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > unsigned long address); > > +/* > + * Locks next vma pointed by the iterator. Confirms the locked vma has not > + * been modified and will retry under mmap_lock protection if modification > + * was detected. 
Should be called from read RCU section. > + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the > + * process was interrupted. > + */ > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, > + struct vma_iterator *iter, > + unsigned long address); > + > #else /* CONFIG_PER_VMA_LOCK */ > > static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} > diff --git a/mm/madvise.c b/mm/madvise.c > index a34c2c89a53b..e61e32b2cd91 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref) > > struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) > { > - mmap_assert_locked(vma->vm_mm); > + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) > + vma_assert_locked(vma); This looks familiar ;) > > return vma->anon_name; > } > diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c > index 5f725cc67334..ed0e5e2171cd 100644 > --- a/mm/mmap_lock.c > +++ b/mm/mmap_lock.c > @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > count_vm_vma_lock_event(VMA_LOCK_ABORT); > return NULL; > } > + > +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm, > + struct vma_iterator *iter, Nit, but we tend to call this vmi (yes Liam and I are addicted to 3 letter abbreviations, we are evil beings) > + unsigned long address) I swear we already had a helper for this? Maybe misremembering > +{ > + struct vm_area_struct *vma; > + int ret; > + > + ret = mmap_read_lock_killable(mm); > + if (ret) > + return ERR_PTR(ret); > + > + /* Lookup the vma at the last position again under mmap_read_lock */ > + vma_iter_init(iter, mm, address); > + vma = vma_next(iter); Maybe worth calling this lock_next_under_mmap_lock() as we are grabbing the next VMA here?? > + if (vma) > + vma_start_read_locked(vma); > + > + mmap_read_unlock(mm); > + > + return vma; > +} > + > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, > + struct vma_iterator *iter, > + unsigned long address) Slightly confusing this, I think last_pos would be better? Or last_address? Otherwise it's not clear it's the address of the next VMA or the end of the previous. > +{ > + struct vm_area_struct *vma; > + unsigned int mm_wr_seq; > + bool mmap_unlocked; > + > + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); > +retry: > + /* Start mmap_lock speculation in case we need to verify the vma later */ > + mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); > + vma = vma_next(iter); > + if (!vma) > + return NULL; > + > + vma = vma_start_read(mm, vma); > + Nit, but myabe erase this newline. > + if (IS_ERR_OR_NULL(vma)) { > + /* > + * Retry immediately if the vma gets detached from under us. > + * Infinite loop should not happen because the vma we find will > + * have to be constantly knocked out from under us. > + */ > + if (PTR_ERR(vma) == -EAGAIN) { Maybe worth a comment here stating that we intentionally retry getting the next VMA, and therefore must reset to the last visited adress each time. > + vma_iter_init(iter, mm, address); Maybe Liam can confirm this is the best approach? Seems correct though. > + goto retry; > + } > + > + goto out; > + } > + > + /* > + * Verify the vma we locked belongs to the same address space and it's > + * not behind of the last search position. 
> + */ > + if (unlikely(vma->vm_mm != mm || address >= vma->vm_end)) > + goto out_unlock; > + > + /* > + * vma can be ahead of the last search position but we need to verify > + * it was not shrunk after we found it and another vma has not been > + * installed ahead of it. Otherwise we might observe a gap that should > + * not be there. > + */ > + if (address < vma->vm_start) { > + /* Verify only if the address space might have changed since vma lookup. */ > + if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { > + vma_iter_init(iter, mm, address); > + if (vma != vma_next(iter)) > + goto out_unlock; > + } > + } > + > + return vma; > + > +out_unlock: > + vma_end_read(vma); > +out: Maybe these labels should reflect the fact this is a fallback case? Like fallback_unlock + fallback? > + rcu_read_unlock(); > + vma = lock_vma_under_mmap_lock(mm, iter, address); > + rcu_read_lock(); OK I guess we hold the RCU lock _the whole time_ as we traverse except when we lock under mmap lock. > + > + return vma; > +} > #endif /* CONFIG_PER_VMA_LOCK */ > > #ifdef CONFIG_LOCK_MM_AND_FIND_VMA > -- > 2.50.0.727.gbf7dc18ff4-goog >
On Mon, Jul 7, 2025 at 9:52 AM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > Sorry I know it's annoying, but some petty commit msg nits: > > On Thu, Jul 03, 2025 at 11:07:25PM -0700, Suren Baghdasaryan wrote: > > With maple_tree supporting vma tree traversal under RCU and per-vma > > locks, /proc/pid/maps can be read while holding individual vma locks > > instead of locking the entire address space. > > Completely lockless approach (walking vma tree under RCU) would be quite > > Completely lockless approach -> A completely lockless approach > > > complex with the main issue being get_vma_name() using callbacks which > > might not work correctly with a stable vma copy, requiring original > > (unstable) vma - see special_mapping_name() for an example. > > NIT: for an example -> for example Ack. > > > When per-vma lock acquisition fails, we take the mmap_lock for reading, > > lock the vma, release the mmap_lock and continue. This fallback to mmap > > read lock guarantees the reader to make forward progress even during > > lock contention. This will interfere with the writer but for a very > > short time while we are acquiring the per-vma lock and only when there > > was contention on the vma reader is interested in. We shouldn't see a > > Can we separate out into a new paragraph? Will do. > > > repeated fallback to mmap read locks in practice, as this require a > > very unlikely series of lock contentions (for instance due to repeated > > vma split operations). However even if this did somehow happen, we would > > still progress. > > One case requiring special handling is when vma changes between the > > when vma changes -> when a vma chnages Ack. > > > time it was found and the time it got locked. A problematic case would > > be if vma got shrunk so that it's start moved higher in the address > > vma -> a vma Ack. > > it's start moved higher -> its vm_start moved higher Ack. > > > space and a new vma was installed at the beginning: > > > > reader found: |--------VMA A--------| > > VMA is modified: |-VMA B-|----VMA A----| > > reader locks modified VMA A > > reader reports VMA A: | gap |----VMA A----| > > > > This would result in reporting a gap in the address space that does not > > exist. To prevent this we retry the lookup after locking the vma, however > > we do that only when we identify a gap and detect that the address space > > was changed after we found the vma. > > Can we separate out into a new paragraph? Ack. > > > This change is designed to reduce mmap_lock contention and prevent a > > process reading /proc/pid/maps files (often a low priority task, such > > as monitoring/data collection services) from blocking address space > > updates. Note that this change has a userspace visible disadvantage: > > it allows for sub-page data tearing as opposed to the previous mechanism > > where data tearing could happen only between pages of generated output > > data. Since current userspace considers data tearing between pages to be > > acceptable, we assume is will be able to handle sub-page data tearing > > as well. > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > OK this is looking pretty great now, I make a bunch of points below, but I > don't think anything is holding this up from being OK, so with those > addressed: > > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Thanks! 
> > > --- > > fs/proc/internal.h | 5 ++ > > fs/proc/task_mmu.c | 118 ++++++++++++++++++++++++++++++++++---- > > include/linux/mmap_lock.h | 11 ++++ > > mm/madvise.c | 3 +- > > mm/mmap_lock.c | 88 ++++++++++++++++++++++++++++ > > 5 files changed, 214 insertions(+), 11 deletions(-) > > > > diff --git a/fs/proc/internal.h b/fs/proc/internal.h > > index 3d48ffe72583..7c235451c5ea 100644 > > --- a/fs/proc/internal.h > > +++ b/fs/proc/internal.h > > @@ -384,6 +384,11 @@ struct proc_maps_private { > > struct task_struct *task; > > struct mm_struct *mm; > > struct vma_iterator iter; > > + loff_t last_pos; > > +#ifdef CONFIG_PER_VMA_LOCK > > + bool mmap_locked; > > + struct vm_area_struct *locked_vma; > > +#endif > > #ifdef CONFIG_NUMA > > struct mempolicy *task_mempolicy; > > #endif > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > > index b8bc06d05a72..ff3fe488ce51 100644 > > --- a/fs/proc/task_mmu.c > > +++ b/fs/proc/task_mmu.c > > @@ -127,15 +127,107 @@ static void release_task_mempolicy(struct proc_maps_private *priv) > > } > > #endif > > > > -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, > > - loff_t *ppos) > > +#ifdef CONFIG_PER_VMA_LOCK > > + > > +static void unlock_vma(struct proc_maps_private *priv) > > +{ > > + if (priv->locked_vma) { > > + vma_end_read(priv->locked_vma); > > + priv->locked_vma = NULL; > > + } > > +} > > + > > +static const struct seq_operations proc_pid_maps_op; > > + > > +static inline bool lock_vma_range(struct seq_file *m, > > + struct proc_maps_private *priv) > > OK this is a nice abstraction. > > > +{ > > + /* > > + * smaps and numa_maps perform page table walk, therefore require > > + * mmap_lock but maps can be read with locking just the vma. > > Probably worth mentioning that you hold the RCU read lock for the operation > also. Ack. > > > + */ > > + if (m->op != &proc_pid_maps_op) { > > + if (mmap_read_lock_killable(priv->mm)) > > + return false; > > + > > + priv->mmap_locked = true; > > + } else { > > + rcu_read_lock(); > > + priv->locked_vma = NULL; > > + priv->mmap_locked = false; > > + } > > + > > + return true; > > +} > > + > > +static inline void unlock_vma_range(struct proc_maps_private *priv) > > I guess the 'range' is either - the whole thing in case of mmap read > locked, or single VMA in case of per-VMA locks. Correct. 
> > > +{ > > + if (priv->mmap_locked) { > > + mmap_read_unlock(priv->mm); > > + } else { > > + unlock_vma(priv); > > + rcu_read_unlock(); > > + } > > +} > > + > > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > > + loff_t last_pos) > > +{ > > + struct vm_area_struct *vma; > > + > > + if (priv->mmap_locked) > > + return vma_next(&priv->iter); > > + > > + unlock_vma(priv); > > + vma = lock_next_vma(priv->mm, &priv->iter, last_pos); > > + if (!IS_ERR_OR_NULL(vma)) > > + priv->locked_vma = vma; > > + > > + return vma; > > +} > > + > > +#else /* CONFIG_PER_VMA_LOCK */ > > + > > +static inline bool lock_vma_range(struct seq_file *m, > > + struct proc_maps_private *priv) > > { > > - struct vm_area_struct *vma = vma_next(&priv->iter); > > + return mmap_read_lock_killable(priv->mm) == 0; > > +} > > + > > +static inline void unlock_vma_range(struct proc_maps_private *priv) > > +{ > > + mmap_read_unlock(priv->mm); > > +} > > + > > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > > + loff_t last_pos) > > +{ > > + return vma_next(&priv->iter); > > +} > > > > +#endif /* CONFIG_PER_VMA_LOCK */ > > + > > +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos) > > +{ > > + struct proc_maps_private *priv = m->private; > > + struct vm_area_struct *vma; > > + > > + vma = get_next_vma(priv, *ppos); > > + /* EINTR is possible */ > > + if (IS_ERR(vma)) > > + return vma; > > + > > + /* Store previous position to be able to restart if needed */ > > + priv->last_pos = *ppos; > > if (vma) { > > - *ppos = vma->vm_start; > > + /* > > + * Track the end of the reported vma to ensure position changes > > + * even if previous vma was merged with the next vma and we > > + * found the extended vma with the same vm_start. > > + */ > > + *ppos = vma->vm_end; > > } else { > > - *ppos = -2; > > + *ppos = -2; /* -2 indicates gate vma */ > > vma = get_gate_vma(priv->mm); > > } > > > > @@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos) > > return NULL; > > } > > > > - if (mmap_read_lock_killable(mm)) { > > + if (!lock_vma_range(m, priv)) { > > mmput(mm); > > put_task_struct(priv->task); > > priv->task = NULL; > > return ERR_PTR(-EINTR); > > } > > > > + /* > > + * Reset current position if last_addr was set before > > + * and it's not a sentinel. 
> > + */ > > + if (last_addr > 0) > > + *ppos = last_addr = priv->last_pos; > > vma_iter_init(&priv->iter, mm, (unsigned long)last_addr); > > hold_task_mempolicy(priv); > > if (last_addr == -2) > > return get_gate_vma(mm); > > > > - return proc_get_vma(priv, ppos); > > + return proc_get_vma(m, ppos); > > } > > > > static void *m_next(struct seq_file *m, void *v, loff_t *ppos) > > { > > if (*ppos == -2) { > > - *ppos = -1; > > + *ppos = -1; /* -1 indicates no more vmas */ > > return NULL; > > } > > - return proc_get_vma(m->private, ppos); > > + return proc_get_vma(m, ppos); > > } > > > > static void m_stop(struct seq_file *m, void *v) > > @@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v) > > return; > > > > release_task_mempolicy(priv); > > - mmap_read_unlock(mm); > > + unlock_vma_range(priv); > > mmput(mm); > > put_task_struct(priv->task); > > priv->task = NULL; > > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h > > index 5da384bd0a26..1f4f44951abe 100644 > > --- a/include/linux/mmap_lock.h > > +++ b/include/linux/mmap_lock.h > > @@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma); > > struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > unsigned long address); > > > > +/* > > + * Locks next vma pointed by the iterator. Confirms the locked vma has not > > + * been modified and will retry under mmap_lock protection if modification > > + * was detected. Should be called from read RCU section. > > + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the > > + * process was interrupted. > > + */ > > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, > > + struct vma_iterator *iter, > > + unsigned long address); > > + > > #else /* CONFIG_PER_VMA_LOCK */ > > > > static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} > > diff --git a/mm/madvise.c b/mm/madvise.c > > index a34c2c89a53b..e61e32b2cd91 100644 > > --- a/mm/madvise.c > > +++ b/mm/madvise.c > > @@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref) > > > > struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) > > { > > - mmap_assert_locked(vma->vm_mm); > > + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) > > + vma_assert_locked(vma); > > This looks familiar ;) Yep, that's your fix which I folded in. > > > > > return vma->anon_name; > > } > > diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c > > index 5f725cc67334..ed0e5e2171cd 100644 > > --- a/mm/mmap_lock.c > > +++ b/mm/mmap_lock.c > > @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > count_vm_vma_lock_event(VMA_LOCK_ABORT); > > return NULL; > > } > > + > > +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm, > > + struct vma_iterator *iter, > > Nit, but we tend to call this vmi (yes Liam and I are addicted to 3 letter > abbreviations, we are evil beings) Ok, I'll rename it. > > > + unsigned long address) > > I swear we already had a helper for this? Maybe misremembering I think you might be confusing it with lock_vma_under_rcu() > > > +{ > > + struct vm_area_struct *vma; > > + int ret; > > + > > + ret = mmap_read_lock_killable(mm); > > + if (ret) > > + return ERR_PTR(ret); > > + > > + /* Lookup the vma at the last position again under mmap_read_lock */ > > + vma_iter_init(iter, mm, address); > > + vma = vma_next(iter); > > Maybe worth calling this lock_next_under_mmap_lock() as we are grabbing the > next VMA here?? Sure. lock_next_vma_under_mmap_lock() ? 
> > > + if (vma) > > + vma_start_read_locked(vma); > > + > > + mmap_read_unlock(mm); > > + > > + return vma; > > +} > > + > > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm, > > + struct vma_iterator *iter, > > + unsigned long address) > > Slightly confusing this, I think last_pos would be better? Or last_address? > > Otherwise it's not clear it's the address of the next VMA or the end of the > previous. Ok, last_address it is then. > > > +{ > > + struct vm_area_struct *vma; > > + unsigned int mm_wr_seq; > > + bool mmap_unlocked; > > + > > + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held"); > > +retry: > > + /* Start mmap_lock speculation in case we need to verify the vma later */ > > + mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq); > > + vma = vma_next(iter); > > + if (!vma) > > + return NULL; > > + > > + vma = vma_start_read(mm, vma); > > + > > Nit, but myabe erase this newline. Ack. > > > + if (IS_ERR_OR_NULL(vma)) { > > + /* > > + * Retry immediately if the vma gets detached from under us. > > + * Infinite loop should not happen because the vma we find will > > + * have to be constantly knocked out from under us. > > + */ > > + if (PTR_ERR(vma) == -EAGAIN) { > > Maybe worth a comment here stating that we intentionally retry getting the > next VMA, and therefore must reset to the last visited adress each time. Ack. > > > + vma_iter_init(iter, mm, address); > > Maybe Liam can confirm this is the best approach? Seems correct though. Liam's Reviewed-by confirms correctness now :) > > > + goto retry; > > + } > > + > > + goto out; > > + } > > + > > + /* > > + * Verify the vma we locked belongs to the same address space and it's > > + * not behind of the last search position. > > + */ > > + if (unlikely(vma->vm_mm != mm || address >= vma->vm_end)) > > + goto out_unlock; > > + > > + /* > > + * vma can be ahead of the last search position but we need to verify > > + * it was not shrunk after we found it and another vma has not been > > + * installed ahead of it. Otherwise we might observe a gap that should > > + * not be there. > > + */ > > + if (address < vma->vm_start) { > > + /* Verify only if the address space might have changed since vma lookup. */ > > + if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) { > > + vma_iter_init(iter, mm, address); > > + if (vma != vma_next(iter)) > > + goto out_unlock; > > + } > > + } > > + > > + return vma; > > + > > +out_unlock: > > + vma_end_read(vma); > > +out: > > Maybe these labels should reflect the fact this is a fallback case? > > Like fallback_unlock + fallback? Ack. > > > + rcu_read_unlock(); > > + vma = lock_vma_under_mmap_lock(mm, iter, address); > > + rcu_read_lock(); > > OK I guess we hold the RCU lock _the whole time_ as we traverse except when > we lock under mmap lock. Correct. > > > + > > + return vma; > > +} > > #endif /* CONFIG_PER_VMA_LOCK */ > > > > #ifdef CONFIG_LOCK_MM_AND_FIND_VMA > > -- > > 2.50.0.727.gbf7dc18ff4-goog > >
On 7/8/25 01:10, Suren Baghdasaryan wrote: >>> + rcu_read_unlock(); >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); >>> + rcu_read_lock(); >> OK I guess we hold the RCU lock the whole time as we traverse except when >> we lock under mmap lock. > Correct. I wonder if it's really necessary? Can't it be done just inside lock_next_vma()? It would also avoid the unlock/lock dance quoted above. Even if we later manage to extend this approach to smaps and employ rcu locking to traverse the page tables, I'd think it's best to separate and fine-grain the rcu lock usage for vma iterator and page tables, if only to avoid spending too long under the lock.
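
For concreteness, a minimal kernel-style sketch (not part of the posted series) of the narrower scoping Vlastimil is asking about: the RCU read section is entered only inside the helper, and the iterator is re-anchored first since, as discussed below, the maple tree node it caches may be freed once the previous RCU section ends. The function name is hypothetical, and the validation and mmap_lock fallback of the quoted lock_next_vma() are omitted.

/* Sketch: hold RCU only around the lookup + per-vma lock step. */
static struct vm_area_struct *lock_next_vma_narrow_rcu(struct mm_struct *mm,
							struct vma_iterator *vmi,
							unsigned long last_address)
{
	struct vm_area_struct *vma;

	rcu_read_lock();
	/* The cached maple node may be stale outside RCU; re-walk from last_address. */
	vma_iter_set(vmi, last_address);
	vma = vma_next(vmi);
	if (vma)
		vma = vma_start_read(mm, vma);	/* the per-vma lock pins it past RCU */
	rcu_read_unlock();

	return vma;
}
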
On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 7/8/25 01:10, Suren Baghdasaryan wrote: > >>> + rcu_read_unlock(); > >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); > >>> + rcu_read_lock(); > >> OK I guess we hold the RCU lock the whole time as we traverse except when > >> we lock under mmap lock. > > Correct. > > I wonder if it's really necessary? Can't it be done just inside > lock_next_vma()? It would also avoid the unlock/lock dance quoted above. > > Even if we later manage to extend this approach to smaps and employ rcu > locking to traverse the page tables, I'd think it's best to separate and > fine-grain the rcu lock usage for vma iterator and page tables, if only to > avoid too long time under the lock. I thought we would need to be in the same rcu read section while traversing the maple tree using vma_next() but now looking at it, maybe we can indeed enter only while finding and locking the next vma... Liam, would that work? I see struct ma_state containing a node field. Can it be freed from under us if we find a vma, exit rcu read section then re-enter rcu and use the same iterator to find the next vma?
On 7/9/25 16:43, Suren Baghdasaryan wrote: > On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: >> >> On 7/8/25 01:10, Suren Baghdasaryan wrote: >> >>> + rcu_read_unlock(); >> >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); >> >>> + rcu_read_lock(); >> >> OK I guess we hold the RCU lock the whole time as we traverse except when >> >> we lock under mmap lock. >> > Correct. >> >> I wonder if it's really necessary? Can't it be done just inside >> lock_next_vma()? It would also avoid the unlock/lock dance quoted above. >> >> Even if we later manage to extend this approach to smaps and employ rcu >> locking to traverse the page tables, I'd think it's best to separate and >> fine-grain the rcu lock usage for vma iterator and page tables, if only to >> avoid too long time under the lock. > > I thought we would need to be in the same rcu read section while > traversing the maple tree using vma_next() but now looking at it, > maybe we can indeed enter only while finding and locking the next > vma... > Liam, would that work? I see struct ma_state containing a node field. > Can it be freed from under us if we find a vma, exit rcu read section > then re-enter rcu and use the same iterator to find the next vma? If the rcu protection needs to be contigous, and patch 8 avoids the issue by always doing vma_iter_init() after rcu_read_lock() (but does it really avoid the issue or is it why we see the syzbot reports?) then I guess in the code quoted above we also need a vma_iter_init() after the rcu_read_lock(), because although the iterator was used briefly under mmap_lock protection, that was then unlocked and there can be a race before the rcu_read_lock().
On Wed, Jul 9, 2025 at 3:03 PM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 7/9/25 16:43, Suren Baghdasaryan wrote: > > On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: > >> > >> On 7/8/25 01:10, Suren Baghdasaryan wrote: > >> >>> + rcu_read_unlock(); > >> >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); > >> >>> + rcu_read_lock(); > >> >> OK I guess we hold the RCU lock the whole time as we traverse except when > >> >> we lock under mmap lock. > >> > Correct. > >> > >> I wonder if it's really necessary? Can't it be done just inside > >> lock_next_vma()? It would also avoid the unlock/lock dance quoted above. > >> > >> Even if we later manage to extend this approach to smaps and employ rcu > >> locking to traverse the page tables, I'd think it's best to separate and > >> fine-grain the rcu lock usage for vma iterator and page tables, if only to > >> avoid too long time under the lock. > > > > I thought we would need to be in the same rcu read section while > > traversing the maple tree using vma_next() but now looking at it, > > maybe we can indeed enter only while finding and locking the next > > vma... > > Liam, would that work? I see struct ma_state containing a node field. > > Can it be freed from under us if we find a vma, exit rcu read section > > then re-enter rcu and use the same iterator to find the next vma? > > If the rcu protection needs to be contigous, and patch 8 avoids the issue by > always doing vma_iter_init() after rcu_read_lock() (but does it really avoid > the issue or is it why we see the syzbot reports?) then I guess in the code > quoted above we also need a vma_iter_init() after the rcu_read_lock(), > because although the iterator was used briefly under mmap_lock protection, > that was then unlocked and there can be a race before the rcu_read_lock(). Quite true. So, let's wait for Liam's confirmation and based on his answer I'll change the patch by either reducing the rcu read section or adding the missing vma_iter_init() after we switch to mmap_lock.
* Suren Baghdasaryan <surenb@google.com> [250709 11:06]: > On Wed, Jul 9, 2025 at 3:03 PM Vlastimil Babka <vbabka@suse.cz> wrote: > > > > On 7/9/25 16:43, Suren Baghdasaryan wrote: > > > On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > >> > > >> On 7/8/25 01:10, Suren Baghdasaryan wrote: > > >> >>> + rcu_read_unlock(); > > >> >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); > > >> >>> + rcu_read_lock(); > > >> >> OK I guess we hold the RCU lock the whole time as we traverse except when > > >> >> we lock under mmap lock. > > >> > Correct. > > >> > > >> I wonder if it's really necessary? Can't it be done just inside > > >> lock_next_vma()? It would also avoid the unlock/lock dance quoted above. > > >> > > >> Even if we later manage to extend this approach to smaps and employ rcu > > >> locking to traverse the page tables, I'd think it's best to separate and > > >> fine-grain the rcu lock usage for vma iterator and page tables, if only to > > >> avoid too long time under the lock. > > > > > > I thought we would need to be in the same rcu read section while > > > traversing the maple tree using vma_next() but now looking at it, > > > maybe we can indeed enter only while finding and locking the next > > > vma... > > > Liam, would that work? I see struct ma_state containing a node field. > > > Can it be freed from under us if we find a vma, exit rcu read section > > > then re-enter rcu and use the same iterator to find the next vma? > > > > If the rcu protection needs to be contigous, and patch 8 avoids the issue by > > always doing vma_iter_init() after rcu_read_lock() (but does it really avoid > > the issue or is it why we see the syzbot reports?) then I guess in the code > > quoted above we also need a vma_iter_init() after the rcu_read_lock(), > > because although the iterator was used briefly under mmap_lock protection, > > that was then unlocked and there can be a race before the rcu_read_lock(). > > Quite true. So, let's wait for Liam's confirmation and based on his > answer I'll change the patch by either reducing the rcu read section > or adding the missing vma_iter_init() after we switch to mmap_lock. You need to either be under rcu or mmap lock to ensure the node in the maple state hasn't been freed (and potentially, reallocated). So in this case, in the higher level, we can hold the rcu read lock for a series of walks and avoid re-walking the tree then the performance would be better. When we return to userspace, then we should drop the rcu read lock and will need to vma_iter_set()/vma_iter_invalidate() on return. I thought this was being done (through vma_iter_init()), but syzbot seems to indicate a path that was missed? This is the same thing that needed to be done previously with the mmap lock, but now under the rcu lock. I'm not sure how to mitigate the issue with the page table, maybe we guess on the number of vmas that we were doing for 4k blocks of output and just drop/reacquire then. Probably a problem for another day anyways. Also, I think you can also change the vma_iter_init() to vma_iter_set(), which is slightly less code under the hood. Vlastimil asked about this and it's probably a better choice. Thanks, Liam
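
Put together, a sketch (not the reposted patch itself) of what Liam's suggestion looks like for the fallback tail of the quoted lock_next_vma(): after dropping and re-taking the RCU read lock around the mmap_lock fallback, the iterator is re-anchored with vma_iter_set() before vma_next() is called on it again. The exact address the real fix re-anchors at is an assumption.

static struct vm_area_struct *lock_next_vma_fallback(struct mm_struct *mm,
						     struct vma_iterator *vmi,
						     unsigned long last_address)
{
	struct vm_area_struct *vma;

	rcu_read_unlock();
	vma = lock_vma_under_mmap_lock(mm, vmi, last_address);
	rcu_read_lock();
	/*
	 * The maple state was last used under mmap_lock, which is gone now;
	 * re-anchor it so the next vma_next() walks from a node that is valid
	 * in the new RCU read section.
	 */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? last_address : vma->vm_end);

	return vma;
}
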
On Wed, Jul 9, 2025 at 4:12 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > * Suren Baghdasaryan <surenb@google.com> [250709 11:06]: > > On Wed, Jul 9, 2025 at 3:03 PM Vlastimil Babka <vbabka@suse.cz> wrote: > > > > > > On 7/9/25 16:43, Suren Baghdasaryan wrote: > > > > On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > > >> > > > >> On 7/8/25 01:10, Suren Baghdasaryan wrote: > > > >> >>> + rcu_read_unlock(); > > > >> >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); > > > >> >>> + rcu_read_lock(); > > > >> >> OK I guess we hold the RCU lock the whole time as we traverse except when > > > >> >> we lock under mmap lock. > > > >> > Correct. > > > >> > > > >> I wonder if it's really necessary? Can't it be done just inside > > > >> lock_next_vma()? It would also avoid the unlock/lock dance quoted above. > > > >> > > > >> Even if we later manage to extend this approach to smaps and employ rcu > > > >> locking to traverse the page tables, I'd think it's best to separate and > > > >> fine-grain the rcu lock usage for vma iterator and page tables, if only to > > > >> avoid too long time under the lock. > > > > > > > > I thought we would need to be in the same rcu read section while > > > > traversing the maple tree using vma_next() but now looking at it, > > > > maybe we can indeed enter only while finding and locking the next > > > > vma... > > > > Liam, would that work? I see struct ma_state containing a node field. > > > > Can it be freed from under us if we find a vma, exit rcu read section > > > > then re-enter rcu and use the same iterator to find the next vma? > > > > > > If the rcu protection needs to be contigous, and patch 8 avoids the issue by > > > always doing vma_iter_init() after rcu_read_lock() (but does it really avoid > > > the issue or is it why we see the syzbot reports?) then I guess in the code > > > quoted above we also need a vma_iter_init() after the rcu_read_lock(), > > > because although the iterator was used briefly under mmap_lock protection, > > > that was then unlocked and there can be a race before the rcu_read_lock(). > > > > Quite true. So, let's wait for Liam's confirmation and based on his > > answer I'll change the patch by either reducing the rcu read section > > or adding the missing vma_iter_init() after we switch to mmap_lock. > > You need to either be under rcu or mmap lock to ensure the node in the > maple state hasn't been freed (and potentially, reallocated). > > So in this case, in the higher level, we can hold the rcu read lock for > a series of walks and avoid re-walking the tree then the performance > would be better. Got it. Thanks for confirming! > > When we return to userspace, then we should drop the rcu read lock and > will need to vma_iter_set()/vma_iter_invalidate() on return. I thought > this was being done (through vma_iter_init()), but syzbot seems to > indicate a path that was missed? We do that in m_start()/m_stop() by calling lock_vma_range()/unlock_vma_range() but I think I have two problems here: 1. As Vlastimil mentioned I do not reset the iterator when falling back to mmap_lock and exiting and then re-entering rcu read section; 2. I do not reset the iterator after exiting rcu read section in m_stop() and re-entering it in m_start(), so the later call to lock_next_vma() might be using an iterator with a node that was freed (and possibly reallocated). > > This is the same thing that needed to be done previously with the mmap > lock, but now under the rcu lock. 
> > I'm not sure how to mitigate the issue with the page table, maybe we > guess on the number of vmas that we were doing for 4k blocks of output > and just drop/reacquire then. Probably a problem for another day > anyways. > > Also, I think you can also change the vma_iter_init() to vma_iter_set(), > which is slightly less code under the hood. Vlastimil asked about this > and it's probably a better choice. Ack. I'll update my series with these fixes and all comments I received so far, will run the reproducers to confirm no issues and repost them later today. Thanks, Suren. > > Thanks, > Liam >
On Wed, Jul 9, 2025 at 10:47 AM Suren Baghdasaryan <surenb@google.com> wrote: > > On Wed, Jul 9, 2025 at 4:12 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > * Suren Baghdasaryan <surenb@google.com> [250709 11:06]: > > > On Wed, Jul 9, 2025 at 3:03 PM Vlastimil Babka <vbabka@suse.cz> wrote: > > > > > > > > On 7/9/25 16:43, Suren Baghdasaryan wrote: > > > > > On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > > > >> > > > > >> On 7/8/25 01:10, Suren Baghdasaryan wrote: > > > > >> >>> + rcu_read_unlock(); > > > > >> >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); > > > > >> >>> + rcu_read_lock(); > > > > >> >> OK I guess we hold the RCU lock the whole time as we traverse except when > > > > >> >> we lock under mmap lock. > > > > >> > Correct. > > > > >> > > > > >> I wonder if it's really necessary? Can't it be done just inside > > > > >> lock_next_vma()? It would also avoid the unlock/lock dance quoted above. > > > > >> > > > > >> Even if we later manage to extend this approach to smaps and employ rcu > > > > >> locking to traverse the page tables, I'd think it's best to separate and > > > > >> fine-grain the rcu lock usage for vma iterator and page tables, if only to > > > > >> avoid too long time under the lock. > > > > > > > > > > I thought we would need to be in the same rcu read section while > > > > > traversing the maple tree using vma_next() but now looking at it, > > > > > maybe we can indeed enter only while finding and locking the next > > > > > vma... > > > > > Liam, would that work? I see struct ma_state containing a node field. > > > > > Can it be freed from under us if we find a vma, exit rcu read section > > > > > then re-enter rcu and use the same iterator to find the next vma? > > > > > > > > If the rcu protection needs to be contigous, and patch 8 avoids the issue by > > > > always doing vma_iter_init() after rcu_read_lock() (but does it really avoid > > > > the issue or is it why we see the syzbot reports?) then I guess in the code > > > > quoted above we also need a vma_iter_init() after the rcu_read_lock(), > > > > because although the iterator was used briefly under mmap_lock protection, > > > > that was then unlocked and there can be a race before the rcu_read_lock(). > > > > > > Quite true. So, let's wait for Liam's confirmation and based on his > > > answer I'll change the patch by either reducing the rcu read section > > > or adding the missing vma_iter_init() after we switch to mmap_lock. > > > > You need to either be under rcu or mmap lock to ensure the node in the > > maple state hasn't been freed (and potentially, reallocated). > > > > So in this case, in the higher level, we can hold the rcu read lock for > > a series of walks and avoid re-walking the tree then the performance > > would be better. > > Got it. Thanks for confirming! > > > > > When we return to userspace, then we should drop the rcu read lock and > > will need to vma_iter_set()/vma_iter_invalidate() on return. I thought > > this was being done (through vma_iter_init()), but syzbot seems to > > indicate a path that was missed? > > We do that in m_start()/m_stop() by calling > lock_vma_range()/unlock_vma_range() but I think I have two problems > here: > 1. As Vlastimil mentioned I do not reset the iterator when falling > back to mmap_lock and exiting and then re-entering rcu read section; > 2. 
I do not reset the iterator after exiting rcu read section in > m_stop() and re-entering it in m_start(), so the later call to > lock_next_vma() might be using an iterator with a node that was freed > (and possibly reallocated). > > > > > This is the same thing that needed to be done previously with the mmap > > lock, but now under the rcu lock. > > > > I'm not sure how to mitigate the issue with the page table, maybe we > > guess on the number of vmas that we were doing for 4k blocks of output > > and just drop/reacquire then. Probably a problem for another day > > anyways. > > > > Also, I think you can also change the vma_iter_init() to vma_iter_set(), > > which is slightly less code under the hood. Vlastimil asked about this > > and it's probably a better choice. > > Ack. > I'll update my series with these fixes and all comments I received so > far, will run the reproducers to confirm no issues and repost them > later today. I have the patchset ready but would like to test it some more. Will post it tomorrow. > Thanks, > Suren. > > > > > Thanks, > > Liam > >
On Thu, Jul 10, 2025 at 12:03 AM Suren Baghdasaryan <surenb@google.com> wrote: > > On Wed, Jul 9, 2025 at 10:47 AM Suren Baghdasaryan <surenb@google.com> wrote: > > > > On Wed, Jul 9, 2025 at 4:12 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > > > * Suren Baghdasaryan <surenb@google.com> [250709 11:06]: > > > > On Wed, Jul 9, 2025 at 3:03 PM Vlastimil Babka <vbabka@suse.cz> wrote: > > > > > > > > > > On 7/9/25 16:43, Suren Baghdasaryan wrote: > > > > > > On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > > > > >> > > > > > >> On 7/8/25 01:10, Suren Baghdasaryan wrote: > > > > > >> >>> + rcu_read_unlock(); > > > > > >> >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); > > > > > >> >>> + rcu_read_lock(); > > > > > >> >> OK I guess we hold the RCU lock the whole time as we traverse except when > > > > > >> >> we lock under mmap lock. > > > > > >> > Correct. > > > > > >> > > > > > >> I wonder if it's really necessary? Can't it be done just inside > > > > > >> lock_next_vma()? It would also avoid the unlock/lock dance quoted above. > > > > > >> > > > > > >> Even if we later manage to extend this approach to smaps and employ rcu > > > > > >> locking to traverse the page tables, I'd think it's best to separate and > > > > > >> fine-grain the rcu lock usage for vma iterator and page tables, if only to > > > > > >> avoid too long time under the lock. > > > > > > > > > > > > I thought we would need to be in the same rcu read section while > > > > > > traversing the maple tree using vma_next() but now looking at it, > > > > > > maybe we can indeed enter only while finding and locking the next > > > > > > vma... > > > > > > Liam, would that work? I see struct ma_state containing a node field. > > > > > > Can it be freed from under us if we find a vma, exit rcu read section > > > > > > then re-enter rcu and use the same iterator to find the next vma? > > > > > > > > > > If the rcu protection needs to be contigous, and patch 8 avoids the issue by > > > > > always doing vma_iter_init() after rcu_read_lock() (but does it really avoid > > > > > the issue or is it why we see the syzbot reports?) then I guess in the code > > > > > quoted above we also need a vma_iter_init() after the rcu_read_lock(), > > > > > because although the iterator was used briefly under mmap_lock protection, > > > > > that was then unlocked and there can be a race before the rcu_read_lock(). > > > > > > > > Quite true. So, let's wait for Liam's confirmation and based on his > > > > answer I'll change the patch by either reducing the rcu read section > > > > or adding the missing vma_iter_init() after we switch to mmap_lock. > > > > > > You need to either be under rcu or mmap lock to ensure the node in the > > > maple state hasn't been freed (and potentially, reallocated). > > > > > > So in this case, in the higher level, we can hold the rcu read lock for > > > a series of walks and avoid re-walking the tree then the performance > > > would be better. > > > > Got it. Thanks for confirming! > > > > > > > > When we return to userspace, then we should drop the rcu read lock and > > > will need to vma_iter_set()/vma_iter_invalidate() on return. I thought > > > this was being done (through vma_iter_init()), but syzbot seems to > > > indicate a path that was missed? > > > > We do that in m_start()/m_stop() by calling > > lock_vma_range()/unlock_vma_range() but I think I have two problems > > here: > > 1. 
As Vlastimil mentioned I do not reset the iterator when falling > > back to mmap_lock and exiting and then re-entering rcu read section; > > 2. I do not reset the iterator after exiting rcu read section in > > m_stop() and re-entering it in m_start(), so the later call to > > lock_next_vma() might be using an iterator with a node that was freed > > (and possibly reallocated). > > > > > > > > This is the same thing that needed to be done previously with the mmap > > > lock, but now under the rcu lock. > > > > > > I'm not sure how to mitigate the issue with the page table, maybe we > > > guess on the number of vmas that we were doing for 4k blocks of output > > > and just drop/reacquire then. Probably a problem for another day > > > anyways. > > > > > > Also, I think you can also change the vma_iter_init() to vma_iter_set(), > > > which is slightly less code under the hood. Vlastimil asked about this > > > and it's probably a better choice. > > > > Ack. > > I'll update my series with these fixes and all comments I received so > > far, will run the reproducers to confirm no issues and repost them > > later today. > > I have the patchset ready but would like to test it some more. Will > post it tomorrow. Ok, I found a couple of issues using the syzbot reproducer [1] (which is awesome BTW!): 1. rwsem_acquire_read() inside vma_start_read() at [2] should be moved after the last check, otherwise the lock is considered taken on vma->vm_refcnt overflow; 2. query_matching_vma() is missing unlock_vma() call when it does "goto next_vma;" and re-issues query_vma_find_by_addr(). The previous vma is left locked; [1] https://syzkaller.appspot.com/x/repro.c?x=101edf70580000 [2] https://elixir.bootlin.com/linux/v6.15.5/source/include/linux/mm.h#L747 After these fixes it's much harder to fail but I still get one more error copied below. I will continue the investigation and will hold off reposting until this is fixed. That will be next week since I'll be out of town the rest of this week. Andrew, could you please remove this patchset from mm-unstable for now until I fix the issue and re-post the new version? The error I got after these fixes is: [ 56.342886] [ 56.342910] ------------[ cut here ]------------ [ 56.342934] WARNING: CPU: 46 PID: 5701 at lib/maple_tree.c:4734 mas_next_slot+0x552/0x840 [ 56.344691] ================================================ [ 56.344695] WARNING: lock held when returning to user space! [ 56.344698] 6.16.0-rc5-00321-g31d640f7b07c-dirty #379 Not tainted [ 56.344702] ------------------------------------------------ [ 56.344704] syzbot_repro1/5700 is leaving the kernel with locks still held! 
[ 56.344715] 1 lock held by syzbot_repro1/5700: [ 56.344720] #0: ffff93a8c2cea788 (vm_lock){++++}-{0:0} [ 56.349286] Modules linked in: [ 56.355569] , at: get_next_vma+0x91/0xe0 [ 56.377452] [ 56.377929] CPU: 46 UID: 0 PID: 5701 Comm: syzbot_repro1 Not tainted 6.16.0-rc5-00321-g31d640f7b07c-dirty #379 PREEMPT(voluntary) [ 56.381592] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 56.384664] RIP: 0010:mas_next_slot+0x552/0x840 [ 56.386097] Code: 43 38 83 e8 02 83 f8 01 77 c4 e9 e5 fa ff ff 48 8b 43 28 e9 83 fb ff ff 49 8b 06 30 c0 49 39 c6 74 be c7 43 38 05 00 00 00 90 <0f> 0b 90 48 c7 04 24 00 001 [ 56.392303] RSP: 0018:ffffa01188217cd8 EFLAGS: 00010206 [ 56.393928] RAX: ffff93a8c2724e00 RBX: ffff93a8c1af2898 RCX: 1ffff27519183e61 [ 56.396300] RDX: ffff93a8c2724e0e RSI: 0000000000000000 RDI: ffff93a8c1af2898 [ 56.398515] RBP: 00007ffd83c2ffff R08: 00000000ffffffff R09: ffff93a8c8c1f308 [ 56.400722] R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffffffffff [ 56.402935] R13: 0000000000000001 R14: ffff93a8c8c1f300 R15: ffff93a8c8c1f308 [ 56.405222] FS: 00007ff71a3946c0(0000) GS:ffff93b83a9af000(0000) knlGS:0000000000000000 [ 56.408236] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 56.409994] CR2: 00007ff71a393bb0 CR3: 0000000102f84000 CR4: 0000000000750ef0 [ 56.412367] PKRU: 55555554 [ 56.413231] Call Trace: [ 56.413955] <TASK> [ 56.414672] mas_find+0x5c/0x1c0 [ 56.415713] lock_next_vma+0x41/0x4d0 [ 56.416869] get_next_vma+0x91/0xe0 [ 56.417954] do_procmap_query+0x249/0xa90 [ 56.419310] ? do_procmap_query+0x1b8/0xa90 [ 56.420591] procfs_procmap_ioctl+0x20/0x40 [ 56.421896] __x64_sys_ioctl+0x8e/0xe0 [ 56.423514] do_syscall_64+0xbb/0x360 [ 56.424715] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 56.426296] RIP: 0033:0x41a7e9 [ 56.427254] Code: c0 79 93 eb d5 48 8d 7c 1d 00 eb 99 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c38 [ 56.432893] RSP: 002b:00007ff71a3941f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 56.435222] RAX: ffffffffffffffda RBX: 00007ff71a394cdc RCX: 000000000041a7e9 [ 56.437475] RDX: 0000200000000180 RSI: 00000000c0686611 RDI: 0000000000000003 [ 56.440084] RBP: 00007ff71a394220 R08: 00007ff71a3946c0 R09: 00007ff71a394210 [ 56.442345] R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffffd0 [ 56.444545] R13: 0000000000000000 R14: 00007ffd83c4fe30 R15: 00007ffd83c4ff18 [ 56.446732] </TASK> [ 56.447436] Kernel panic - not syncing: kernel: panic_on_warn set ... [ 56.449433] CPU: 46 UID: 0 PID: 5701 Comm: syzbot_repro1 Not tainted 6.16.0-rc5-00321-g31d640f7b07c-dirty #379 PREEMPT(voluntary) [ 56.453043] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 56.456564] Call Trace: [ 56.457340] <TASK> [ 56.457969] dump_stack_lvl+0x5d/0x80 [ 56.459188] ? mas_next_slot+0x510/0x840 [ 56.460388] panic+0x11a/0x2ce [ 56.461353] ? mas_next_slot+0x552/0x840 [ 56.462564] check_panic_on_warn.cold+0xf/0x1e [ 56.463960] __warn.cold+0xc3/0x153 [ 56.465043] ? mas_next_slot+0x552/0x840 [ 56.466367] report_bug+0xff/0x140 [ 56.467441] ? 
mas_next_slot+0x552/0x840 [ 56.468993] handle_bug+0x163/0x1e0 [ 56.470236] exc_invalid_op+0x17/0x70 [ 56.471366] asm_exc_invalid_op+0x1a/0x20 [ 56.472629] RIP: 0010:mas_next_slot+0x552/0x840 [ 56.474053] Code: 43 38 83 e8 02 83 f8 01 77 c4 e9 e5 fa ff ff 48 8b 43 28 e9 83 fb ff ff 49 8b 06 30 c0 49 39 c6 74 be c7 43 38 05 00 00 00 90 <0f> 0b 90 48 c7 04 24 00 001 [ 56.479861] RSP: 0018:ffffa01188217cd8 EFLAGS: 00010206 [ 56.481501] RAX: ffff93a8c2724e00 RBX: ffff93a8c1af2898 RCX: 1ffff27519183e61 [ 56.483665] RDX: ffff93a8c2724e0e RSI: 0000000000000000 RDI: ffff93a8c1af2898 [ 56.486323] RBP: 00007ffd83c2ffff R08: 00000000ffffffff R09: ffff93a8c8c1f308 [ 56.488491] R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffffffffff [ 56.490634] R13: 0000000000000001 R14: ffff93a8c8c1f300 R15: ffff93a8c8c1f308 [ 56.492856] mas_find+0x5c/0x1c0 [ 56.493888] lock_next_vma+0x41/0x4d0 [ 56.495022] get_next_vma+0x91/0xe0 [ 56.496204] do_procmap_query+0x249/0xa90 [ 56.497515] ? do_procmap_query+0x1b8/0xa90 [ 56.499232] procfs_procmap_ioctl+0x20/0x40 [ 56.500500] __x64_sys_ioctl+0x8e/0xe0 [ 56.501682] do_syscall_64+0xbb/0x360 [ 56.502845] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 56.504408] RIP: 0033:0x41a7e9 [ 56.505362] Code: c0 79 93 eb d5 48 8d 7c 1d 00 eb 99 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c38 [ 56.511066] RSP: 002b:00007ff71a3941f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 56.513523] RAX: ffffffffffffffda RBX: 00007ff71a394cdc RCX: 000000000041a7e9 [ 56.515977] RDX: 0000200000000180 RSI: 00000000c0686611 RDI: 0000000000000003 [ 56.518440] RBP: 00007ff71a394220 R08: 00007ff71a3946c0 R09: 00007ff71a394210 [ 56.520643] R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffffd0 [ 56.522884] R13: 0000000000000000 R14: 00007ffd83c4fe30 R15: 00007ffd83c4ff18 [ 56.525170] </TASK> [ 56.527859] Kernel Offset: 0x1a00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 56.531106] Rebooting in 86400 seconds.. > > > Thanks, > > Suren. > > > > > > > > Thanks, > > > Liam > > >
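
A heavily hedged sketch of the second fix listed above, against the PROCMAP_QUERY conversion in patch 8 (not quoted in this thread): when a candidate vma fails the query filters and the code loops back to look up the next one, the per-vma lock on the rejected vma has to be dropped first. unlock_vma() and query_vma_find_by_addr() are named in the patch and discussion; query_flags_match(), the signatures, and the surrounding structure are assumptions, and any vma still tracked in priv->locked_vma on return is assumed to be released by the caller via unlock_vma_range().

static struct vm_area_struct *query_matching_vma(struct proc_maps_private *priv,
						 unsigned long addr, u32 flags)
{
	struct vm_area_struct *vma;

next_vma:
	vma = query_vma_find_by_addr(priv, addr);	/* locks the vma it returns */
	if (IS_ERR_OR_NULL(vma))
		return vma;

	if (!query_flags_match(vma, flags)) {		/* hypothetical filter */
		if (!(flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA))
			return ERR_PTR(-ENOENT);
		addr = vma->vm_end;
		unlock_vma(priv);	/* the missing unlock of the rejected vma */
		goto next_vma;
	}

	return vma;
}
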
On 7/10/25 19:02, Suren Baghdasaryan wrote: > On Thu, Jul 10, 2025 at 12:03 AM Suren Baghdasaryan <surenb@google.com> wrote: >> >> On Wed, Jul 9, 2025 at 10:47 AM Suren Baghdasaryan <surenb@google.com> wrote: >> > >> > On Wed, Jul 9, 2025 at 4:12 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: >> > > >> > > * Suren Baghdasaryan <surenb@google.com> [250709 11:06]: >> > > > On Wed, Jul 9, 2025 at 3:03 PM Vlastimil Babka <vbabka@suse.cz> wrote: >> > > > > >> > > > > On 7/9/25 16:43, Suren Baghdasaryan wrote: >> > > > > > On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: >> > > > > >> >> > > > > >> On 7/8/25 01:10, Suren Baghdasaryan wrote: >> > > > > >> >>> + rcu_read_unlock(); >> > > > > >> >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); >> > > > > >> >>> + rcu_read_lock(); >> > > > > >> >> OK I guess we hold the RCU lock the whole time as we traverse except when >> > > > > >> >> we lock under mmap lock. >> > > > > >> > Correct. >> > > > > >> >> > > > > >> I wonder if it's really necessary? Can't it be done just inside >> > > > > >> lock_next_vma()? It would also avoid the unlock/lock dance quoted above. >> > > > > >> >> > > > > >> Even if we later manage to extend this approach to smaps and employ rcu >> > > > > >> locking to traverse the page tables, I'd think it's best to separate and >> > > > > >> fine-grain the rcu lock usage for vma iterator and page tables, if only to >> > > > > >> avoid too long time under the lock. >> > > > > > >> > > > > > I thought we would need to be in the same rcu read section while >> > > > > > traversing the maple tree using vma_next() but now looking at it, >> > > > > > maybe we can indeed enter only while finding and locking the next >> > > > > > vma... >> > > > > > Liam, would that work? I see struct ma_state containing a node field. >> > > > > > Can it be freed from under us if we find a vma, exit rcu read section >> > > > > > then re-enter rcu and use the same iterator to find the next vma? >> > > > > >> > > > > If the rcu protection needs to be contigous, and patch 8 avoids the issue by >> > > > > always doing vma_iter_init() after rcu_read_lock() (but does it really avoid >> > > > > the issue or is it why we see the syzbot reports?) then I guess in the code >> > > > > quoted above we also need a vma_iter_init() after the rcu_read_lock(), >> > > > > because although the iterator was used briefly under mmap_lock protection, >> > > > > that was then unlocked and there can be a race before the rcu_read_lock(). >> > > > >> > > > Quite true. So, let's wait for Liam's confirmation and based on his >> > > > answer I'll change the patch by either reducing the rcu read section >> > > > or adding the missing vma_iter_init() after we switch to mmap_lock. >> > > >> > > You need to either be under rcu or mmap lock to ensure the node in the >> > > maple state hasn't been freed (and potentially, reallocated). >> > > >> > > So in this case, in the higher level, we can hold the rcu read lock for >> > > a series of walks and avoid re-walking the tree then the performance >> > > would be better. >> > >> > Got it. Thanks for confirming! >> > >> > > >> > > When we return to userspace, then we should drop the rcu read lock and >> > > will need to vma_iter_set()/vma_iter_invalidate() on return. I thought >> > > this was being done (through vma_iter_init()), but syzbot seems to >> > > indicate a path that was missed? 
>> > >> > We do that in m_start()/m_stop() by calling >> > lock_vma_range()/unlock_vma_range() but I think I have two problems >> > here: >> > 1. As Vlastimil mentioned I do not reset the iterator when falling >> > back to mmap_lock and exiting and then re-entering rcu read section; >> > 2. I do not reset the iterator after exiting rcu read section in >> > m_stop() and re-entering it in m_start(), so the later call to >> > lock_next_vma() might be using an iterator with a node that was freed >> > (and possibly reallocated). >> > >> > > >> > > This is the same thing that needed to be done previously with the mmap >> > > lock, but now under the rcu lock. >> > > >> > > I'm not sure how to mitigate the issue with the page table, maybe we >> > > guess on the number of vmas that we were doing for 4k blocks of output >> > > and just drop/reacquire then. Probably a problem for another day >> > > anyways. >> > > >> > > Also, I think you can also change the vma_iter_init() to vma_iter_set(), >> > > which is slightly less code under the hood. Vlastimil asked about this >> > > and it's probably a better choice. >> > >> > Ack. >> > I'll update my series with these fixes and all comments I received so >> > far, will run the reproducers to confirm no issues and repost them >> > later today. >> >> I have the patchset ready but would like to test it some more. Will >> post it tomorrow. > > Ok, I found a couple of issues using the syzbot reproducer [1] (which > is awesome BTW!): > 1. rwsem_acquire_read() inside vma_start_read() at [2] should be moved > after the last check, otherwise the lock is considered taken on > vma->vm_refcnt overflow; I think it's fine because if the last check fails there's a vma_refcount_put() that includes rwsem_release(), no? > 2. query_matching_vma() is missing unlock_vma() call when it does > "goto next_vma;" and re-issues query_vma_find_by_addr(). The previous > vma is left locked; > > [1] https://syzkaller.appspot.com/x/repro.c?x=101edf70580000 > [2] https://elixir.bootlin.com/linux/v6.15.5/source/include/linux/mm.h#L747 > > After these fixes it's much harder to fail but I still get one more > error copied below. I will continue the investigation and will hold > off reposting until this is fixed. That will be next week since I'll > be out of town the rest of this week. > > Andrew, could you please remove this patchset from mm-unstable for now > until I fix the issue and re-post the new version? Andrew can you do that please? We keep getting new syzbot reports. > The error I got after these fixes is: I suspect the root cause is the ioctls are not serialized against each other (probably not even against read()) and yet we treat m->private as safe to work on. Now we have various fields that are dangerous to race on - for example locked_vma and iter races would explain a lot of this. I suspect as long as we used purely seq_file workflow, it did the right thing for us wrt serialization, but the ioctl addition violates that. We should rather recheck even the code before this series, if dangerous ioctl vs read() races are possible. And the ioctl implementation should be refactored to use an own per-ioctl-call private context, not the seq_file's per-file-open context.
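
To make the race concrete, a simplified rendering of the ioctl entry point (close to, but not exactly, the code in fs/proc/task_mmu.c): the read() side is serialized because seq_read_iter() runs ->start/->next/->stop under the seq_file's own mutex, while the path below reaches do_procmap_query() with no comparable lock, so concurrent PROCMAP_QUERY calls, or an ioctl racing a read() on the same fd, can clobber the priv->iter and priv->locked_vma state that patch 8 adds.

static long procfs_procmap_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	switch (cmd) {
	case PROCMAP_QUERY:
		/* Nothing takes seq->lock here, unlike the seq_read_iter() path. */
		return do_procmap_query(priv, (void __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}
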
On Tue, Jul 15, 2025 at 1:16 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 7/10/25 19:02, Suren Baghdasaryan wrote: > > On Thu, Jul 10, 2025 at 12:03 AM Suren Baghdasaryan <surenb@google.com> wrote: > >> > >> On Wed, Jul 9, 2025 at 10:47 AM Suren Baghdasaryan <surenb@google.com> wrote: > >> > > >> > On Wed, Jul 9, 2025 at 4:12 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > >> > > > >> > > * Suren Baghdasaryan <surenb@google.com> [250709 11:06]: > >> > > > On Wed, Jul 9, 2025 at 3:03 PM Vlastimil Babka <vbabka@suse.cz> wrote: > >> > > > > > >> > > > > On 7/9/25 16:43, Suren Baghdasaryan wrote: > >> > > > > > On Wed, Jul 9, 2025 at 1:57 AM Vlastimil Babka <vbabka@suse.cz> wrote: > >> > > > > >> > >> > > > > >> On 7/8/25 01:10, Suren Baghdasaryan wrote: > >> > > > > >> >>> + rcu_read_unlock(); > >> > > > > >> >>> + vma = lock_vma_under_mmap_lock(mm, iter, address); > >> > > > > >> >>> + rcu_read_lock(); > >> > > > > >> >> OK I guess we hold the RCU lock the whole time as we traverse except when > >> > > > > >> >> we lock under mmap lock. > >> > > > > >> > Correct. > >> > > > > >> > >> > > > > >> I wonder if it's really necessary? Can't it be done just inside > >> > > > > >> lock_next_vma()? It would also avoid the unlock/lock dance quoted above. > >> > > > > >> > >> > > > > >> Even if we later manage to extend this approach to smaps and employ rcu > >> > > > > >> locking to traverse the page tables, I'd think it's best to separate and > >> > > > > >> fine-grain the rcu lock usage for vma iterator and page tables, if only to > >> > > > > >> avoid too long time under the lock. > >> > > > > > > >> > > > > > I thought we would need to be in the same rcu read section while > >> > > > > > traversing the maple tree using vma_next() but now looking at it, > >> > > > > > maybe we can indeed enter only while finding and locking the next > >> > > > > > vma... > >> > > > > > Liam, would that work? I see struct ma_state containing a node field. > >> > > > > > Can it be freed from under us if we find a vma, exit rcu read section > >> > > > > > then re-enter rcu and use the same iterator to find the next vma? > >> > > > > > >> > > > > If the rcu protection needs to be contigous, and patch 8 avoids the issue by > >> > > > > always doing vma_iter_init() after rcu_read_lock() (but does it really avoid > >> > > > > the issue or is it why we see the syzbot reports?) then I guess in the code > >> > > > > quoted above we also need a vma_iter_init() after the rcu_read_lock(), > >> > > > > because although the iterator was used briefly under mmap_lock protection, > >> > > > > that was then unlocked and there can be a race before the rcu_read_lock(). > >> > > > > >> > > > Quite true. So, let's wait for Liam's confirmation and based on his > >> > > > answer I'll change the patch by either reducing the rcu read section > >> > > > or adding the missing vma_iter_init() after we switch to mmap_lock. > >> > > > >> > > You need to either be under rcu or mmap lock to ensure the node in the > >> > > maple state hasn't been freed (and potentially, reallocated). > >> > > > >> > > So in this case, in the higher level, we can hold the rcu read lock for > >> > > a series of walks and avoid re-walking the tree then the performance > >> > > would be better. > >> > > >> > Got it. Thanks for confirming! > >> > > >> > > > >> > > When we return to userspace, then we should drop the rcu read lock and > >> > > will need to vma_iter_set()/vma_iter_invalidate() on return. 
I thought > >> > > this was being done (through vma_iter_init()), but syzbot seems to > >> > > indicate a path that was missed? > >> > > >> > We do that in m_start()/m_stop() by calling > >> > lock_vma_range()/unlock_vma_range() but I think I have two problems > >> > here: > >> > 1. As Vlastimil mentioned I do not reset the iterator when falling > >> > back to mmap_lock and exiting and then re-entering rcu read section; > >> > 2. I do not reset the iterator after exiting rcu read section in > >> > m_stop() and re-entering it in m_start(), so the later call to > >> > lock_next_vma() might be using an iterator with a node that was freed > >> > (and possibly reallocated). > >> > > >> > > > >> > > This is the same thing that needed to be done previously with the mmap > >> > > lock, but now under the rcu lock. > >> > > > >> > > I'm not sure how to mitigate the issue with the page table, maybe we > >> > > guess on the number of vmas that we were doing for 4k blocks of output > >> > > and just drop/reacquire then. Probably a problem for another day > >> > > anyways. > >> > > > >> > > Also, I think you can also change the vma_iter_init() to vma_iter_set(), > >> > > which is slightly less code under the hood. Vlastimil asked about this > >> > > and it's probably a better choice. > >> > > >> > Ack. > >> > I'll update my series with these fixes and all comments I received so > >> > far, will run the reproducers to confirm no issues and repost them > >> > later today. > >> > >> I have the patchset ready but would like to test it some more. Will > >> post it tomorrow. > > > > Ok, I found a couple of issues using the syzbot reproducer [1] (which > > is awesome BTW!): > > 1. rwsem_acquire_read() inside vma_start_read() at [2] should be moved > > after the last check, otherwise the lock is considered taken on > > vma->vm_refcnt overflow; > > I think it's fine because if the last check fails there's a > vma_refcount_put() that includes rwsem_release(), no? Ah, yes, you are right. This is fine. Obviously trying to figure out the issue right before a flight is not a good idea :) > > > 2. query_matching_vma() is missing unlock_vma() call when it does > > "goto next_vma;" and re-issues query_vma_find_by_addr(). The previous > > vma is left locked; > > > > [1] https://syzkaller.appspot.com/x/repro.c?x=101edf70580000 > > [2] https://elixir.bootlin.com/linux/v6.15.5/source/include/linux/mm.h#L747 > > > > After these fixes it's much harder to fail but I still get one more > > error copied below. I will continue the investigation and will hold > > off reposting until this is fixed. That will be next week since I'll > > be out of town the rest of this week. > > > > Andrew, could you please remove this patchset from mm-unstable for now > > until I fix the issue and re-post the new version? > > Andrew can you do that please? We keep getting new syzbot reports. > > > The error I got after these fixes is: > > I suspect the root cause is the ioctls are not serialized against each other > (probably not even against read()) and yet we treat m->private as safe to > work on. Now we have various fields that are dangerous to race on - for > example locked_vma and iter races would explain a lot of this. > > I suspect as long as we used purely seq_file workflow, it did the right > thing for us wrt serialization, but the ioctl addition violates that. We > should rather recheck even the code before this series, if dangerous ioctl > vs read() races are possible. 
And the ioctl implementation should be > refactored to use an own per-ioctl-call private context, not the seq_file's > per-file-open context. Huh, I completely failed to consider this. In hindsight it is quite obvious... Thanks Vlastimil, I owe you a beer or two.
On Tue, Jul 15, 2025 at 01:13:36PM -0700, Suren Baghdasaryan wrote: > Huh, I completely failed to consider this. In hindsight it is quite > obvious... Thanks Vlastimil, I owe you a beer or two. FYI - Vlasta prefers the 'starobrno' brand of beer, he says he can't get enough :)
On 7/16/25 16:00, Lorenzo Stoakes wrote: > On Tue, Jul 15, 2025 at 01:13:36PM -0700, Suren Baghdasaryan wrote: >> Huh, I completely failed to consider this. In hindsight it is quite >> obvious... Thanks Vlastimil, I owe you a beer or two. > > FYI - Vlasta prefers the 'starobrno' brand of beer, he says he can't get > enough :) FYI - Lorenzo is a notorious liar :)
On Wed, Jul 16, 2025 at 7:07 AM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 7/16/25 16:00, Lorenzo Stoakes wrote: > > On Tue, Jul 15, 2025 at 01:13:36PM -0700, Suren Baghdasaryan wrote: > >> Huh, I completely failed to consider this. In hindsight it is quite > >> obvious... Thanks Vlastimil, I owe you a beer or two. > > > > FYI - Vlasta prefers the 'starobrno' brand of beer, he says he can't get > > enough :) > > FYI - Lorenzo is a notorious liar :) A search for starobrno in Tokyo provides a couple of suggestions: - Pilsen Alley in Ginza focuses on Czech Pilsner beer, and while they feature Pilsner Urquell, they might also carry other Czech brands. - Cerveza Japan ShibuyaAXSH, a casual restaurant specializing in Spanish paella, also offers an extensive selection of imported craft beers from various countries, including the Czech Republic.
On Tue, Jul 15, 2025 at 10:16:41AM +0200, Vlastimil Babka wrote: > > Andrew, could you please remove this patchset from mm-unstable for now > > until I fix the issue and re-post the new version? > > Andrew can you do that please? We keep getting new syzbot reports. I also pinged up top :P just to be extra specially clear... > > > The error I got after these fixes is: > > I suspect the root cause is the ioctls are not serialized against each other > (probably not even against read()) and yet we treat m->private as safe to > work on. Now we have various fields that are dangerous to race on - for > example locked_vma and iter races would explain a lot of this. > > I suspect as long as we used purely seq_file workflow, it did the right > thing for us wrt serialization, but the ioctl addition violates that. We > should rather recheck even the code before this series, if dangerous ioctl > vs read() races are possible. And the ioctl implementation should be > refactored to use an own per-ioctl-call private context, not the seq_file's > per-file-open context. Entirely agree with this analysis. I had a look at most recent report, see: https://lore.kernel.org/linux-mm/f13cda37-06a0-4281-87d1-042678a38a6b@lucifer.local/ AFAICT we either have to lock around the ioctl or find a new way of storing per-ioctl state. We'd probably need to separate out the procmap query stuff to do that though. Probably. I don't think a per-priv say lock is necessarily _that_ egregious since are people _really_ opening this one file and doing multiple parallel queries all that much? Anyway this latest report seems entirely about the procmap stuff.
On 15.07.25 11:40, Lorenzo Stoakes wrote: > On Tue, Jul 15, 2025 at 10:16:41AM +0200, Vlastimil Babka wrote: >>> Andrew, could you please remove this patchset from mm-unstable for now >>> until I fix the issue and re-post the new version? >> >> Andrew can you do that please? We keep getting new syzbot reports. > > I also pinged up top :P just to be extra specially clear... > >> >>> The error I got after these fixes is: >> >> I suspect the root cause is the ioctls are not serialized against each other >> (probably not even against read()) and yet we treat m->private as safe to >> work on. Now we have various fields that are dangerous to race on - for >> example locked_vma and iter races would explain a lot of this. >> >> I suspect as long as we used purely seq_file workflow, it did the right >> thing for us wrt serialization, but the ioctl addition violates that. We >> should rather recheck even the code before this series, if dangerous ioctl >> vs read() races are possible. And the ioctl implementation should be >> refactored to use an own per-ioctl-call private context, not the seq_file's >> per-file-open context. > > Entirely agree with this analysis. I had a look at most recent report, see: > > https://lore.kernel.org/linux-mm/f13cda37-06a0-4281-87d1-042678a38a6b@lucifer.local/ > > AFAICT we either have to lock around the ioctl or find a new way of storing > per-ioctl state. > > We'd probably need to separate out the procmap query stuff to do that > though. Probably. When I skimmed that series the first time, I was wondering "why are we even caring about PROCMAP_QUERY that in the context of this patch series". Maybe that helps :) -- Cheers, David / dhildenb
On 7/15/25 11:52, David Hildenbrand wrote: > On 15.07.25 11:40, Lorenzo Stoakes wrote: >> On Tue, Jul 15, 2025 at 10:16:41AM +0200, Vlastimil Babka wrote: >>>> Andrew, could you please remove this patchset from mm-unstable for now >>>> until I fix the issue and re-post the new version? >>> >>> Andrew can you do that please? We keep getting new syzbot reports. >> >> I also pinged up top :P just to be extra specially clear... >> >>> >>>> The error I got after these fixes is: >>> >>> I suspect the root cause is the ioctls are not serialized against each other >>> (probably not even against read()) and yet we treat m->private as safe to >>> work on. Now we have various fields that are dangerous to race on - for >>> example locked_vma and iter races would explain a lot of this. >>> >>> I suspect as long as we used purely seq_file workflow, it did the right >>> thing for us wrt serialization, but the ioctl addition violates that. We >>> should rather recheck even the code before this series, if dangerous ioctl >>> vs read() races are possible. And the ioctl implementation should be >>> refactored to use an own per-ioctl-call private context, not the seq_file's >>> per-file-open context. >> >> Entirely agree with this analysis. I had a look at most recent report, see: >> >> https://lore.kernel.org/linux-mm/f13cda37-06a0-4281-87d1-042678a38a6b@lucifer.local/ >> >> AFAICT we either have to lock around the ioctl or find a new way of storing >> per-ioctl state. >> >> We'd probably need to separate out the procmap query stuff to do that >> though. Probably. > > When I skimmed that series the first time, I was wondering "why are we > even caring about PROCMAP_QUERY that in the context of this patch series". > > Maybe that helps :) Yeah seems like before patch 8/8 the ioctl handling, specifically do_procmap_query() only looks at priv->mm and nothing else so it should be safe as that's a stable value. So it should be also enough to drop the last patch from mm for now, not whole series.
On Tue, Jul 15, 2025 at 12:23:31PM +0200, Vlastimil Babka wrote: > On 7/15/25 11:52, David Hildenbrand wrote: > > On 15.07.25 11:40, Lorenzo Stoakes wrote: > >> On Tue, Jul 15, 2025 at 10:16:41AM +0200, Vlastimil Babka wrote: > >>>> Andrew, could you please remove this patchset from mm-unstable for now > >>>> until I fix the issue and re-post the new version? > >>> > >>> Andrew can you do that please? We keep getting new syzbot reports. > >> > >> I also pinged up top :P just to be extra specially clear... > >> > >>> > >>>> The error I got after these fixes is: > >>> > >>> I suspect the root cause is the ioctls are not serialized against each other > >>> (probably not even against read()) and yet we treat m->private as safe to > >>> work on. Now we have various fields that are dangerous to race on - for > >>> example locked_vma and iter races would explain a lot of this. > >>> > >>> I suspect as long as we used purely seq_file workflow, it did the right > >>> thing for us wrt serialization, but the ioctl addition violates that. We > >>> should rather recheck even the code before this series, if dangerous ioctl > >>> vs read() races are possible. And the ioctl implementation should be > >>> refactored to use an own per-ioctl-call private context, not the seq_file's > >>> per-file-open context. > >> > >> Entirely agree with this analysis. I had a look at most recent report, see: > >> > >> https://lore.kernel.org/linux-mm/f13cda37-06a0-4281-87d1-042678a38a6b@lucifer.local/ > >> > >> AFAICT we either have to lock around the ioctl or find a new way of storing > >> per-ioctl state. > >> > >> We'd probably need to separate out the procmap query stuff to do that > >> though. Probably. > > > > When I skimmed that series the first time, I was wondering "why are we > > even caring about PROCMAP_QUERY that in the context of this patch series". > > > > Maybe that helps :) > > Yeah seems like before patch 8/8 the ioctl handling, specifically > do_procmap_query() only looks at priv->mm and nothing else so it should be > safe as that's a stable value. > > So it should be also enough to drop the last patch from mm for now, not > whole series. Yeah to save the mothership we can ditch the landing craft :P Maybe worth doing that, and figure out in a follow up how to fix this. Or we could just sling in a cheeky spinlock
On Tue, Jul 15, 2025 at 3:31 AM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > On Tue, Jul 15, 2025 at 12:23:31PM +0200, Vlastimil Babka wrote: > > On 7/15/25 11:52, David Hildenbrand wrote: > > > On 15.07.25 11:40, Lorenzo Stoakes wrote: > > >> On Tue, Jul 15, 2025 at 10:16:41AM +0200, Vlastimil Babka wrote: > > >>>> Andrew, could you please remove this patchset from mm-unstable for now > > >>>> until I fix the issue and re-post the new version? > > >>> > > >>> Andrew can you do that please? We keep getting new syzbot reports. > > >> > > >> I also pinged up top :P just to be extra specially clear... > > >> > > >>> > > >>>> The error I got after these fixes is: > > >>> > > >>> I suspect the root cause is the ioctls are not serialized against each other > > >>> (probably not even against read()) and yet we treat m->private as safe to > > >>> work on. Now we have various fields that are dangerous to race on - for > > >>> example locked_vma and iter races would explain a lot of this. > > >>> > > >>> I suspect as long as we used purely seq_file workflow, it did the right > > >>> thing for us wrt serialization, but the ioctl addition violates that. We > > >>> should rather recheck even the code before this series, if dangerous ioctl > > >>> vs read() races are possible. And the ioctl implementation should be > > >>> refactored to use an own per-ioctl-call private context, not the seq_file's > > >>> per-file-open context. > > >> > > >> Entirely agree with this analysis. I had a look at most recent report, see: > > >> > > >> https://lore.kernel.org/linux-mm/f13cda37-06a0-4281-87d1-042678a38a6b@lucifer.local/ > > >> > > >> AFAICT we either have to lock around the ioctl or find a new way of storing > > >> per-ioctl state. > > >> > > >> We'd probably need to separate out the procmap query stuff to do that > > >> though. Probably. > > > > > > When I skimmed that series the first time, I was wondering "why are we > > > even caring about PROCMAP_QUERY that in the context of this patch series". > > > > > > Maybe that helps :) > > > > Yeah seems like before patch 8/8 the ioctl handling, specifically > > do_procmap_query() only looks at priv->mm and nothing else so it should be > > safe as that's a stable value. > > > > So it should be also enough to drop the last patch from mm for now, not > > whole series. > > Yeah to save the mothership we can ditch the landing craft :P > > Maybe worth doing that, and figure out in a follow up how to fix this. For PROCMAP_QUERY, we need priv->mm, but the newly added mmap_locked and locked_vma don't need to be persisted between ioctl calls. So we can just add those two fields into a small struct, and for the seq_file case have it in priv, but for PROCMAP_QUERY just have it on the stack. The code can be written to accept this struct to maintain the state, which for the PROCMAP_QUERY ioctl will be a very short-lived one on the stack. Would that work? > > Or we could just sling in a cheeky spinlock
On Tue, Jul 15, 2025 at 10:05:49AM -0700, Andrii Nakryiko wrote: > On Tue, Jul 15, 2025 at 3:31 AM Lorenzo Stoakes > <lorenzo.stoakes@oracle.com> wrote: > > > > On Tue, Jul 15, 2025 at 12:23:31PM +0200, Vlastimil Babka wrote: > > > On 7/15/25 11:52, David Hildenbrand wrote: > > > > On 15.07.25 11:40, Lorenzo Stoakes wrote: > > > >> On Tue, Jul 15, 2025 at 10:16:41AM +0200, Vlastimil Babka wrote: > > > >>>> Andrew, could you please remove this patchset from mm-unstable for now > > > >>>> until I fix the issue and re-post the new version? > > > >>> > > > >>> Andrew can you do that please? We keep getting new syzbot reports. > > > >> > > > >> I also pinged up top :P just to be extra specially clear... > > > >> > > > >>> > > > >>>> The error I got after these fixes is: > > > >>> > > > >>> I suspect the root cause is the ioctls are not serialized against each other > > > >>> (probably not even against read()) and yet we treat m->private as safe to > > > >>> work on. Now we have various fields that are dangerous to race on - for > > > >>> example locked_vma and iter races would explain a lot of this. > > > >>> > > > >>> I suspect as long as we used purely seq_file workflow, it did the right > > > >>> thing for us wrt serialization, but the ioctl addition violates that. We > > > >>> should rather recheck even the code before this series, if dangerous ioctl > > > >>> vs read() races are possible. And the ioctl implementation should be > > > >>> refactored to use an own per-ioctl-call private context, not the seq_file's > > > >>> per-file-open context. > > > >> > > > >> Entirely agree with this analysis. I had a look at most recent report, see: > > > >> > > > >> https://lore.kernel.org/linux-mm/f13cda37-06a0-4281-87d1-042678a38a6b@lucifer.local/ > > > >> > > > >> AFAICT we either have to lock around the ioctl or find a new way of storing > > > >> per-ioctl state. > > > >> > > > >> We'd probably need to separate out the procmap query stuff to do that > > > >> though. Probably. > > > > > > > > When I skimmed that series the first time, I was wondering "why are we > > > > even caring about PROCMAP_QUERY that in the context of this patch series". > > > > > > > > Maybe that helps :) > > > > > > Yeah seems like before patch 8/8 the ioctl handling, specifically > > > do_procmap_query() only looks at priv->mm and nothing else so it should be > > > safe as that's a stable value. > > > > > > So it should be also enough to drop the last patch from mm for now, not > > > whole series. > > > > Yeah to save the mothership we can ditch the landing craft :P > > > > Maybe worth doing that, and figure out in a follow up how to fix this. > > For PROCMAP_QUERY, we need priv->mm, but the newly added locked_vma > and locked_vma don't need to be persisted between ioctl calls. So we > can just add those two fields into a small struct, and for seq_file > case have it in priv, but for PROCMAP_QUERY just have it on the stack. > The code can be written to accept this struct to maintain the state, > which for PROCMAP_QUERY ioctl will be very short-lived on the stack > one. > > Would that work? Yeah that's a great idea actually, the stack would obviously give us the per-query invocation thing. Nice! I am kicking myself because I jokingly suggested (off-list) that a helper struct would be the answer to everything (I do love them) and of course... here we are :P
On Tue, Jul 15, 2025 at 06:10:16PM +0100, Lorenzo Stoakes wrote: > > For PROCMAP_QUERY, we need priv->mm, but the newly added locked_vma > > and locked_vma don't need to be persisted between ioctl calls. So we > > can just add those two fields into a small struct, and for seq_file > > case have it in priv, but for PROCMAP_QUERY just have it on the stack. > > The code can be written to accept this struct to maintain the state, > > which for PROCMAP_QUERY ioctl will be very short-lived on the stack > > one. > > > > Would that work? > > Yeah that's a great idea actually, the stack would obviously give us the > per-query invocation thing. Nice! > > I am kicking myself because I jokingly suggested (off-list) that a helper > struct would be the answer to everything (I do love them) and of > course... here we are :P Hm but actually we'd have to invert things I think, what I mean is - since these fields can be updated at any time by racing threads, we can't have _anything_ in the priv struct that is mutable. So instead we should do something like: struct proc_maps_state { const struct proc_maps_private *priv; bool mmap_locked; struct vm_area_struct *locked_vma; struct vma_iterator iter; loff_t last_pos; }; static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; struct proc_maps_state state = { .priv = priv, }; switch (cmd) { case PROCMAP_QUERY: return do_procmap_query(state, (void __user *)arg); default: return -ENOIOCTLCMD; } } And then we have a stack-based thing with the bits that change and a read-only pointer to the bits that must remain static. And we can enforce that with const... We'd have to move the VMI and last_pos out too to make it const. Anyway the general idea should work!
On Tue, Jul 15, 2025 at 10:21 AM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > On Tue, Jul 15, 2025 at 06:10:16PM +0100, Lorenzo Stoakes wrote: > > > For PROCMAP_QUERY, we need priv->mm, but the newly added locked_vma > > > and locked_vma don't need to be persisted between ioctl calls. So we > > > can just add those two fields into a small struct, and for seq_file > > > case have it in priv, but for PROCMAP_QUERY just have it on the stack. > > > The code can be written to accept this struct to maintain the state, > > > which for PROCMAP_QUERY ioctl will be very short-lived on the stack > > > one. > > > > > > Would that work? > > > > Yeah that's a great idea actually, the stack would obviously give us the > > per-query invocation thing. Nice! > > > > I am kicking myself because I jokingly suggested (off-list) that a helper > > struct would be the answer to everything (I do love them) and of > > course... here we are :P > > Hm but actually we'd have to invert things I think, what I mean is - since > these fields can be updated at any time by racing threads, we can't have > _anything_ in the priv struct that is mutable. > Exactly, and I guess I was just being incomplete with just listing two of the fields that Suren make use of in PROCMAP_QUERY. See below. > So instead we should do something like: > > struct proc_maps_state { > const struct proc_maps_private *priv; > bool mmap_locked; > struct vm_area_struct *locked_vma; > struct vma_iterator iter; > loff_t last_pos; > }; > > static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) > { > struct seq_file *seq = file->private_data; > struct proc_maps_private *priv = seq->private; > struct proc_maps_state state = { > .priv = priv, > }; > > switch (cmd) { > case PROCMAP_QUERY: > return do_procmap_query(state, (void __user *)arg); I guess it's a matter of preference, but I'd actually just pass seq->priv->mm and arg into do_procmap_query(), which will make it super obvious that priv is not used or mutated, and all the new stuff that Suren needs for lockless VMA iteration, including iter (not sure PROCMAP_QUERY needs last_pos, tbh), I'd just put into this new struct, which do_procmap_query() can keep private to itself. Ultimately, I think we are on the same page, it's just a matter of structuring code and types. > default: > return -ENOIOCTLCMD; > } > } > > And then we have a stack-based thing with the bits that change and a > read-only pointer to the bits that must remain static. And we can enforce > that with const... > > We'd have to move the VMI and last_pos out too to make it const. > > Anyway the general idea should work!
On Tue, Jul 15, 2025 at 10:29 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Tue, Jul 15, 2025 at 10:21 AM Lorenzo Stoakes > <lorenzo.stoakes@oracle.com> wrote: > > > > On Tue, Jul 15, 2025 at 06:10:16PM +0100, Lorenzo Stoakes wrote: > > > > For PROCMAP_QUERY, we need priv->mm, but the newly added locked_vma > > > > and locked_vma don't need to be persisted between ioctl calls. So we > > > > can just add those two fields into a small struct, and for seq_file > > > > case have it in priv, but for PROCMAP_QUERY just have it on the stack. > > > > The code can be written to accept this struct to maintain the state, > > > > which for PROCMAP_QUERY ioctl will be very short-lived on the stack > > > > one. > > > > > > > > Would that work? > > > > > > Yeah that's a great idea actually, the stack would obviously give us the > > > per-query invocation thing. Nice! > > > > > > I am kicking myself because I jokingly suggested (off-list) that a helper > > > struct would be the answer to everything (I do love them) and of > > > course... here we are :P > > > > Hm but actually we'd have to invert things I think, what I mean is - since > > these fields can be updated at any time by racing threads, we can't have > > _anything_ in the priv struct that is mutable. > > > > Exactly, and I guess I was just being incomplete with just listing two > of the fields that Suren make use of in PROCMAP_QUERY. See below. > > > So instead we should do something like: > > > > struct proc_maps_state { > > const struct proc_maps_private *priv; > > bool mmap_locked; > > struct vm_area_struct *locked_vma; > > struct vma_iterator iter; > > loff_t last_pos; > > }; > > > > static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) > > { > > struct seq_file *seq = file->private_data; > > struct proc_maps_private *priv = seq->private; > > struct proc_maps_state state = { > > .priv = priv, > > }; > > > > switch (cmd) { > > case PROCMAP_QUERY: > > return do_procmap_query(state, (void __user *)arg); > > I guess it's a matter of preference, but I'd actually just pass > seq->priv->mm and arg into do_procmap_query(), which will make it > super obvious that priv is not used or mutated, and all the new stuff > that Suren needs for lockless VMA iteration, including iter (not sure > PROCMAP_QUERY needs last_pos, tbh), I'd just put into this new struct, > which do_procmap_query() can keep private to itself. > > Ultimately, I think we are on the same page, it's just a matter of > structuring code and types. That sounds cleaner to me too. I'll post a new version of my patchset today without the last patch to keep PROCMAP_QUERY changes separate, and then a follow-up patch that does this refactoring and changes PROCMAP_QUERY to use per-vma locks. Thanks folks! It's good to be back from vacation with the problem already figured out for you :) > > > default: > > return -ENOIOCTLCMD; > > } > > } > > > > And then we have a stack-based thing with the bits that change and a > > read-only pointer to the bits that must remain static. And we can enforce > > that with const... > > > > We'd have to move the VMI and last_pos out too to make it const. > > > > Anyway the general idea should work!
On Tue, Jul 15, 2025 at 1:18 PM Suren Baghdasaryan <surenb@google.com> wrote: > > On Tue, Jul 15, 2025 at 10:29 AM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Tue, Jul 15, 2025 at 10:21 AM Lorenzo Stoakes > > <lorenzo.stoakes@oracle.com> wrote: > > > > > > On Tue, Jul 15, 2025 at 06:10:16PM +0100, Lorenzo Stoakes wrote: > > > > > For PROCMAP_QUERY, we need priv->mm, but the newly added locked_vma > > > > > and locked_vma don't need to be persisted between ioctl calls. So we > > > > > can just add those two fields into a small struct, and for seq_file > > > > > case have it in priv, but for PROCMAP_QUERY just have it on the stack. > > > > > The code can be written to accept this struct to maintain the state, > > > > > which for PROCMAP_QUERY ioctl will be very short-lived on the stack > > > > > one. > > > > > > > > > > Would that work? > > > > > > > > Yeah that's a great idea actually, the stack would obviously give us the > > > > per-query invocation thing. Nice! > > > > > > > > I am kicking myself because I jokingly suggested (off-list) that a helper > > > > struct would be the answer to everything (I do love them) and of > > > > course... here we are :P > > > > > > Hm but actually we'd have to invert things I think, what I mean is - since > > > these fields can be updated at any time by racing threads, we can't have > > > _anything_ in the priv struct that is mutable. > > > > > > > Exactly, and I guess I was just being incomplete with just listing two > > of the fields that Suren make use of in PROCMAP_QUERY. See below. > > > > > So instead we should do something like: > > > > > > struct proc_maps_state { > > > const struct proc_maps_private *priv; > > > bool mmap_locked; > > > struct vm_area_struct *locked_vma; > > > struct vma_iterator iter; > > > loff_t last_pos; > > > }; > > > > > > static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) > > > { > > > struct seq_file *seq = file->private_data; > > > struct proc_maps_private *priv = seq->private; > > > struct proc_maps_state state = { > > > .priv = priv, > > > }; > > > > > > switch (cmd) { > > > case PROCMAP_QUERY: > > > return do_procmap_query(state, (void __user *)arg); > > > > I guess it's a matter of preference, but I'd actually just pass > > seq->priv->mm and arg into do_procmap_query(), which will make it > > super obvious that priv is not used or mutated, and all the new stuff > > that Suren needs for lockless VMA iteration, including iter (not sure > > PROCMAP_QUERY needs last_pos, tbh), I'd just put into this new struct, > > which do_procmap_query() can keep private to itself. > > > > Ultimately, I think we are on the same page, it's just a matter of > > structuring code and types. > > That sounds cleaner to me too. > I'll post a new version of my patchset today without the last patch to > keep PROCMAP_QUERY changes separate, and then a follow-up patch that > does this refactoring and changes PROCMAP_QUERY to use per-vma locks. > > Thanks folks! It's good to be back from vacation with the problem > already figured out for you :) Yes, Vlastimil was correct. Once I refactored the last patch so that do_procmap_query() does not use seq->private at all, the reproducer stopped failing. I'll post the patchset without the last patch shortly and once Andrew takes it into mm-unstable, will post the last patch as a follow-up. Thanks, Suren. 
> > > > > > default: > > > return -ENOIOCTLCMD; > > > } > > > } > > > > > > And then we have a stack-based thing with the bits that change and a > > > read-only pointer to the bits that must remain static. And we can enforce > > > that with const... > > > > > > We'd have to move the VMI and last_pos out too to make it const. > > > > > > Anyway the general idea should work!
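A minimal sketch of the direction agreed above (not the actual follow-up patch): do_procmap_query() receives only the stable mm pointer, and all mutable per-query state lives in a short-lived struct on the ioctl caller's stack, so racing ioctl and read() callers can no longer trample each other's state in seq_file's per-open private data. The struct name, field set, and simplified signatures below are illustrative assumptions.

        /*
         * Per-call query state: created on the stack for each PROCMAP_QUERY
         * invocation and gone when the ioctl returns, so nothing mutable is
         * shared through seq_file's private data.
         */
        struct procmap_query_state {
                struct mm_struct *mm;           /* stable for the lifetime of the fd */
                struct vma_iterator iter;       /* per-call, never shared */
                struct vm_area_struct *locked_vma;
                bool mmap_locked;
        };

        static long do_procmap_query(struct mm_struct *mm, void __user *uarg)
        {
                struct procmap_query_state state = {
                        .mm = mm,
                        .locked_vma = NULL,
                        .mmap_locked = false,
                };

                /* ... VMA lookup and per-vma locking operate only on &state ... */
                return 0;
        }

        static long procfs_procmap_ioctl(struct file *file, unsigned int cmd,
                                         unsigned long arg)
        {
                struct seq_file *seq = file->private_data;
                struct proc_maps_private *priv = seq->private;

                switch (cmd) {
                case PROCMAP_QUERY:
                        /* Only the stable mm is passed; priv is never mutated here. */
                        return do_procmap_query(priv->mm, (void __user *)arg);
                default:
                        return -ENOIOCTLCMD;
                }
        }

This mirrors Andrii's suggestion of passing the mm directly rather than a const pointer to priv, which makes it obvious at the call site that no per-open state is read or written by the query path.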
On Tue, Jul 15, 2025 at 11:31:23AM +0100, Lorenzo Stoakes wrote: > Or we could just sling in a cheeky spinlock As politely pointed out to me by Vlasta off-list: s/spinlock/mutex/, since we apparently allocate for anon_vma_name :'(.
On Tue, Jul 15, 2025 at 11:52:49AM +0200, David Hildenbrand wrote: > On 15.07.25 11:40, Lorenzo Stoakes wrote: > > On Tue, Jul 15, 2025 at 10:16:41AM +0200, Vlastimil Babka wrote: > > > > Andrew, could you please remove this patchset from mm-unstable for now > > > > until I fix the issue and re-post the new version? > > > > > > Andrew can you do that please? We keep getting new syzbot reports. > > > > I also pinged up top :P just to be extra specially clear... > > > > > > > > > The error I got after these fixes is: > > > > > > I suspect the root cause is the ioctls are not serialized against each other > > > (probably not even against read()) and yet we treat m->private as safe to > > > work on. Now we have various fields that are dangerous to race on - for > > > example locked_vma and iter races would explain a lot of this. > > > > > > I suspect as long as we used purely seq_file workflow, it did the right > > > thing for us wrt serialization, but the ioctl addition violates that. We > > > should rather recheck even the code before this series, if dangerous ioctl > > > vs read() races are possible. And the ioctl implementation should be > > > refactored to use an own per-ioctl-call private context, not the seq_file's > > > per-file-open context. > > > > Entirely agree with this analysis. I had a look at most recent report, see: > > > > https://lore.kernel.org/linux-mm/f13cda37-06a0-4281-87d1-042678a38a6b@lucifer.local/ > > > > AFAICT we either have to lock around the ioctl or find a new way of storing > > per-ioctl state. > > > > We'd probably need to separate out the procmap query stuff to do that > > though. Probably. > > When I skimmed that series the first time, I was wondering "why are we even > caring about PROCMAP_QUERY that in the context of this patch series". > > Maybe that helps :) Haha well I think it's _still useful_ for avoiding contention on the mmap lock. But we probably just need to bite the bullet and lock per-fd for this. > > -- > Cheers, > > David / dhildenb >
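For reference, the "lock per-fd" fallback floated above could look roughly like the sketch below; this was not the approach ultimately taken, and the query_lock field, its placement, and the simplified do_procmap_query() signature are assumptions. A mutex rather than a spinlock is needed because, as noted above, the query path may allocate (e.g. for anon_vma_name).

        /*
         * Rough sketch of per-fd serialization around PROCMAP_QUERY.
         * query_lock would be initialized with mutex_init() when the
         * file is opened.
         */
        struct proc_maps_private {
                /* ... existing fields ... */
                struct mutex query_lock;        /* serializes queries on this fd */
        };

        static long procfs_procmap_ioctl(struct file *file, unsigned int cmd,
                                         unsigned long arg)
        {
                struct seq_file *seq = file->private_data;
                struct proc_maps_private *priv = seq->private;
                long ret;

                switch (cmd) {
                case PROCMAP_QUERY:
                        /* May sleep and allocate, hence a mutex, not a spinlock. */
                        mutex_lock(&priv->query_lock);
                        ret = do_procmap_query(priv, (void __user *)arg);
                        mutex_unlock(&priv->query_lock);
                        return ret;
                default:
                        return -ENOIOCTLCMD;
                }
        }

The stack-based state shown earlier makes this lock unnecessary, which is why the refactoring was preferred.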
On 7/10/25 19:02, Suren Baghdasaryan wrote: > On Thu, Jul 10, 2025 at 12:03 AM Suren Baghdasaryan <surenb@google.com> wrote: >> >> >> I have the patchset ready but would like to test it some more. Will >> post it tomorrow. > > Ok, I found a couple of issues using the syzbot reproducer [1] (which > is awesome BTW!): > 1. rwsem_acquire_read() inside vma_start_read() at [2] should be moved > after the last check, otherwise the lock is considered taken on > vma->vm_refcnt overflow; > 2. query_matching_vma() is missing unlock_vma() call when it does > "goto next_vma;" and re-issues query_vma_find_by_addr(). The previous > vma is left locked; How does that happen? query_vma_find_by_addr() does get_next_vma() which does unlock_vma()?
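Point 1 above concerns where a lockdep annotation sits relative to the final validity check. A generic illustration (not the real vma_start_read(); struct obj, its fields, and object_was_invalidated() are made-up names) of why the annotation has to come after the last bail-out path:

        /*
         * If the read-acquire annotation were issued before the final check,
         * the early-return path would leave lockdep believing the read lock
         * is still held, e.g. on a refcount overflow.
         */
        struct obj {
                refcount_t refcnt;
                struct lockdep_map dep_map;
        };

        static bool obj_try_lock_read(struct obj *o)
        {
                if (!refcount_inc_not_zero(&o->refcnt))
                        return false;

                if (object_was_invalidated(o)) {        /* the "last check" */
                        refcount_dec(&o->refcnt);
                        return false;                   /* no annotation to unwind */
                }

                /* Annotate only once we know the read lock is really held. */
                rwsem_acquire_read(&o->dep_map, 0, 1, _RET_IP_);
                return true;
        }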
On Thu, 10 Jul 2025 19:42:16 +0200 Vlastimil Babka wrote: > On 7/10/25 19:02, Suren Baghdasaryan wrote: > > On Thu, Jul 10, 2025 at 12:03 AM Suren Baghdasaryan <surenb@google.com> wrote: > >> > >> > >> I have the patchset ready but would like to test it some more. Will > >> post it tomorrow. > > > > Ok, I found a couple of issues using the syzbot reproducer [1] (which > > is awesome BTW!): > > 1. rwsem_acquire_read() inside vma_start_read() at [2] should be moved > > after the last check, otherwise the lock is considered taken on > > vma->vm_refcnt overflow; > > 2. query_matching_vma() is missing unlock_vma() call when it does > > "goto next_vma;" and re-issues query_vma_find_by_addr(). The previous > > vma is left locked; > > How does that happen? query_vma_find_by_addr() does get_next_vma() which > does unlock_vma()? > Adding a mutex that protects do_procmap_query() survived the syzbot test [1,2]. That sounds like bad news, as locking the vma alone is not enough to query the map. [1] https://lore.kernel.org/lkml/687092d6.a00a0220.26a83e.0036.GAE@google.com/ [2] https://lore.kernel.org/lkml/6870a4a4.a00a0220.26a83e.003c.GAE@google.com/