kernel/locking/rwsem.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-)
Reader optimistic lock stealing lets a slowpath reader acquire the
lock without queueing when WRITER_LOCKED and HANDOFF are clear. It
works well when writers are rare, but it can starve writers when
readers hold the rwsem across slow operations.
A common case is Direct I/O on inode->i_rwsem: both an O_DIRECT
appending writer and an O_DIRECT tailing reader hold inode->i_rwsem
across iomap_dio_rw(). After the last reader releases the lock the
queued writer is woken, but a newly arriving reader can win the
atomic fast path before the writer is scheduled. The stolen read
lock is then held for another DIO, so the writer repeatedly pays for
a reader I/O until the handoff timeout fires.
The handoff timeout already moves the lock briefly to a writer/
no-steal state, but only for one writer; as soon as it completes,
reader stealing is permitted again and the next queued writer is
starved by the same race. Make the writer phase explicit so that it
persists across consecutive queued writers and ends only when a
reader is granted. Use a new bit RWSEM_FLAG_WRITER_PHASE in the
existing reserved range of rw_semaphore->count, so struct
rw_semaphore is unchanged.
RWSEM_FLAG_WRITER_PHASE is set when a writer becomes the head of the
wait queue (in rwsem_mark_wake() when waking a queued writer head,
or in rwsem_down_write_slowpath() when enqueueing onto an empty
list), and cleared when the queue drains or transitions to a reader
phase (in rwsem_mark_wake() when waking a reader or emptying the
list; in rwsem_try_write_lock() when the acquiring writer is the
only remaining waiter; in rwsem_del_waiter() when removing the last
waiter).
While the bit is set, the reader fast path falls through via
RWSEM_READ_FAILED_MASK, slowpath stealing is disabled, and multiple
consecutive queued writers stay protected. Reader stealing is only
blocked in the precise condition that produces starvation, not when
the queue head is itself a reader.
The HANDOFF bit and its post-CAS invariant are not modified.
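RWSEM_FLAG_WRITER_PHASE is only set or cleared under wait_lock, and
it is never set inside the rwsem_try_write_lock() compare-exchange
that handles HANDOFF; that loop only clears it, together with
WAITERS, when the acquiring writer is the last remaining waiter.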
Tested with one O_DIRECT appending writer and one O_DIRECT tailing
reader on ext4 over NVMe:
                          before patch          after patch
writer-only baseline:     ~390 MB/s, 20 us      ~390 MB/s, 20 us
mixed write throughput:   ~10 MB/s, ~970 us     ~190 MB/s, ~50 us
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Peng Wang <peng_wang@linux.alibaba.com>
---
kernel/locking/rwsem.c | 41 ++++++++++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 13 deletions(-)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index bf647097369c..f04112d2a336 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -85,7 +85,8 @@
* Bit 0 - writer locked bit
* Bit 1 - waiters present bit
* Bit 2 - lock handoff bit
- * Bits 3-7 - reserved
+ * Bit 3 - writer phase bit
+ * Bits 4-7 - reserved
* Bits 8-62 - 55-bit reader count
* Bit 63 - read fail bit
*
@@ -94,7 +95,8 @@
* Bit 0 - writer locked bit
* Bit 1 - waiters present bit
* Bit 2 - lock handoff bit
- * Bits 3-7 - reserved
+ * Bit 3 - writer phase bit
+ * Bits 4-7 - reserved
* Bits 8-30 - 23-bit reader count
* Bit 31 - read fail bit
*
@@ -106,10 +108,11 @@
* atomic_long_fetch_add() is used to obtain reader lock, whereas
* atomic_long_cmpxchg() will be used to obtain writer lock.
*
- * There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers -- set, clear
- * 2) rwsem_try_write_lock() for writers -- set, clear
- * 3) rwsem_del_waiter() -- clear
+ * The lock handoff bit (H) and writer phase bit (P) may be set or cleared
+ * in three places; P is also set in rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() -- H: set, clear; P: set, clear
+ * 2) rwsem_try_write_lock() -- H: set, clear; P: clear
+ * 3) rwsem_del_waiter() -- H: clear; P: clear
*
* For all the above cases, wait_lock will be held. A writer must also
* be the first one in the wait_list to be eligible for setting the handoff
@@ -118,6 +121,7 @@
#define RWSEM_WRITER_LOCKED (1UL << 0)
#define RWSEM_FLAG_WAITERS (1UL << 1)
#define RWSEM_FLAG_HANDOFF (1UL << 2)
+#define RWSEM_FLAG_WRITER_PHASE (1UL << 3)
#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
#define RWSEM_READER_SHIFT 8
@@ -126,7 +130,9 @@
#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
- RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
+ RWSEM_FLAG_HANDOFF |\
+ RWSEM_FLAG_WRITER_PHASE |\
+ RWSEM_FLAG_READFAIL)
/*
* All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -396,7 +402,8 @@ rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
lockdep_assert_held(&sem->wait_lock);
if (__rwsem_del_waiter(sem, waiter))
return true;
- atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+ atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS |
+ RWSEM_FLAG_WRITER_PHASE, &sem->count);
return false;
}
@@ -444,12 +451,13 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
if (wake_type == RWSEM_WAKE_ANY) {
+ atomic_long_or(RWSEM_FLAG_WRITER_PHASE, &sem->count);
/*
* Mark writer at the front of the queue for wakeup.
* Until the task is actually later awoken later by
* the caller, other writers are able to steal it.
* Readers, on the other hand, will block as they
- * will notice the queued writer.
+ * will notice the writer phase.
*/
wake_q_add(wake_q, waiter->task);
lockevent_inc(rwsem_wake_writer);
@@ -554,13 +562,17 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
adjustment -= RWSEM_FLAG_WAITERS;
if (oldcount & RWSEM_FLAG_HANDOFF)
adjustment -= RWSEM_FLAG_HANDOFF;
+ if (oldcount & RWSEM_FLAG_WRITER_PHASE)
+ adjustment -= RWSEM_FLAG_WRITER_PHASE;
} else if (woken) {
/*
* When we've woken a reader, we no longer need to force
- * writers to give up the lock and we can clear HANDOFF.
+ * writers to give up the lock: clear HANDOFF and writer phase.
*/
if (oldcount & RWSEM_FLAG_HANDOFF)
adjustment -= RWSEM_FLAG_HANDOFF;
+ if (oldcount & RWSEM_FLAG_WRITER_PHASE)
+ adjustment -= RWSEM_FLAG_WRITER_PHASE;
}
if (adjustment)
@@ -663,7 +675,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
new &= ~RWSEM_FLAG_HANDOFF;
if (list_empty(&first->list))
- new &= ~RWSEM_FLAG_WAITERS;
+ new &= ~(RWSEM_FLAG_WAITERS |
+ RWSEM_FLAG_WRITER_PHASE);
}
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
@@ -1033,7 +1046,8 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
/*
* Reader optimistic lock stealing.
*/
- if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
+ if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF |
+ RWSEM_FLAG_WRITER_PHASE))) {
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_steal);
@@ -1175,7 +1189,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
} else {
INIT_LIST_HEAD(&waiter.list);
sem->first_waiter = &waiter;
- atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
+ atomic_long_or(RWSEM_FLAG_WAITERS | RWSEM_FLAG_WRITER_PHASE,
+ &sem->count);
}
/* wait until we successfully acquire the lock */
--
2.47.3
© 2016 - 2026 Red Hat, Inc.