Instead of embedding a list_head in struct rw_semaphore, store a pointer
to the first waiter. The list of waiters remains a doubly linked list
so we can efficiently add to the tail of the list, remove from the front
(or middle) of the list.
Some of the list manipulation becomes more complicated, but it's a
reasonable tradeoff on the slow paths to shrink some core data structures
like struct inode.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
include/linux/rwsem.h | 8 ++--
kernel/locking/rwsem.c | 89 +++++++++++++++++++++++++++---------------
2 files changed, 61 insertions(+), 36 deletions(-)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index f1aaf676a874..1771c96a01d2 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -57,7 +57,7 @@ struct rw_semaphore {
struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
raw_spinlock_t wait_lock;
- struct list_head wait_list;
+ struct rwsem_waiter *first_waiter;
#ifdef CONFIG_DEBUG_RWSEMS
void *magic;
#endif
@@ -104,7 +104,7 @@ static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *
.owner = ATOMIC_LONG_INIT(0), \
__RWSEM_OPT_INIT(name) \
.wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
- .wait_list = LIST_HEAD_INIT((name).wait_list), \
+ .first_waiter = NULL, \
__RWSEM_DEBUG_INIT(name) \
__RWSEM_DEP_MAP_INIT(name) }
@@ -127,9 +127,9 @@ do { \
* rwsem to see if somebody from an incompatible type is wanting access to the
* lock.
*/
-static inline int rwsem_is_contended(struct rw_semaphore *sem)
+static inline bool rwsem_is_contended(struct rw_semaphore *sem)
{
- return !list_empty(&sem->wait_list);
+ return sem->first_waiter != NULL;
}
#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 24df4d98f7d2..6030d5d81ccc 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -72,7 +72,7 @@
#c, atomic_long_read(&(sem)->count), \
(unsigned long) sem->magic, \
atomic_long_read(&(sem)->owner), (long)current, \
- list_empty(&(sem)->wait_list) ? "" : "not ")) \
+ (sem)->first_waiter ? "" : "not ")) \
debug_locks_off(); \
} while (0)
#else
@@ -321,7 +321,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
#endif
atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
+ sem->first_waiter = NULL;
atomic_long_set(&sem->owner, 0L);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
osq_lock_init(&sem->osq);
@@ -341,8 +341,6 @@ struct rwsem_waiter {
unsigned long timeout;
bool handoff_set;
};
-#define rwsem_first_waiter(sem) \
- list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
enum rwsem_wake_type {
RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
@@ -365,12 +363,21 @@ enum rwsem_wake_type {
*/
#define MAX_READERS_WAKEUP 0x100
-static inline void
-rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+static inline
+bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
- lockdep_assert_held(&sem->wait_lock);
- list_add_tail(&waiter->list, &sem->wait_list);
- /* caller will set RWSEM_FLAG_WAITERS */
+ if (list_empty(&waiter->list)) {
+ sem->first_waiter = NULL;
+ return true;
+ }
+
+ if (sem->first_waiter == waiter) {
+ sem->first_waiter = list_first_entry(&waiter->list,
+ struct rwsem_waiter, list);
+ }
+ list_del(&waiter->list);
+
+ return false;
}
/*
@@ -385,14 +392,22 @@ static inline bool
rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
- list_del(&waiter->list);
- if (likely(!list_empty(&sem->wait_list)))
+ if (__rwsem_del_waiter(sem, waiter))
return true;
-
atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
return false;
}
+static inline struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem,
+ const struct rwsem_waiter *waiter)
+{
+ struct rwsem_waiter *next = list_first_entry(&waiter->list,
+ struct rwsem_waiter, list);
+ if (next == sem->first_waiter)
+ return NULL;
+ return next;
+}
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -411,7 +426,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
struct wake_q_head *wake_q)
{
- struct rwsem_waiter *waiter, *tmp;
+ struct rwsem_waiter *waiter, *next;
long oldcount, woken = 0, adjustment = 0;
struct list_head wlist;
@@ -421,7 +436,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* Take a peek at the queue head waiter such that we can determine
* the wakeup(s) to perform.
*/
- waiter = rwsem_first_waiter(sem);
+ waiter = sem->first_waiter;
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
if (wake_type == RWSEM_WAKE_ANY) {
@@ -506,25 +521,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* put them into wake_q to be woken up later.
*/
INIT_LIST_HEAD(&wlist);
- list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ do {
+ next = next_waiter(sem, waiter);
if (waiter->type == RWSEM_WAITING_FOR_WRITE)
continue;
woken++;
list_move_tail(&waiter->list, &wlist);
+ if (sem->first_waiter == waiter)
+ sem->first_waiter = next;
/*
* Limit # of readers that can be woken up per wakeup call.
*/
if (unlikely(woken >= MAX_READERS_WAKEUP))
break;
- }
+ } while ((waiter = next) != NULL);
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
oldcount = atomic_long_read(&sem->count);
- if (list_empty(&sem->wait_list)) {
+ if (!sem->first_waiter) {
/*
* Combined with list_move_tail() above, this implies
* rwsem_del_waiter().
@@ -545,7 +563,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
atomic_long_add(adjustment, &sem->count);
/* 2nd pass */
- list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+ list_for_each_entry_safe(waiter, next, &wlist, list) {
struct task_struct *tsk;
tsk = waiter->task;
@@ -577,7 +595,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
struct wake_q_head *wake_q)
__releases(&sem->wait_lock)
{
- bool first = rwsem_first_waiter(sem) == waiter;
+ bool first = sem->first_waiter == waiter;
wake_q_init(wake_q);
@@ -603,7 +621,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
struct rwsem_waiter *waiter)
{
- struct rwsem_waiter *first = rwsem_first_waiter(sem);
+ struct rwsem_waiter *first = sem->first_waiter;
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -639,7 +657,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
new |= RWSEM_WRITER_LOCKED;
new &= ~RWSEM_FLAG_HANDOFF;
- if (list_is_singular(&sem->wait_list))
+ if (list_empty(&first->list))
new &= ~RWSEM_FLAG_WAITERS;
}
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
@@ -659,7 +677,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
* success.
*/
- list_del(&waiter->list);
+ __rwsem_del_waiter(sem, waiter);
+
rwsem_set_owner(sem);
return true;
}
@@ -994,7 +1013,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
{
long adjustment = -RWSEM_READER_BIAS;
long rcnt = (count >> RWSEM_READER_SHIFT);
- struct rwsem_waiter waiter;
+ struct rwsem_waiter waiter, *first;
DEFINE_WAKE_Q(wake_q);
/*
@@ -1019,7 +1038,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
*/
if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
raw_spin_lock_irq(&sem->wait_lock);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
&wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
@@ -1035,7 +1054,8 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list)) {
+ first = sem->first_waiter;
+ if (!first) {
/*
* In case the wait queue is empty and the lock isn't owned
* by a writer, this reader can exit the slowpath and return
@@ -1051,8 +1071,11 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
return sem;
}
adjustment += RWSEM_FLAG_WAITERS;
+ INIT_LIST_HEAD(&waiter.list);
+ sem->first_waiter = &waiter;
+ } else {
+ list_add_tail(&waiter.list, &first->list);
}
- rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
@@ -1110,7 +1133,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
- struct rwsem_waiter waiter;
+ struct rwsem_waiter waiter, *first;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
@@ -1129,10 +1152,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
- rwsem_add_waiter(sem, &waiter);
- /* we're now waiting on the lock */
- if (rwsem_first_waiter(sem) != &waiter) {
+ first = sem->first_waiter;
+ if (first) {
+ list_add_tail(&waiter.list, &first->list);
rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
&wake_q);
if (!wake_q_empty(&wake_q)) {
@@ -1145,6 +1168,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
raw_spin_lock_irq(&sem->wait_lock);
}
} else {
+ INIT_LIST_HEAD(&waiter.list);
+ sem->first_waiter = &waiter;
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
@@ -1218,7 +1243,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -1239,7 +1264,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
--
2.47.3
On Thu, Mar 05, 2026 at 07:55:41PM +0000, Matthew Wilcox (Oracle) wrote: > Instead of embedding a list_head in struct rw_semaphore, store a pointer > to the first waiter. The list of waiters remains a doubly linked list > so we can efficiently add to the tail of the list, remove from the front > (or middle) of the list. > Some of the list manipulation becomes more complicated, but it's a > reasonable tradeoff on the slow paths to shrink some core data structures > like struct inode. In the past few days we've started seeing lockups when running LTP on -next on a range of arm64 platforms which bisect to this patch. It looks like corruption of some kind, the exact trigger varies but it's very predictable that something goes wrong and we get lots of rwsem related backtraces which do seem relevant to this commit. This one seems reasonably typical: <0>[ 79.522930] Internal error: Oops: 0000000096000004 [#2] SMP <6>[ 79.522932] note: cve-2017-17052[653] exited with preempt_count 2 ... <4>[ 79.839721] Call trace: <4>[ 79.842417] rwsem_mark_wake (kernel/locking/rwsem.c:442) (P) <4>[ 79.846854] rwsem_down_write_slowpath (kernel/locking/rwsem.c:609 kernel/locking/rwsem.c:1230) <4>[ 79.851896] down_write_killable (kernel/locking/rwsem.c:1343 (discriminator 2) kernel/locking/rwsem.c:1357 (discriminator 2) kernel/locking/rwsem.c:1629 (discriminator 2)) <4>[ 79.856242] vm_mmap_pgoff (include/linux/mmap_lock.h:555 mm/util.c:579) <4>[ 79.860158] ksys_mmap_pgoff (mm/mmap.c:605) <4>[ 79.864246] __arm64_sys_mmap (arch/arm64/kernel/sys.c:21) <4>[ 79.868333] invoke_syscall (arch/arm64/include/asm/current.h:19 arch/arm64/kernel/syscall.c:54) <4>[ 79.872332] el0_svc_common.constprop.0 (include/linux/thread_info.h:142 (discriminator 2) arch/arm64/kernel/syscall.c:140 (discriminator 2)) <4>[ 79.877285] do_el0_svc (arch/arm64/kernel/syscall.c:152) <4>[ 79.880850] el0_svc (arch/arm64/include/asm/irqflags.h:55 arch/arm64/include/asm/irqflags.h:76 arch/arm64/kernel/entry-common.c:80 
arch/arm64/kernel/entry-common.c:725) <4>[ 79.884242] el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:744) <4>[ 79.888676] el0t_64_sync (arch/arm64/kernel/entry.S:596) It's one particular subset of LTP tests that's being run when triggering the issue which makes me suspect that there's some preexisting bug that's being exposed, I've enclosed the full list below but it's generally relatively early that things go south. Bisect log, I confirmed that yesterday's -next also has the issue: git bisect start # status: waiting for both good and bad commits # bad: [95c541ddfb0815a0ea8477af778bb13bb075079a] Add linux-next specific files for 20260316 git bisect bad 95c541ddfb0815a0ea8477af778bb13bb075079a # status: waiting for good commit(s), bad commit known # good: [ead394bf2919868802fdf6da887f485866893b12] Merge branch 'tip/urgent' of https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git git bisect good ead394bf2919868802fdf6da887f485866893b12 # good: [cf610899a17faed2e78af3336854572033243dbd] Merge branch 'master' of https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git git bisect good cf610899a17faed2e78af3336854572033243dbd # good: [828588f80831be0e7d40c1602d17a71f1810474c] Merge branch 'next' of https://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git git bisect good 828588f80831be0e7d40c1602d17a71f1810474c # bad: [acf2f4ef88e001a943f651aa1095a2337550c9a9] Merge branch 'usb-next' of https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git git bisect bad acf2f4ef88e001a943f651aa1095a2337550c9a9 # bad: [bd35dc8f24e87f856e7d5462b0e70a08fcfd13fc] Merge branch 'master' of https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git git bisect bad bd35dc8f24e87f856e7d5462b0e70a08fcfd13fc # bad: [fb3ed409f83bac4cdc38e2f2a35ce059a36bc24f] Merge branch into tip/master: 'timers/vdso' git bisect bad fb3ed409f83bac4cdc38e2f2a35ce059a36bc24f # bad: [35a4a178818d30d3802253651ad4e430686dce13] Merge branch into tip/master: 'objtool/core' git 
bisect bad 35a4a178818d30d3802253651ad4e430686dce13 # good: [bcf081a44cb86c9f48479915fccffcd0ea8f6309] Merge branch into tip/master: 'irq/core' git bisect good bcf081a44cb86c9f48479915fccffcd0ea8f6309 # bad: [739690915ce1f017223ef4e6f3cc966ccfa3c861] locking/rwsem: Add context analysis git bisect bad 739690915ce1f017223ef4e6f3cc966ccfa3c861 # good: [553c02fb588d4310193eba80f75b43b20befd1d2] rust: sync: atomic: Clarify the need of CONFIG_ARCH_SUPPORTS_ATOMIC_RMW git bisect good 553c02fb588d4310193eba80f75b43b20befd1d2 # good: [b91d5d4bcf1266257a9e0199e1b4ad7fa8771baa] rust: atomic: Update a safety comment in impl of `fetch_add()` git bisect good b91d5d4bcf1266257a9e0199e1b4ad7fa8771baa # bad: [25500ba7e77ce9d3d9b5a1929d41a2ee2e23f6fe] locking/mutex: Remove the list_head from struct mutex git bisect bad 25500ba7e77ce9d3d9b5a1929d41a2ee2e23f6fe # bad: [b9bdd4b6840454ef87f61b6506c9635c57a81650] locking/semaphore: Remove the list_head from struct semaphore git bisect bad b9bdd4b6840454ef87f61b6506c9635c57a81650 # bad: [1ea4b473504b6dc6a0d21c298519aff2d52433c9] locking/rwsem: Remove the list_head from struct rw_semaphore git bisect bad 1ea4b473504b6dc6a0d21c298519aff2d52433c9 # first bad commit: [1ea4b473504b6dc6a0d21c298519aff2d52433c9] locking/rwsem: Remove the list_head from struct rw_semaphore The LTP test list: cve-2016-9604 keyctl08 cve-2016-9793 setsockopt04 cve-2017-1000111 setsockopt07 cve-2017-1000112 setsockopt05 cve-2017-1000364 stack_clash cve-2017-1000380 snd_timer01 cve-2017-1000405 thp04 cve-2017-10661 timerfd_settime02 cve-2017-12192 keyctl07 cve-2017-12193 add_key04 cve-2017-15274 add_key02 cve-2017-15299 request_key03 -b cve-2017-15299 cve-2017-15951 request_key03 -b cve-2017-15951 cve-2017-17052 cve-2017-17052 cve-2017-17712 sendmsg03 cve-2017-17807 request_key04 cve-2017-2618 cve-2017-2618 cve-2017-2671 cve-2017-2671 cve-2017-6951 request_key05 cve-2017-7308 setsockopt02 cve-2017-7472 keyctl04 cve-2018-1000001 realpath01 cve-2018-12896 
timer_settime03 cve-2018-9568 connect02 cve-2020-14386 sendto03 data_space data_space delete_module02 delete_module02 df01_sh df01.sh dirtyc0w dirtyc0w du01_sh du01.sh dup01 dup01 dup02 dup02 dup03 dup03 dup04 dup04 dup05 dup05 dup06 dup06 dup07 dup07 dup201 dup201 dup202 dup202 dup203 dup203 dup204 dup204 dup205 dup205 dup3_01 dup3_01 dup3_02 dup3_02 epoll01 epoll-ltp epoll_create1_01 epoll_create1_01 epoll_ctl01 epoll_ctl01 epoll_ctl02 epoll_ctl02 epoll_pwait01 epoll_pwait01 epoll_wait01 epoll_wait01 epoll_wait02 epoll_wait02 epoll_wait03 epoll_wait03 eventfd2_01 eventfd2_01 eventfd2_02 eventfd2_02 eventfd2_03 eventfd2_03 execl01 execl01 execle01 execle01 execlp01 execlp01 execv01 execv01 execve01 execve01 execve02 execve02 execve03 execve03 execve05 execve05 -i 5 -n 32 execveat01 execveat01 execveat02 execveat02 execveat03 execveat03 execvp01 execvp01 exit01 exit01 exit02 exit02 exit_group01 exit_group01 faccessat01 faccessat01 fallocate01 fallocate01 fallocate02 fallocate02 fallocate03 fallocate03 fallocate04 fallocate04 fallocate05 fallocate05 fallocate06 fallocate06 fanotify01 fanotify01 fanotify02 fanotify02 fanotify03 fanotify03 fanotify04 fanotify04 fanotify05 fanotify05 fanotify06 fanotify06 fanotify07 fanotify07 fanotify08 fanotify08 fanotify09 fanotify09 fanotify10 fanotify10 fanotify11 fanotify11 fanotify12 fanotify12 fanotify13 fanotify13 fanotify14 fanotify14 fanotify15 fanotify15 fanotify16 fanotify16 fchdir01 fchdir01 fchdir02 fchdir02 fchdir03 fchdir03 fchmod01 fchmod01 fchmod02 fchmod02 fchmod03 fchmod03 fchmod04 fchmod04 fchmod05 fchmod05 fchmod06 fchmod06 fchmodat01 fchmodat01 fchown01 fchown01 fchown02 fchown02 fchown03 fchown03 fchown04 fchown04 fchown05 fchown05 fchownat01 fchownat01 fchownat02 fchownat02 fcntl01 fcntl01 fcntl01_64 fcntl01_64 fcntl02 fcntl02 fcntl02_64 fcntl02_64 fcntl03 fcntl03 fcntl03_64 fcntl03_64 fcntl04 fcntl04 fcntl04_64 fcntl04_64 fcntl05 fcntl05 fcntl05_64 fcntl05_64 fcntl07 fcntl07 fcntl07_64 fcntl07_64 fcntl08 
fcntl08 fcntl08_64 fcntl08_64 fcntl09 fcntl09 fcntl09_64 fcntl09_64 fcntl10 fcntl10 fcntl10_64 fcntl10_64 fcntl11 fcntl11 fcntl11_64 fcntl11_64 fcntl12 fcntl12 fcntl12_64 fcntl12_64 fcntl13 fcntl13 fcntl13_64 fcntl13_64 fcntl14 fcntl14 fcntl14_64 fcntl14_64 fcntl15 fcntl15 fcntl15_64 fcntl15_64 fcntl16 fcntl16 fcntl16_64 fcntl16_64 fcntl17 fcntl17 fcntl17_64 fcntl17_64 fcntl18 fcntl18 fcntl18_64 fcntl18_64 fcntl19 fcntl19 fcntl19_64 fcntl19_64 fcntl20 fcntl20 fcntl20_64 fcntl20_64 fcntl21 fcntl21 fcntl21_64 fcntl21_64 fcntl22 fcntl22 fcntl22_64 fcntl22_64 fcntl23 fcntl23 fcntl23_64 fcntl23_64 fcntl27 fcntl27 fcntl27_64 fcntl27_64 fcntl29 fcntl29 fcntl29_64 fcntl29_64 fcntl30 fcntl30 fcntl30_64 fcntl30_64 fcntl31 fcntl31 fcntl31_64 fcntl31_64 fcntl34 fcntl34 fcntl34_64 fcntl34_64 fcntl35 fcntl35 fcntl35_64 fcntl35_64 fcntl36 fcntl36 fcntl36_64 fcntl36_64 fcntl37 fcntl37 fcntl37_64 fcntl37_64 fcntl38 fcntl38 fcntl38_64 fcntl38_64 FCNTL_LOCKTESTS locktests -n 100 -f /tmp/fcntl_locktest_testfile fdatasync01 fdatasync01 fdatasync02 fdatasync02 fdatasync03 fdatasync03 fgetxattr01 fgetxattr01 file01_sh file01.sh float_bessel cd $LTPROOT/testcases/bin; float_bessel -v float_exp_log cd $LTPROOT/testcases/bin; float_exp_log -v float_iperb cd $LTPROOT/testcases/bin; float_iperb -v float_power cd $LTPROOT/testcases/bin; float_power -v float_trigo cd $LTPROOT/testcases/bin; float_trigo -v flock01 flock01 flock02 flock02 flock03 flock03 flock04 flock04 flock06 flock06 fmtmsg01 fmtmsg01 fork01 fork01 fork03 fork03 fork04 fork04 fork05 fork05 fork07 fork07 fork08 fork08 fork09 fork09 fork10 fork10 fork14 fork14 fpathconf01 fpathconf01 fptest01 fptest01 fptest02 fptest02 fremovexattr01 fremovexattr01 fremovexattr02 fremovexattr02 fs_bind01_sh fs_bind01.sh fs_bind02_sh fs_bind02.sh fs_bind03_sh fs_bind03.sh fs_bind04_sh fs_bind04.sh fs_bind05_sh fs_bind05.sh fs_bind06_sh fs_bind06.sh fs_bind07_sh fs_bind07.sh fs_bind07-2_sh fs_bind07-2.sh fs_bind08_sh fs_bind08.sh fs_bind09_sh 
fs_bind09.sh fs_bind10_sh fs_bind10.sh fs_bind11_sh fs_bind11.sh fs_bind12_sh fs_bind12.sh fs_bind13_sh fs_bind13.sh fs_bind14_sh fs_bind14.sh fs_bind15_sh fs_bind15.sh fs_bind16_sh fs_bind16.sh fs_bind17_sh fs_bind17.sh fs_bind18_sh fs_bind18.sh fs_bind19_sh fs_bind19.sh fs_bind20_sh fs_bind20.sh fs_bind21_sh fs_bind21.sh fs_bind22_sh fs_bind22.sh fs_bind23_sh fs_bind23.sh fs_bind24_sh fs_bind24.sh fs_bind_move01_sh fs_bind_move01.sh fs_bind_move02_sh fs_bind_move02.sh fs_bind_move03_sh fs_bind_move03.sh fs_bind_move04_sh fs_bind_move04.sh fs_bind_move05_sh fs_bind_move05.sh fs_bind_move06_sh fs_bind_move06.sh fs_bind_move07_sh fs_bind_move07.sh fs_bind_move08_sh fs_bind_move08.sh fs_bind_move09_sh fs_bind_move09.sh fs_bind_move10_sh fs_bind_move10.sh fs_bind_move11_sh fs_bind_move11.sh fs_bind_move12_sh fs_bind_move12.sh fs_bind_move13_sh fs_bind_move13.sh fs_bind_move14_sh fs_bind_move14.sh fs_bind_move15_sh fs_bind_move15.sh fs_bind_move16_sh fs_bind_move16.sh fs_bind_move17_sh fs_bind_move17.sh fs_bind_move18_sh fs_bind_move18.sh fs_bind_move19_sh fs_bind_move19.sh fs_bind_move20_sh fs_bind_move20.sh fs_bind_move21_sh fs_bind_move21.sh fs_bind_move22_sh fs_bind_move22.sh fs_bind_rbind01_sh fs_bind_rbind01.sh fs_bind_rbind02_sh fs_bind_rbind02.sh fs_bind_rbind03_sh fs_bind_rbind03.sh fs_bind_rbind04_sh fs_bind_rbind04.sh fs_bind_rbind05_sh fs_bind_rbind05.sh fs_bind_rbind06_sh fs_bind_rbind06.sh fs_bind_rbind07-2_sh fs_bind_rbind07-2.sh fs_bind_rbind07_sh fs_bind_rbind07.sh fs_bind_rbind08_sh fs_bind_rbind08.sh fs_bind_rbind09_sh fs_bind_rbind09.sh fs_bind_rbind10_sh fs_bind_rbind10.sh fs_bind_rbind11_sh fs_bind_rbind11.sh fs_bind_rbind12_sh fs_bind_rbind12.sh fs_bind_rbind13_sh fs_bind_rbind13.sh fs_bind_rbind14_sh fs_bind_rbind14.sh fs_bind_rbind15_sh fs_bind_rbind15.sh fs_bind_rbind16_sh fs_bind_rbind16.sh fs_bind_rbind17_sh fs_bind_rbind17.sh fs_bind_rbind18_sh fs_bind_rbind18.sh fs_bind_rbind19_sh fs_bind_rbind19.sh fs_bind_rbind20_sh fs_bind_rbind20.sh 
fs_bind_rbind21_sh fs_bind_rbind21.sh fs_bind_rbind22_sh fs_bind_rbind22.sh fs_bind_rbind23_sh fs_bind_rbind23.sh fs_bind_rbind24_sh fs_bind_rbind24.sh fs_bind_rbind25_sh fs_bind_rbind25.sh fs_bind_rbind26_sh fs_bind_rbind26.sh fs_bind_rbind27_sh fs_bind_rbind27.sh fs_bind_rbind28_sh fs_bind_rbind28.sh fs_bind_rbind29_sh fs_bind_rbind29.sh fs_bind_rbind30_sh fs_bind_rbind30.sh fs_bind_rbind31_sh fs_bind_rbind31.sh fs_bind_rbind32_sh fs_bind_rbind32.sh fs_bind_rbind33_sh fs_bind_rbind33.sh fs_bind_rbind34_sh fs_bind_rbind34.sh fs_bind_rbind35_sh fs_bind_rbind35.sh fs_bind_rbind36_sh fs_bind_rbind36.sh fs_bind_rbind37_sh fs_bind_rbind37.sh fs_bind_rbind38_sh fs_bind_rbind38.sh fs_bind_rbind39_sh fs_bind_rbind39.sh fs_bind_regression_sh fs_bind_regression.sh fs_di fs_di -d $TMPDIR fs_fill fs_fill fs_inod01 fs_inod $TMPDIR 10 10 10 fs_perms01 fs_perms 005 99 99 12 100 x 0 fs_perms02 fs_perms 050 99 99 200 99 x 0 fs_perms03 fs_perms 500 99 99 99 500 x 0 fs_perms04 fs_perms 002 99 99 12 100 w 0 fs_perms05 fs_perms 020 99 99 200 99 w 0 fs_perms06 fs_perms 200 99 99 99 500 w 0 fs_perms07 fs_perms 004 99 99 12 100 r 0 fs_perms08 fs_perms 040 99 99 200 99 r 0 fs_perms09 fs_perms 400 99 99 99 500 r 0 fs_perms10 fs_perms 000 99 99 99 99 r 1 fs_perms11 fs_perms 000 99 99 99 99 w 1 fs_perms12 fs_perms 000 99 99 99 99 x 1 fs_perms13 fs_perms 010 99 99 99 500 x 1 fs_perms14 fs_perms 100 99 99 200 99 x 1 fs_perms15 fs_perms 020 99 99 99 500 w 1 fs_perms16 fs_perms 200 99 99 200 99 w 1 fs_perms17 fs_perms 040 99 99 99 500 r 1 fs_perms18 fs_perms 400 99 99 200 99 r 1 fs_racer fs_racer.sh -t 5 fsconfig01 fsconfig01 fsconfig02 fsconfig02 fsetxattr01 fsetxattr01 fsmount01 fsmount01 fsmount02 fsmount02 fsopen01 fsopen01 fsopen02 fsopen02 fspick01 fspick01 fspick02 fspick02 fstat02 fstat02 fstat02_64 fstat02_64 fstat03 fstat03 fstat03_64 fstat03_64 fstatat01 fstatat01 fstatfs01 fstatfs01 fstatfs01_64 fstatfs01_64 fstatfs02 fstatfs02 fstatfs02_64 fstatfs02_64 fsx02 fsx-linux -l 500000 -r 
4096 -t 2048 -w 2048 -N 10000 fsync01 fsync01 fsync02 fsync02 fsync03 fsync03 fsync04 fsync04 ftest01 ftest01 ftest02 ftest02 ftest03 ftest03 ftest04 ftest04 ftest05 ftest05 ftest06 ftest06 ftest07 ftest07 ftest08 ftest08 ftruncate01 ftruncate01 ftruncate01_64 ftruncate01_64 ftruncate03 ftruncate03 ftruncate03_64 ftruncate03_64 ftruncate04 ftruncate04 ftruncate04_64 ftruncate04_64 futex_cmp_requeue01 futex_cmp_requeue01 futex_cmp_requeue02 futex_cmp_requeue02 futex_wait01 futex_wait01 futex_wait02 futex_wait02 futex_wait03 futex_wait03 futex_wait04 futex_wait04 futex_wait05 futex_wait05 futex_wait_bitset01 futex_wait_bitset01 futex_wake01 futex_wake01 futex_wake02 futex_wake02 futex_wake03 futex_wake03 futex_wake04 futex_wake04 get_robust_list01 get_robust_list01 getaddrinfo_01 getaddrinfo_01 getcontext01 getcontext01 getcpu01 getcpu01 getcwd01 getcwd01 getcwd02 getcwd02 getcwd03 getcwd03 getcwd04 getcwd04 getdents01 getdents01 getdents02 getdents02 getdomainname01 getdomainname01 getegid01 getegid01 getegid02 getegid02 geteuid01 geteuid01 geteuid02 geteuid02 getgid01 getgid01 getgid03 getgid03 getgroups01 getgroups01 getgroups03 getgroups03 gethostbyname_r01 gethostbyname_r01 gethostid01 gethostid01 gethostname01 gethostname01 getitimer01 getitimer01 getitimer02 getitimer02 getpagesize01 getpagesize01 getpeername01 getpeername01 getpgid01 getpgid01 getpgid02 getpgid02 getpgrp01 getpgrp01 getpid01 getpid01 getpid02 getpid02 getppid01 getppid01 getppid02 getppid02 getpriority01 getpriority01 getpriority02 getpriority02 getrandom01 getrandom01 getrandom02 getrandom02 getrandom03 getrandom03 getrandom04 getrandom04 getresgid01 getresgid01 getresgid02 getresgid02 getresgid03 getresgid03 getresuid01 getresuid01 getresuid02 getresuid02 getresuid03 getresuid03 getrlimit01 getrlimit01 getrlimit02 getrlimit02 getrlimit03 getrlimit03 getrusage01 getrusage01 getrusage02 getrusage02 getrusage03 getrusage03 getrusage04 getrusage04 getsid01 getsid01 getsid02 getsid02 
getsockname01 getsockname01 getsockopt01 getsockopt01 getsockopt02 getsockopt02 gettid01 gettid01 gettimeofday01 gettimeofday01 gettimeofday02 gettimeofday02 getuid01 getuid01 getuid03 getuid03 gzip01_sh gzip_tests.sh hackbench01 hackbench 50 process 1000 hackbench02 hackbench 20 thread 1000 hangup01 hangup01 hugemmap01 hugemmap01 hugemmap02 hugemmap02 hugemmap04 hugemmap04 hugemmap05 hugemmap05 hugemmap05_1 hugemmap05 -m hugemmap05_2 hugemmap05 -s
On Wed, Mar 18, 2026 at 07:07:24PM +0000, Mark Brown wrote: > On Thu, Mar 05, 2026 at 07:55:41PM +0000, Matthew Wilcox (Oracle) wrote: > > Instead of embedding a list_head in struct rw_semaphore, store a pointer > > to the first waiter. The list of waiters remains a doubly linked list > > so we can efficiently add to the tail of the list, remove from the front > > (or middle) of the list. > > > Some of the list manipulation becomes more complicated, but it's a > > reasonable tradeoff on the slow paths to shrink some core data structures > > like struct inode. > > In the past few days we've started seeing lockups when running LTP on > -next on a range of arm64 platforms which bisect to this patch. It > looks like corruption of some kind, the exact trigger varies but it's > very predictable that something goes wrong and we get lots of rwsem > related backtraces which do seem relevant to this commmit. This one > seems reasonably typical: I merged the fix in todays branch: https://lkml.kernel.org/r/177382097549.1647592.8219974128268935080.tip-bot2@tip-bot2
On Wed, Mar 18, 2026 at 09:28:51PM +0100, Peter Zijlstra wrote: > On Wed, Mar 18, 2026 at 07:07:24PM +0000, Mark Brown wrote: > > On Thu, Mar 05, 2026 at 07:55:41PM +0000, Matthew Wilcox (Oracle) wrote: > > In the past few days we've started seeing lockups when running LTP on > > -next on a range of arm64 platforms which bisect to this patch. It > > looks like corruption of some kind, the exact trigger varies but it's > > very predictable that something goes wrong and we get lots of rwsem > > related backtraces which do seem relevant to this commmit. This one > > seems reasonably typical: > I merged the fix in todays branch: > https://lkml.kernel.org/r/177382097549.1647592.8219974128268935080.tip-bot2@tip-bot2 Ah, excellent timing :/ - I'll let you know if there are still issues going forwards.
The following commit has been merged into the locking/core branch of tip:
Commit-ID: 1ea4b473504b6dc6a0d21c298519aff2d52433c9
Gitweb: https://git.kernel.org/tip/1ea4b473504b6dc6a0d21c298519aff2d52433c9
Author: Matthew Wilcox (Oracle) <willy@infradead.org>
AuthorDate: Thu, 05 Mar 2026 19:55:41
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Sun, 08 Mar 2026 11:06:51 +01:00
locking/rwsem: Remove the list_head from struct rw_semaphore
Instead of embedding a list_head in struct rw_semaphore, store a pointer
to the first waiter. The list of waiters remains a doubly linked list
so we can efficiently add to the tail of the list, remove from the front
(or middle) of the list.
Some of the list manipulation becomes more complicated, but it's a
reasonable tradeoff on the slow paths to shrink some core data structures
like struct inode.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260305195545.3707590-2-willy@infradead.org
---
include/linux/rwsem.h | 8 ++--
kernel/locking/rwsem.c | 90 ++++++++++++++++++++++++++---------------
2 files changed, 62 insertions(+), 36 deletions(-)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 9bf1d93..e782953 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -57,7 +57,7 @@ context_lock_struct(rw_semaphore) {
struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
raw_spinlock_t wait_lock;
- struct list_head wait_list;
+ struct rwsem_waiter *first_waiter;
#ifdef CONFIG_DEBUG_RWSEMS
void *magic;
#endif
@@ -106,7 +106,7 @@ static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *
.owner = ATOMIC_LONG_INIT(0), \
__RWSEM_OPT_INIT(name) \
.wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
- .wait_list = LIST_HEAD_INIT((name).wait_list), \
+ .first_waiter = NULL, \
__RWSEM_DEBUG_INIT(name) \
__RWSEM_DEP_MAP_INIT(name) }
@@ -129,9 +129,9 @@ do { \
* rwsem to see if somebody from an incompatible type is wanting access to the
* lock.
*/
-static inline int rwsem_is_contended(struct rw_semaphore *sem)
+static inline bool rwsem_is_contended(struct rw_semaphore *sem)
{
- return !list_empty(&sem->wait_list);
+ return sem->first_waiter != NULL;
}
#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 24df4d9..e66f37e 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -72,7 +72,7 @@
#c, atomic_long_read(&(sem)->count), \
(unsigned long) sem->magic, \
atomic_long_read(&(sem)->owner), (long)current, \
- list_empty(&(sem)->wait_list) ? "" : "not ")) \
+ (sem)->first_waiter ? "" : "not ")) \
debug_locks_off(); \
} while (0)
#else
@@ -321,7 +321,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
#endif
atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
+ sem->first_waiter = NULL;
atomic_long_set(&sem->owner, 0L);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
osq_lock_init(&sem->osq);
@@ -341,8 +341,6 @@ struct rwsem_waiter {
unsigned long timeout;
bool handoff_set;
};
-#define rwsem_first_waiter(sem) \
- list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
enum rwsem_wake_type {
RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
@@ -365,12 +363,21 @@ enum rwsem_wake_type {
*/
#define MAX_READERS_WAKEUP 0x100
-static inline void
-rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+static inline
+bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
- lockdep_assert_held(&sem->wait_lock);
- list_add_tail(&waiter->list, &sem->wait_list);
- /* caller will set RWSEM_FLAG_WAITERS */
+ if (list_empty(&waiter->list)) {
+ sem->first_waiter = NULL;
+ return true;
+ }
+
+ if (sem->first_waiter == waiter) {
+ sem->first_waiter = list_first_entry(&waiter->list,
+ struct rwsem_waiter, list);
+ }
+ list_del(&waiter->list);
+
+ return false;
}
/*
@@ -385,14 +392,23 @@ static inline bool
rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
- list_del(&waiter->list);
- if (likely(!list_empty(&sem->wait_list)))
+ if (__rwsem_del_waiter(sem, waiter))
return true;
-
atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
return false;
}
+static inline
+struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem,
+ const struct rwsem_waiter *waiter)
+{
+ struct rwsem_waiter *next = list_first_entry(&waiter->list,
+ struct rwsem_waiter, list);
+ if (next == sem->first_waiter)
+ return NULL;
+ return next;
+}
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -411,7 +427,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
struct wake_q_head *wake_q)
{
- struct rwsem_waiter *waiter, *tmp;
+ struct rwsem_waiter *waiter, *next;
long oldcount, woken = 0, adjustment = 0;
struct list_head wlist;
@@ -421,7 +437,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* Take a peek at the queue head waiter such that we can determine
* the wakeup(s) to perform.
*/
- waiter = rwsem_first_waiter(sem);
+ waiter = sem->first_waiter;
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
if (wake_type == RWSEM_WAKE_ANY) {
@@ -506,25 +522,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* put them into wake_q to be woken up later.
*/
INIT_LIST_HEAD(&wlist);
- list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ do {
+ next = next_waiter(sem, waiter);
if (waiter->type == RWSEM_WAITING_FOR_WRITE)
continue;
woken++;
list_move_tail(&waiter->list, &wlist);
+ if (sem->first_waiter == waiter)
+ sem->first_waiter = next;
/*
* Limit # of readers that can be woken up per wakeup call.
*/
if (unlikely(woken >= MAX_READERS_WAKEUP))
break;
- }
+ } while ((waiter = next) != NULL);
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
oldcount = atomic_long_read(&sem->count);
- if (list_empty(&sem->wait_list)) {
+ if (!sem->first_waiter) {
/*
* Combined with list_move_tail() above, this implies
* rwsem_del_waiter().
@@ -545,7 +564,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
atomic_long_add(adjustment, &sem->count);
/* 2nd pass */
- list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+ list_for_each_entry_safe(waiter, next, &wlist, list) {
struct task_struct *tsk;
tsk = waiter->task;
@@ -577,7 +596,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
struct wake_q_head *wake_q)
__releases(&sem->wait_lock)
{
- bool first = rwsem_first_waiter(sem) == waiter;
+ bool first = sem->first_waiter == waiter;
wake_q_init(wake_q);
@@ -603,7 +622,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
struct rwsem_waiter *waiter)
{
- struct rwsem_waiter *first = rwsem_first_waiter(sem);
+ struct rwsem_waiter *first = sem->first_waiter;
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -639,7 +658,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
new |= RWSEM_WRITER_LOCKED;
new &= ~RWSEM_FLAG_HANDOFF;
- if (list_is_singular(&sem->wait_list))
+ if (list_empty(&first->list))
new &= ~RWSEM_FLAG_WAITERS;
}
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
@@ -659,7 +678,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
* success.
*/
- list_del(&waiter->list);
+ __rwsem_del_waiter(sem, waiter);
+
rwsem_set_owner(sem);
return true;
}
@@ -994,7 +1014,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
{
long adjustment = -RWSEM_READER_BIAS;
long rcnt = (count >> RWSEM_READER_SHIFT);
- struct rwsem_waiter waiter;
+ struct rwsem_waiter waiter, *first;
DEFINE_WAKE_Q(wake_q);
/*
@@ -1019,7 +1039,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
*/
if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
raw_spin_lock_irq(&sem->wait_lock);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
&wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
@@ -1035,7 +1055,8 @@ queue:
waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list)) {
+ first = sem->first_waiter;
+ if (!first) {
/*
* In case the wait queue is empty and the lock isn't owned
* by a writer, this reader can exit the slowpath and return
@@ -1051,8 +1072,11 @@ queue:
return sem;
}
adjustment += RWSEM_FLAG_WAITERS;
+ INIT_LIST_HEAD(&waiter.list);
+ sem->first_waiter = &waiter;
+ } else {
+ list_add_tail(&waiter.list, &first->list);
}
- rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
@@ -1110,7 +1134,7 @@ out_nolock:
static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
- struct rwsem_waiter waiter;
+ struct rwsem_waiter waiter, *first;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
@@ -1129,10 +1153,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
- rwsem_add_waiter(sem, &waiter);
- /* we're now waiting on the lock */
- if (rwsem_first_waiter(sem) != &waiter) {
+ first = sem->first_waiter;
+ if (first) {
+ list_add_tail(&waiter.list, &first->list);
rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
&wake_q);
if (!wake_q_empty(&wake_q)) {
@@ -1145,6 +1169,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
raw_spin_lock_irq(&sem->wait_lock);
}
} else {
+ INIT_LIST_HEAD(&waiter.list);
+ sem->first_waiter = &waiter;
atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
}
@@ -1218,7 +1244,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -1239,7 +1265,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (!list_empty(&sem->wait_list))
+ if (sem->first_waiter)
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
On Mon, Mar 9, 2026 at 12:48 PM tip-bot2 for Matthew Wilcox (Oracle)
<tip-bot2@linutronix.de> wrote:
>
> The following commit has been merged into the locking/core branch of tip:
>
> Commit-ID: 1ea4b473504b6dc6a0d21c298519aff2d52433c9
> Gitweb: https://git.kernel.org/tip/1ea4b473504b6dc6a0d21c298519aff2d52433c9
> Author: Matthew Wilcox (Oracle) <willy@infradead.org>
> AuthorDate: Thu, 05 Mar 2026 19:55:41
> Committer: Peter Zijlstra <peterz@infradead.org>
> CommitterDate: Sun, 08 Mar 2026 11:06:51 +01:00
>
> locking/rwsem: Remove the list_head from struct rw_semaphore
>
> Instead of embedding a list_head in struct rw_semaphore, store a pointer
> to the first waiter. The list of waiters remains a doubly linked list
> so we can efficiently add to the tail of the list, remove from the front
> (or middle) of the list.
>
> Some of the list manipulation becomes more complicated, but it's a
> reasonable tradeoff on the slow paths to shrink some core data structures
> like struct inode.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: https://patch.msgid.link/20260305195545.3707590-2-willy@infradead.org
> ---
> include/linux/rwsem.h | 8 ++--
> kernel/locking/rwsem.c | 90 ++++++++++++++++++++++++++---------------
> 2 files changed, 62 insertions(+), 36 deletions(-)
>
...
> -static inline void
> -rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
> +static inline
> +bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
> {
> - lockdep_assert_held(&sem->wait_lock);
> - list_add_tail(&waiter->list, &sem->wait_list);
> - /* caller will set RWSEM_FLAG_WAITERS */
> + if (list_empty(&waiter->list)) {
> + sem->first_waiter = NULL;
> + return true;
> + }
> +
> + if (sem->first_waiter == waiter) {
> + sem->first_waiter = list_first_entry(&waiter->list,
> + struct rwsem_waiter, list);
> + }
> + list_del(&waiter->list);
> +
> + return false;
> }
>
> /*
> @@ -385,14 +392,23 @@ static inline bool
> rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
> {
> lockdep_assert_held(&sem->wait_lock);
> - list_del(&waiter->list);
> - if (likely(!list_empty(&sem->wait_list)))
> + if (__rwsem_del_waiter(sem, waiter))
__rwsem_del_waiter() returns true when the wait list becomes empty, but
rwsem_del_waiter() is supposed to return true when the wait list is still
*not* empty (the original code was "if (likely(!list_empty(&sem->wait_list)))
return true;"). As written, the check is inverted: RWSEM_FLAG_HANDOFF and
RWSEM_FLAG_WAITERS get cleared exactly when waiters remain, and are left
set when the list is empty. The condition needs to be negated:
if (!__rwsem_del_waiter(sem, waiter))
> return true;
> -
> atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
> return false;
> }
Thanks,
Andrei
© 2016 - 2026 Red Hat, Inc.