From nobody Thu Apr 9 20:27:26 2026 Received: from casper.infradead.org (casper.infradead.org [90.155.50.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2D5E03E123D for ; Thu, 5 Mar 2026 19:55:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=90.155.50.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772740553; cv=none; b=kil26PFST7JjT5E9mw60SBItzbrJBsdBfe+MqumcQJCNmcth8UPD53u2j3Gnf7/YzvNcHi3LVCxjGEGcLQoHR71Ilp0yuMvsws0Aw4Yf+bZLjEEXlomPckxmTR/7+OAlfBTe21tUXSiDFaEw4ZZiGPbL/hF/bBnAhaZ1cHrvz70= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772740553; c=relaxed/simple; bh=aiXptpPWFyzmvgRmE/7zRmQS5SN9KFZthYgyLZv66As=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=CmObW1m0EHh1GR/qB9YpS8JR89bIK08XncC6quGyN525spuT/VkZir3RNvOiy4u11nJBmlO3T+rRXmSFFrdmHO1MPYD8o4VKmtfvOhwR9KeBoXriR5SN1EbQZ+uKIGsxwNyPhRVt/CanfteCbn7pvRx5d/EdQBCMfVc54PUlH5E= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=GMFAA5l+; arc=none smtp.client-ip=90.155.50.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="GMFAA5l+" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=casper.20170209; h=Content-Transfer-Encoding:MIME-Version: References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To: Content-Type:Content-ID:Content-Description; bh=777Q1eBBiI5jR1iz/ArGmsTPTNXMXX/GIX3RGqh5TdU=; b=GMFAA5l+39hYvl46GNyehW5uVI EpqSuo1B+nQVVeCue4PvCscT/iIEfgjzuKYkpJjl0iAAeL5gewOj5RAiaDdmMZw3jxCs/HIGNidiM vsGiCMW39k6WtmMG1DHS7CEcX7jMkwWybgX0dFb3FpXc1Y/NSWSLv1oBBBYk9DOdWPSgbFouejVZe meg4a8iABzvoB2jwdm4o0hfk2bSzNvXrd3/rVsPmUTdyjZgBZ7jmUDnh4+EZShwDXYCFrA++Krs4Z ONrfytUOlVXAO26gFPj052Lnxr9KkeuRh4ew2vkBgf7ef8Pg9KzYBWFc3YjW9skRoyO9PxnOgrwxR a8QK4Zsw==; Received: from willy by casper.infradead.org with local (Exim 4.98.2 #2 (Red Hat Linux)) id 1vyEnT-0000000FYWC-3HjM; Thu, 05 Mar 2026 19:55:47 +0000 From: "Matthew Wilcox (Oracle)" To: Peter Zijlstra Cc: "Matthew Wilcox (Oracle)" , Ingo Molnar , Will Deacon , Boqun Feng , Waiman Long , linux-kernel@vger.kernel.org Subject: [PATCH 1/3] rwsem: Remove the list_head from struct rw_semaphore Date: Thu, 5 Mar 2026 19:55:41 +0000 Message-ID: <20260305195545.3707590-2-willy@infradead.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260305195545.3707590-1-willy@infradead.org> References: <20260305195545.3707590-1-willy@infradead.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Instead of embedding a list_head in struct rw_semaphore, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list, remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink some core data structures like struct inode. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/rwsem.h | 8 ++-- kernel/locking/rwsem.c | 89 +++++++++++++++++++++++++++--------------- 2 files changed, 61 insertions(+), 36 deletions(-) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index f1aaf676a874..1771c96a01d2 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,7 +57,7 @@ struct rw_semaphore { struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; - struct list_head wait_list; + struct rwsem_waiter *first_waiter; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif @@ -104,7 +104,7 @@ static inline void rwsem_assert_held_write_nolockdep(co= nst struct rw_semaphore * .owner =3D ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock =3D __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ - .wait_list =3D LIST_HEAD_INIT((name).wait_list), \ + .first_waiter =3D NULL, \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } =20 @@ -127,9 +127,9 @@ do { \ * rwsem to see if somebody from an incompatible type is wanting access to= the * lock. */ -static inline int rwsem_is_contended(struct rw_semaphore *sem) +static inline bool rwsem_is_contended(struct rw_semaphore *sem) { - return !list_empty(&sem->wait_list); + return sem->first_waiter !=3D NULL; } =20 #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKE= R) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 24df4d98f7d2..6030d5d81ccc 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -72,7 +72,7 @@ #c, atomic_long_read(&(sem)->count), \ (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ - list_empty(&(sem)->wait_list) ? "" : "not ")) \ + (sem)->first_waiter ? "" : "not ")) \ debug_locks_off(); \ } while (0) #else @@ -321,7 +321,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char = *name, #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); + sem->first_waiter =3D NULL; atomic_long_set(&sem->owner, 0L); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); @@ -341,8 +341,6 @@ struct rwsem_waiter { unsigned long timeout; bool handoff_set; }; -#define rwsem_first_waiter(sem) \ - list_first_entry(&sem->wait_list, struct rwsem_waiter, list) =20 enum rwsem_wake_type { RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ @@ -365,12 +363,21 @@ enum rwsem_wake_type { */ #define MAX_READERS_WAKEUP 0x100 =20 -static inline void -rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +static inline +bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *wai= ter) { - lockdep_assert_held(&sem->wait_lock); - list_add_tail(&waiter->list, &sem->wait_list); - /* caller will set RWSEM_FLAG_WAITERS */ + if (list_empty(&waiter->list)) { + sem->first_waiter =3D NULL; + return true; + } + + if (sem->first_waiter =3D=3D waiter) { + sem->first_waiter =3D list_first_entry(&waiter->list, + struct rwsem_waiter, list); + } + list_del(&waiter->list); + + return false; } =20 /* @@ -385,14 +392,22 @@ static inline bool rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { lockdep_assert_held(&sem->wait_lock); - list_del(&waiter->list); - if (likely(!list_empty(&sem->wait_list))) + if (__rwsem_del_waiter(sem, waiter)) return true; - atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); return false; } =20 +static inline struct rwsem_waiter *next_waiter(const struct rw_semaphore *= sem, + const struct rwsem_waiter *waiter) +{ + struct rwsem_waiter *next =3D list_first_entry(&waiter->list, + struct rwsem_waiter, list); + if (next =3D=3D sem->first_waiter) + return NULL; + return next; +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -411,7 +426,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter, *tmp; + struct rwsem_waiter *waiter, *next; long oldcount, woken =3D 0, adjustment =3D 0; struct list_head wlist; =20 @@ -421,7 +436,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * Take a peek at the queue head waiter such that we can determine * the wakeup(s) to perform. */ - waiter =3D rwsem_first_waiter(sem); + waiter =3D sem->first_waiter; =20 if (waiter->type =3D=3D RWSEM_WAITING_FOR_WRITE) { if (wake_type =3D=3D RWSEM_WAKE_ANY) { @@ -506,25 +521,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * put them into wake_q to be woken up later. */ INIT_LIST_HEAD(&wlist); - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + do { + next =3D next_waiter(sem, waiter); if (waiter->type =3D=3D RWSEM_WAITING_FOR_WRITE) continue; =20 woken++; list_move_tail(&waiter->list, &wlist); + if (sem->first_waiter =3D=3D waiter) + sem->first_waiter =3D next; =20 /* * Limit # of readers that can be woken up per wakeup call. */ if (unlikely(woken >=3D MAX_READERS_WAKEUP)) break; - } + } while ((waiter =3D next) !=3D NULL); =20 adjustment =3D woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); =20 oldcount =3D atomic_long_read(&sem->count); - if (list_empty(&sem->wait_list)) { + if (!sem->first_waiter) { /* * Combined with list_move_tail() above, this implies * rwsem_del_waiter(). @@ -545,7 +563,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, atomic_long_add(adjustment, &sem->count); =20 /* 2nd pass */ - list_for_each_entry_safe(waiter, tmp, &wlist, list) { + list_for_each_entry_safe(waiter, next, &wlist, list) { struct task_struct *tsk; =20 tsk =3D waiter->task; @@ -577,7 +595,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct = rwsem_waiter *waiter, struct wake_q_head *wake_q) __releases(&sem->wait_lock) { - bool first =3D rwsem_first_waiter(sem) =3D=3D waiter; + bool first =3D sem->first_waiter =3D=3D waiter; =20 wake_q_init(wake_q); =20 @@ -603,7 +621,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct = rwsem_waiter *waiter, static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { - struct rwsem_waiter *first =3D rwsem_first_waiter(sem); + struct rwsem_waiter *first =3D sem->first_waiter; long count, new; =20 lockdep_assert_held(&sem->wait_lock); @@ -639,7 +657,7 @@ static inline bool rwsem_try_write_lock(struct rw_semap= hore *sem, new |=3D RWSEM_WRITER_LOCKED; new &=3D ~RWSEM_FLAG_HANDOFF; =20 - if (list_is_singular(&sem->wait_list)) + if (list_empty(&first->list)) new &=3D ~RWSEM_FLAG_WAITERS; } } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); @@ -659,7 +677,8 @@ static inline bool rwsem_try_write_lock(struct rw_semap= hore *sem, * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on * success. */ - list_del(&waiter->list); + __rwsem_del_waiter(sem, waiter); + rwsem_set_owner(sem); return true; } @@ -994,7 +1013,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, lon= g count, unsigned int stat { long adjustment =3D -RWSEM_READER_BIAS; long rcnt =3D (count >> RWSEM_READER_SHIFT); - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); =20 /* @@ -1019,7 +1038,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, lo= ng count, unsigned int stat */ if ((rcnt =3D=3D 1) && (count & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -1035,7 +1054,8 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, lo= ng count, unsigned int stat waiter.handoff_set =3D false; =20 raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { + first =3D sem->first_waiter; + if (!first) { /* * In case the wait queue is empty and the lock isn't owned * by a writer, this reader can exit the slowpath and return @@ -1051,8 +1071,11 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, l= ong count, unsigned int stat return sem; } adjustment +=3D RWSEM_FLAG_WAITERS; + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter =3D &waiter; + } else { + list_add_tail(&waiter.list, &first->list); } - rwsem_add_waiter(sem, &waiter); =20 /* we're now waiting on the lock, but no longer actively locking */ count =3D atomic_long_add_return(adjustment, &sem->count); @@ -1110,7 +1133,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, lo= ng count, unsigned int stat static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); =20 /* do optimistic spinning and steal lock if possible */ @@ -1129,10 +1152,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem,= int state) waiter.handoff_set =3D false; =20 raw_spin_lock_irq(&sem->wait_lock); - rwsem_add_waiter(sem, &waiter); =20 - /* we're now waiting on the lock */ - if (rwsem_first_waiter(sem) !=3D &waiter) { + first =3D sem->first_waiter; + if (first) { + list_add_tail(&waiter.list, &first->list); rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), &wake_q); if (!wake_q_empty(&wake_q)) { @@ -1145,6 +1168,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, i= nt state) raw_spin_lock_irq(&sem->wait_lock); } } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter =3D &waiter; atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } =20 @@ -1218,7 +1243,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_sema= phore *sem) =20 raw_spin_lock_irqsave(&sem->wait_lock, flags); =20 - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); =20 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -1239,7 +1264,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(stru= ct rw_semaphore *sem) =20 raw_spin_lock_irqsave(&sem->wait_lock, flags); =20 - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); =20 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); --=20 2.47.3 From nobody Thu Apr 9 20:27:26 2026 Received: from casper.infradead.org (casper.infradead.org [90.155.50.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 143BB3ECBC9 for ; Thu, 5 Mar 2026 19:55:55 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=90.155.50.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772740557; cv=none; b=V/U68Kg+tOyKP2Df29oazDakfS/BQBjEkAzjuWMn1Rnpv8q27lOSQURW+6PcWH47g4/q7HIdWQ5nOo/k8Y64HdYr2ZLHxLdWFlamPk0oFYk2GnepCXVEl/yGFn5hOp4o+c036of+ew4eqHgAXtRmGweJpPSyetMSojXfufsrp4k= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772740557; c=relaxed/simple; bh=/lZeqjqQZax3bUcO84V9r/my8kQ4QKbCXhEmGg8Cw4M=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=qNgx9CG1kKSH+LY55SOKTxxjj648+f6KkSoLt67a05sMGI8YkIcznQyN8E7PLKJyq4coHN9ZJUrkgEUM093zXzmg6iubOgkG6uEEd8g4ToqSuiFfBRr4cEAI+goxrUqZ3NRzZncQIhJPH8NVg4iN6dXKc4amt0niWASYr3pqIpc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=OxC9nrzk; arc=none smtp.client-ip=90.155.50.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="OxC9nrzk" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=casper.20170209; h=Content-Transfer-Encoding:MIME-Version: References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To: Content-Type:Content-ID:Content-Description; bh=jG2jyA9BuQlTAXqpUX1aWUfapy2uCq+TR/xn+DRwXC0=; b=OxC9nrzkXtBvr2Fh+d6PLSCxSg 5ORfgj76VdPhbt5EF2sQcIDGKN6duiIiBPgMzB1oINDmElWkojXvYlO52aj+Mtlm8XkzB3tWNvELV tr2TfR+tYkpHJ1Hb6v2gjOJHGlP9MQDSfGZAKayNh7NOrzidaklzNMHS54lHbTTAH+E5vgpvbT6lD +N+YX5o2TgB6WrUjnMtb1HLsVOLgH2yHAH6GbP3NCCFQXt/9Br6ralmsAjkhbWy7HFXRefal0quSP mWDqb41zcbrFicspEDN2YdCnmb7PkR5Mvayy7MApNIGtrbNga/8WpzDRmXFmaOQMIRGA9Vxyh/gLF /GD+cRGw==; Received: from willy by casper.infradead.org with local (Exim 4.98.2 #2 (Red Hat Linux)) id 1vyEnT-0000000FYWE-3xjd; Thu, 05 Mar 2026 19:55:47 +0000 From: "Matthew Wilcox (Oracle)" To: Peter Zijlstra Cc: "Matthew Wilcox (Oracle)" , Ingo Molnar , Will Deacon , Boqun Feng , Waiman Long , linux-kernel@vger.kernel.org Subject: [PATCH 2/3] semaphore: Remove the list_head from struct semaphore Date: Thu, 5 Mar 2026 19:55:42 +0000 Message-ID: <20260305195545.3707590-3-willy@infradead.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260305195545.3707590-1-willy@infradead.org> References: <20260305195545.3707590-1-willy@infradead.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Instead of embedding a list_head in struct semaphore, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list and remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink data structures which embed a semaphore. Signed-off-by: Matthew Wilcox (Oracle) --- drivers/acpi/osl.c | 2 +- include/linux/semaphore.h | 4 ++-- kernel/locking/semaphore.c | 41 ++++++++++++++++++++++++++++---------- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 05393a7315fe..782dd93b891e 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1257,7 +1257,7 @@ acpi_status acpi_os_delete_semaphore(acpi_handle hand= le) =20 ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, "Deleting semaphore[%p].\n", handle)); =20 - BUG_ON(!list_empty(&sem->wait_list)); + BUG_ON(sem->first_waiter); kfree(sem); sem =3D NULL; =20 diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 89706157e622..a4c8651ef021 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -15,7 +15,7 @@ struct semaphore { raw_spinlock_t lock; unsigned int count; - struct list_head wait_list; + struct semaphore_waiter *first_waiter; =20 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER unsigned long last_holder; @@ -33,7 +33,7 @@ struct semaphore { { \ .lock =3D __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count =3D n, \ - .wait_list =3D LIST_HEAD_INIT((name).wait_list) \ + .first_waiter =3D NULL \ __LAST_HOLDER_SEMAPHORE_INITIALIZER \ } =20 diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 3ef032e22f7e..cb9eae819e64 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -21,7 +21,7 @@ * too. * * The ->count variable represents how many more tasks can acquire this - * semaphore. If it's zero, there may be tasks waiting on the wait_list. + * semaphore. If it's zero, there may be waiters. */ =20 #include @@ -226,7 +226,7 @@ void __sched up(struct semaphore *sem) =20 hung_task_sem_clear_if_holder(sem); =20 - if (likely(list_empty(&sem->wait_list))) + if (likely(!sem->first_waiter)) sem->count++; else __up(sem, &wake_q); @@ -244,6 +244,21 @@ struct semaphore_waiter { bool up; }; =20 +static inline +void sem_del_waiter(struct semaphore *sem, struct semaphore_waiter *waiter) +{ + if (list_empty(&waiter->list)) { + sem->first_waiter =3D NULL; + return; + } + + if (sem->first_waiter =3D=3D waiter) { + sem->first_waiter =3D list_first_entry(&waiter->list, + struct semaphore_waiter, list); + } + list_del(&waiter->list); +} + /* * Because this function is inlined, the 'state' parameter will be * constant, and thus optimised away by the compiler. Likewise the @@ -252,9 +267,15 @@ struct semaphore_waiter { static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { - struct semaphore_waiter waiter; - - list_add_tail(&waiter.list, &sem->wait_list); + struct semaphore_waiter waiter, *first; + + first =3D sem->first_waiter; + if (first) { + list_add_tail(&waiter.list, &first->list); + } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter =3D &waiter; + } waiter.task =3D current; waiter.up =3D false; =20 @@ -274,11 +295,11 @@ static inline int __sched ___down_common(struct semap= hore *sem, long state, } =20 timed_out: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -ETIME; =20 interrupted: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -EINTR; } =20 @@ -321,9 +342,9 @@ static noinline int __sched __down_timeout(struct semap= hore *sem, long timeout) static noinline void __sched __up(struct semaphore *sem, struct wake_q_head *wake_q) { - struct semaphore_waiter *waiter =3D list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); + struct semaphore_waiter *waiter =3D sem->first_waiter; + + sem_del_waiter(sem, waiter); waiter->up =3D true; wake_q_add(wake_q, waiter->task); } --=20 2.47.3 From nobody Thu Apr 9 20:27:26 2026 Received: from casper.infradead.org (casper.infradead.org [90.155.50.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 688CB3E51FF for ; Thu, 5 Mar 2026 19:55:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=90.155.50.34 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772740553; cv=none; b=cgA0kXYfxVo1qJj6jRx0Tjt+wTA/m731RpwcBvqEHLEvWPC/a6ttXD4QT3LkVUlS+TmqWfNJz1xeVH95ShwO6zkzf52In9DGQ4nJEKCxDEdImzK0c6wlHNWeIGxRZRdo0yQn9j+x3iG102Y0Yyyzrj3ZYWB5BBzrv833PtZv9XY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1772740553; c=relaxed/simple; bh=wdERgwOaK5t4N0zprsjqzjFaxk/dMyWrJNLHxwfOnW0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=dR5KWQGiJOuMNZe1oZ8ez6K8j8qnTLaHOmlwuZYv+s2WZe/t8s3qDLLWB9eTiJOWzvBp2KUdzOZzgsGsdwFVWLF9RQ53cKzMJnMDMTnXdCTp9zSai73iqlHmeMRaq0/ki3/AraQ51ofuayhiz5SJ9YwlukhZQstzDBQ9OfDcqNE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=KapX4tEA; arc=none smtp.client-ip=90.155.50.34 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=infradead.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="KapX4tEA" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=casper.20170209; h=Content-Transfer-Encoding:MIME-Version: References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From:Sender:Reply-To: Content-Type:Content-ID:Content-Description; bh=ub0UoHObFTW59m/3JUajcwUEgWnu4gggr8oZhweYH7g=; b=KapX4tEAfkHPSRqQm72+QhqqZt /seU8gkvB4v1YPEVU3H4TkW39AGoTT2Qt+huepLK2qkGE0aRVqLEUJIMVSoRHpQmqWXL4EP8jXgxM SH4xY46RR2vjznnzGEheARGwIaX/nXqfTYJ7sK/ahvL/Y6hRdWa7wNNW37ip+2SplTVnR1b87UPRj rJV7JVgR4VzhQgu3DpDDHZ5NLLP5tiRwGqFfKr6iNT+Pb7sMQgQqjvWWGHc9wXyCfJCfxft7MvS7R whvsjV0/yHflRtlE2oJtyl0ym9uwlNG35/UosTex0sJuv29E9qfL/GaVNg3PdFVNhPcxpC742Scq8 nwfH5c/A==; Received: from willy by casper.infradead.org with local (Exim 4.98.2 #2 (Red Hat Linux)) id 1vyEnU-0000000FYWG-0EoO; Thu, 05 Mar 2026 19:55:48 +0000 From: "Matthew Wilcox (Oracle)" To: Peter Zijlstra Cc: "Matthew Wilcox (Oracle)" , Ingo Molnar , Will Deacon , Boqun Feng , Waiman Long , linux-kernel@vger.kernel.org Subject: [PATCH 3/3] mutex: Remove the list_head from struct mutex Date: Thu, 5 Mar 2026 19:55:43 +0000 Message-ID: <20260305195545.3707590-4-willy@infradead.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260305195545.3707590-1-willy@infradead.org> References: <20260305195545.3707590-1-willy@infradead.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Instead of embedding a list_head in struct mutex, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list, remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink data structures which embed a mutex like struct file. Some of the debug checks have to be deleted because there's no equivalent to checking them in the new scheme (eg an empty waiter->list now means that it is the only waiter, not that the waiter is no longer on the list). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mutex.h | 2 +- include/linux/mutex_types.h | 2 +- kernel/locking/mutex-debug.c | 5 +--- kernel/locking/mutex.c | 49 ++++++++++++++++++++---------------- kernel/locking/ww_mutex.h | 25 ++++++------------ 5 files changed, 37 insertions(+), 46 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index bf535f0118bb..86860beaa38c 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -79,7 +79,7 @@ do { \ #define __MUTEX_INITIALIZER(lockname) \ { .owner =3D ATOMIC_LONG_INIT(0) \ , .wait_lock =3D __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ - , .wait_list =3D LIST_HEAD_INIT(lockname.wait_list) \ + , .first_waiter =3D NULL \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } =20 diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h index fdf7f515fde8..6a4871879b41 100644 --- a/include/linux/mutex_types.h +++ b/include/linux/mutex_types.h @@ -44,7 +44,7 @@ struct mutex { #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif - struct list_head wait_list; + struct mutex_waiter *first_waiter; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 2c6b02d4699b..94930d506bcf 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -37,9 +37,8 @@ void debug_mutex_lock_common(struct mutex *lock, struct m= utex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *wait= er) { lockdep_assert_held(&lock->wait_lock); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_LOCKS_WARN_ON(!lock->first_waiter); DEBUG_LOCKS_WARN_ON(waiter->magic !=3D waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); } =20 void debug_mutex_free_waiter(struct mutex_waiter *waiter) @@ -62,7 +61,6 @@ void debug_mutex_remove_waiter(struct mutex *lock, struct= mutex_waiter *waiter, { struct mutex *blocked_on =3D __get_task_blocked_on(task); =20 - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task !=3D task); DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on !=3D lock); =20 @@ -74,7 +72,6 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic !=3D lock); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } } =20 diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2a1d165b3167..21c0818cbe4f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -47,7 +47,7 @@ static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); + lock->first_waiter =3D NULL; #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif @@ -194,33 +194,42 @@ static inline void __mutex_clear_flag(struct mutex *l= ock, unsigned long flag) atomic_long_andnot(flag, &lock->owner); } =20 -static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mute= x_waiter *waiter) -{ - return list_first_entry(&lock->wait_list, struct mutex_waiter, list) =3D= =3D waiter; -} - /* * Add @waiter to a given location in the lock wait_list and set the * FLAG_WAITERS flag if it's the first waiter. */ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct list_head *list) + struct mutex_waiter *first) { hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); =20 - list_add_tail(&waiter->list, list); - if (__mutex_waiter_is_first(lock, waiter)) + if (!first) + first =3D lock->first_waiter; + + if (first) { + list_add_tail(&waiter->list, &first->list); + } else { + INIT_LIST_HEAD(&waiter->list); + lock->first_waiter =3D waiter; __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + } } =20 static void __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) { - list_del(&waiter->list); - if (likely(list_empty(&lock->wait_list))) + if (list_empty(&waiter->list)) { __mutex_clear_flag(lock, MUTEX_FLAGS); + lock->first_waiter =3D NULL; + } else { + if (lock->first_waiter =3D=3D waiter) { + lock->first_waiter =3D list_first_entry(&waiter->list, + struct mutex_waiter, list); + } + list_del(&waiter->list); + } =20 debug_mutex_remove_waiter(lock, waiter, current); hung_task_clear_blocker(); @@ -340,7 +349,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct = ww_acquire_ctx *ww_ctx, * Similarly, stop spinning if we are no longer the * first waiter. */ - if (waiter && !__mutex_waiter_is_first(lock, waiter)) + if (waiter && lock->first_waiter !=3D waiter) return false; =20 return true; @@ -645,7 +654,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int st= ate, unsigned int subclas =20 if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - __mutex_add_waiter(lock, &waiter, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, NULL); } else { /* * Add in stamp order, waking up waiters that must kill @@ -691,7 +700,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int st= ate, unsigned int subclas =20 schedule_preempt_disabled(); =20 - first =3D __mutex_waiter_is_first(lock, &waiter); + first =3D lock->first_waiter =3D=3D &waiter; =20 /* * As we likely have been woken up by task @@ -734,8 +743,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int st= ate, unsigned int subclas * Wound-Wait; we stole the lock (!first_waiter), check the * waiters as anyone might want to wound us. */ - if (!ww_ctx->is_wait_die && - !__mutex_waiter_is_first(lock, &waiter)) + if (!ww_ctx->is_wait_die && lock->first_waiter !=3D &waiter) __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); } =20 @@ -931,6 +939,7 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, u= nsigned long ip) { struct task_struct *next =3D NULL; + struct mutex_waiter *waiter; DEFINE_WAKE_Q(wake_q); unsigned long owner; unsigned long flags; @@ -962,12 +971,8 @@ static noinline void __sched __mutex_unlock_slowpath(s= truct mutex *lock, unsigne =20 raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { - /* get the first entry from the wait-list: */ - struct mutex_waiter *waiter =3D - list_first_entry(&lock->wait_list, - struct mutex_waiter, list); - + waiter =3D lock->first_waiter; + if (waiter) { next =3D waiter->task; =20 debug_mutex_wake_waiter(lock, waiter); diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 31a785afee6c..a0847e91ae04 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -8,20 +8,14 @@ static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) { - struct mutex_waiter *w; - - w =3D list_first_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; - - return w; + return lock->first_waiter; } =20 static inline struct mutex_waiter * __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) { w =3D list_next_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + if (lock->first_waiter =3D=3D w) return NULL; =20 return w; @@ -31,7 +25,7 @@ static inline struct mutex_waiter * __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) { w =3D list_prev_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + if (lock->first_waiter =3D=3D w) return NULL; =20 return w; @@ -40,22 +34,17 @@ __ww_waiter_prev(struct mutex *lock, struct mutex_waite= r *w) static inline struct mutex_waiter * __ww_waiter_last(struct mutex *lock) { - struct mutex_waiter *w; - - w =3D list_last_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; + struct mutex_waiter *w =3D lock->first_waiter; =20 + if (w) + w =3D list_prev_entry(w, list); return w; } =20 static inline void __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mu= tex_waiter *pos) { - struct list_head *p =3D &lock->wait_list; - if (pos) - p =3D &pos->list; - __mutex_add_waiter(lock, waiter, p); + __mutex_add_waiter(lock, waiter, pos); } =20 static inline struct task_struct * --=20 2.47.3