[RFC PATCH 1/2] locking: add mutex_lock_nospin()

Yafang Shao posted 2 patches 1 month, 1 week ago
There is a newer version of this series
[RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
on the owner for specific heavy locks. This prevents long spinning times
that can lead to latency spikes for other tasks on the same runqueue.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/linux/mutex.h  |  3 +++
 kernel/locking/mutex.c | 39 ++++++++++++++++++++++++++++++++-------
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index ecaa0440f6ec..1e488bb24b57 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -189,11 +189,13 @@ extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
 extern int __must_check _mutex_lock_killable(struct mutex *lock,
 		unsigned int subclass, struct lockdep_map *nest_lock) __cond_acquires(0, lock);
 extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) __acquires(lock);
+extern void mutex_lock_nospin_nested(struct mutex *lock, unsigned int subclass);
 
 #define mutex_lock(lock) mutex_lock_nested(lock, 0)
 #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0)
 #define mutex_lock_killable(lock) _mutex_lock_killable(lock, 0, NULL)
 #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0)
+#define mutex_lock_nospin(lock) mutex_lock_nospin_nested(lock, 0)
 
 #define mutex_lock_nest_lock(lock, nest_lock)				\
 do {									\
@@ -215,6 +217,7 @@ extern void mutex_lock(struct mutex *lock) __acquires(lock);
 extern int __must_check mutex_lock_interruptible(struct mutex *lock) __cond_acquires(0, lock);
 extern int __must_check mutex_lock_killable(struct mutex *lock) __cond_acquires(0, lock);
 extern void mutex_lock_io(struct mutex *lock) __acquires(lock);
+extern void mutex_lock_nospin(struct mutex *lock) __acquires(lock);
 
 # define mutex_lock_nested(lock, subclass) mutex_lock(lock)
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2a1d165b3167..03d3b0749882 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -290,6 +290,14 @@ void __sched mutex_lock(struct mutex *lock)
 		__mutex_lock_slowpath(lock);
 }
 EXPORT_SYMBOL(mutex_lock);
+
+void __sched mutex_lock_nospin(struct mutex *lock)
+{
+	might_sleep();
+
+	if (!__mutex_trylock_fast(lock))
+		__mutex_lock_nospin(lock);
+}
 #endif
 
 #include "ww_mutex.h"
@@ -443,8 +451,11 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
  */
 static __always_inline bool
 mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
-		      struct mutex_waiter *waiter)
+		      struct mutex_waiter *waiter, const bool nospin)
 {
+	if (nospin)
+		return false;
+
 	if (!waiter) {
 		/*
 		 * The purpose of the mutex_can_spin_on_owner() function is
@@ -577,7 +588,8 @@ EXPORT_SYMBOL(ww_mutex_unlock);
 static __always_inline int __sched
 __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
 		    struct lockdep_map *nest_lock, unsigned long ip,
-		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx,
+		    const bool nospin)
 {
 	DEFINE_WAKE_Q(wake_q);
 	struct mutex_waiter waiter;
@@ -615,7 +627,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 
 	trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
 	if (__mutex_trylock(lock) ||
-	    mutex_optimistic_spin(lock, ww_ctx, NULL)) {
+	    mutex_optimistic_spin(lock, ww_ctx, NULL, nospin)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
 		if (ww_ctx)
@@ -716,7 +728,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 			 * to run.
 			 */
 			clear_task_blocked_on(current, lock);
-			if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+			if (mutex_optimistic_spin(lock, ww_ctx, &waiter, nospin))
 				break;
 			set_task_blocked_on(current, lock);
 			trace_contention_begin(lock, LCB_F_MUTEX);
@@ -773,14 +785,21 @@ static int __sched
 __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 	     struct lockdep_map *nest_lock, unsigned long ip)
 {
-	return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
+	return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false, false);
+}
+
+static int __sched
+__mutex_lock_nospin(struct mutex *lock, unsigned int state, unsigned int subclass,
+		    struct lockdep_map *nest_lock, unsigned long ip)
+{
+	return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false, true);
 }
 
 static int __sched
 __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 		unsigned long ip, struct ww_acquire_ctx *ww_ctx)
 {
-	return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
+	return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true, false);
 }
 
 /**
@@ -861,11 +880,17 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
 
 	token = io_schedule_prepare();
 	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
-			    subclass, NULL, _RET_IP_, NULL, 0);
+			    subclass, NULL, _RET_IP_, NULL, 0, false);
 	io_schedule_finish(token);
 }
 EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
 
+void __sched
+mutex_lock_nospin_nested(struct mutex *lock, unsigned int subclass)
+{
+	__mutex_lock_nospin(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+
 static inline int
 ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
-- 
2.47.3
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Peter Zijlstra 1 month, 1 week ago
On Wed, Mar 04, 2026 at 03:46:49PM +0800, Yafang Shao wrote:
> Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
> on the owner for specific heavy locks. This prevents long spinning times
> that can lead to latency spikes for other tasks on the same runqueue.

This makes no sense; spinning stops on need_resched().
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by David Laight 1 month, 1 week ago
On Wed, 4 Mar 2026 10:02:49 +0100
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Mar 04, 2026 at 03:46:49PM +0800, Yafang Shao wrote:
> > Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
> > on the owner for specific heavy locks. This prevents long spinning times
> > that can lead to latency spikes for other tasks on the same runqueue.  
> 
> This makes no sense; spinning stops on need_resched().
> 

That might still be an issue if a high priority process is spinning.
But a %sys spike doesn't imply a latency spike.

Is this using the osq_lock.c code?
That will have problems on overprovisioned VMs, it tries to find out
whether the hypervisor has switched out - but ISTR that is flawed.

In reality a spin lock shouldn't be held for long enough to cause
any kind of latency issue.
So something in the code that reads the list of filter functions
needs to be done differently so that the lock isn't held for as long.

	David
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Steven Rostedt 1 month, 1 week ago
On Wed, 4 Mar 2026 09:54:15 +0000
David Laight <david.laight.linux@gmail.com> wrote:

> That might still be an issue if a high priority process is spinning.
> But a %sys spike doesn't imply a latency spike.
> 
> Is this using the osq_lock.c code?
> That will have problems on overprovisioned VMs, it tries to find out
> whether the hypervisor has switched out - but ISTR that is flawed.
> 
> In reality a spin lock shouldn't be held for long enough to cause
> any kind latency issue.
> So something in the code that reads the list of filter functions
> needs to be done differently so that the lock isn't held for as long.

It's not a spinlock, it's an adaptive mutex which spins while the owner of
the mutex is also still running on the CPU. If the spinner CPU triggers a
NEED_RESCHED or the owner goes to sleep, the spinner stops spinning and
goes to sleep too.

Honestly, this still looks like a non-issue or a corner case that I don't
think requires these changes.

This looks like one of those "Patient: Doctor it hurts me when I do this.
Doctor: Then don't do that." cases.

Why is a production system having multiple users cat
avaliable_filter_functions to begin with?

-- Steve
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by David Laight 1 month, 1 week ago
On Wed, 4 Mar 2026 15:57:42 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Wed, 4 Mar 2026 09:54:15 +0000
> David Laight <david.laight.linux@gmail.com> wrote:
> 
> > That might still be an issue if a high priority process is spinning.
> > But a %sys spike doesn't imply a latency spike.
> > 
> > Is this using the osq_lock.c code?
> > That will have problems on overprovisioned VMs, it tries to find out
> > whether the hypervisor has switched out - but ISTR that is flawed.
> > 
> > In reality a spin lock shouldn't be held for long enough to cause
> > any kind latency issue.
> > So something in the code that reads the list of filter functions
> > needs to be done differently so that the lock isn't held for as long.  
> 
> It's not a spinlock, it's an adaptive mutex which spins while the owner of
> the mutex is also still running on the CPU. If the spinner CPU triggers a
> NEED_RESCHED or the owner goes to sleep, the spinner stops spinning and
> goes to sleep too.

I think half my brain knew that - otherwise I wouldn't have mentioned
the osq_lock.c code.
That all reminded me I've a patch that optimises that code a bit.
But I do remember thinking it ought to have an 'I've been spinning long
enough, time to sleep' path.

	David 

> 
> Honestly, this still looks like a non-issue or a corner case that I don't
> think requires these changes.
> 
> This looks like one of those "Patient: Doctor it hurts me when I do this.
> Doctor: Then don't do that." cases.
> 
> Why is a production system having multiple users cat
available_filter_functions to begin with?
> 
> -- Steve
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Thu, Mar 5, 2026 at 5:44 AM David Laight
<david.laight.linux@gmail.com> wrote:
>
> On Wed, 4 Mar 2026 15:57:42 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > On Wed, 4 Mar 2026 09:54:15 +0000
> > David Laight <david.laight.linux@gmail.com> wrote:
> >
> > > That might still be an issue if a high priority process is spinning.
> > > But a %sys spike doesn't imply a latency spike.
> > >
> > > Is this using the osq_lock.c code?
> > > That will have problems on overprovisioned VMs, it tries to find out
> > > whether the hypervisor has switched out - but ISTR that is flawed.
> > >
> > > In reality a spin lock shouldn't be held for long enough to cause
> > > any kind latency issue.
> > > So something in the code that reads the list of filter functions
> > > needs to be done differently so that the lock isn't held for as long.
> >
> > It's not a spinlock, it's an adaptive mutex which spins while the owner of
> > the mutex is also still running on the CPU. If the spinner CPU triggers a
> > NEED_RESCHED or the owner goes to sleep, the spinner stops spinning and
> > goes to sleep too.
>
> I think half my brain knew that - otherwise I wouldn't have mentioned
> the osq_lock.c code.
> That all reminded me I've a patch that optimises that code a bit.
> But I do remember thinking it ought to have a 'I been spinning long
> enough, time to sleep' path.
>
>         David
>
> >
> > Honestly, this still looks like a non-issue or a corner case that I don't
> > think requires these changes.
> >
> > This looks like one of those "Patient: Doctor it hurts me when I do this.
> > Doctor: Then don't do that." cases.
> >
> > Why is a production system having multiple users cat
> > available_filter_functions to begin with?

bpftrace is a widely used tool for online debugging and dynamic
tracing. However, sysadmins may unknowingly run multiple bpftrace
instances concurrently without realizing the potential impact on
system performance.

If this is your answer, I believe we should clearly document the
following warning:

  Warning: Do not read available_filter_functions concurrently, as
doing so can significantly degrade system performance and potentially
impact production workloads.


--
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Steven Rostedt 1 month, 1 week ago
On Thu, 5 Mar 2026 10:17:09 +0800
Yafang Shao <laoar.shao@gmail.com> wrote:

> bpftrace is a widely used tool for online debugging and dynamic
> tracing. However, sysadmins may unknowingly run multiple bpftrace
> instances concurrently without realizing the potential impact on
> system performance.
> 
> If this is your answer, I believe we should clearly document the
> following warning:
> 
>   Warning: Do not read available_filter_functions concurrently, as
> doing so can significantly degrade system performance and potentially
> impact production workloads.

Or update bpftrace to cache that file. It's only updated on module load
and unload, which isn't done much. There's no reason it needs to
constantly read that file if bpftrace is being used constantly.


-- Steve
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Thu, Mar 5, 2026 at 10:28 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 5 Mar 2026 10:17:09 +0800
> Yafang Shao <laoar.shao@gmail.com> wrote:
>
> > bpftrace is a widely used tool for online debugging and dynamic
> > tracing. However, sysadmins may unknowingly run multiple bpftrace
> > instances concurrently without realizing the potential impact on
> > system performance.
> >
> > If this is your answer, I believe we should clearly document the
> > following warning:
> >
> >   Warning: Do not read available_filter_functions concurrently, as
> > doing so can significantly degrade system performance and potentially
> > impact production workloads.
>
> Or update bpftrace to cache that file. It's only updated on module load
> and unload, which isn't done much. There's no reason it needs to
> constantly read that file if bpftrace is being used constantly.

Other tools may also read available_filter_functions, requiring each
one to be patched individually to avoid this flaw—a clearly
impractical solution.

-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Steven Rostedt 1 month, 1 week ago
On Thu, 5 Mar 2026 10:33:00 +0800
Yafang Shao <laoar.shao@gmail.com> wrote:

> Other tools may also read available_filter_functions, requiring each
> one to be patched individually to avoid this flaw—a clearly
> impractical solution.

What exactly is the issue? If a task does a while 1 in user space, it
wouldn't be much different. With PREEMPT_LAZY the most a task will spin
in the kernel is one extra tick over a user space task spinning in user
space.

available_filter_functions is definitely not a hot path, so I
personally don't care if it were to use "nospin". My worry is about
adding this "special" mutex for a single corner case, and I want to know
that it's a real bug before we add something special into the kernel.

-- Steve
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 5 Mar 2026 10:33:00 +0800
> Yafang Shao <laoar.shao@gmail.com> wrote:
>
> > Other tools may also read available_filter_functions, requiring each
> > one to be patched individually to avoid this flaw—a clearly
> > impractical solution.
>
> What exactly is the issue?

It makes no sense to spin unnecessarily when it can be avoided. We
continuously improve the kernel to do the right thing—and unnecessary
spinning is certainly not the right thing.

> If a task does a while 1 in user space, it
> wouldn't be much different.

The while loop in user space performs actual work, whereas useless
spinning does nothing but burn CPU cycles. My point is simple: if this
unnecessary spinning isn't already considered an issue, it should
be—it's something that clearly needs improvement.

> With PREEMPT_LAZY the most a task will spin
> in the kernel is one extra tick over a user space task spinning in user
> space.
>
> available_filter_functions is definitely not a hot path, so I
> personally don't care if it were to use "nospin". My worry is about
> adding this "special" mutex for a single corner case, and I want to know
> that its a real bug before we add something special into the kernel.
>
> -- Steve



-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Waiman Long 1 month, 1 week ago
On 3/4/26 10:08 PM, Yafang Shao wrote:
> On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>> On Thu, 5 Mar 2026 10:33:00 +0800
>> Yafang Shao <laoar.shao@gmail.com> wrote:
>>
>>> Other tools may also read available_filter_functions, requiring each
>>> one to be patched individually to avoid this flaw—a clearly
>>> impractical solution.
>> What exactly is the issue?
> It makes no sense to spin unnecessarily when it can be avoided. We
> continuously improve the kernel to do the right thing—and unnecessary
> spinning is certainly not the right thing.
>
>> If a task does a while 1 in user space, it
>> wouldn't be much different.
> The while loop in user space performs actual work, whereas useless
> spinning does nothing but burn CPU cycles. My point is simple: if this
> unnecessary spinning isn't already considered an issue, it should
> be—it's something that clearly needs improvement.

The whole point of optimistic spinning is to reduce the lock acquisition 
latency. If the waiter sleeps, the unlock operation will have to wake up 
the waiter which can have a variable latency depending on how busy the 
system is at the time. Yes, it is burning CPU cycles while spinning.
Most workloads will gain performance with this optimistic spinning 
feature. You do have a point that for system monitoring tools that 
observe the system behavior, they shouldn't burn that much CPU times 
that affect performance of real workload that the tools are monitoring.

BTW, you should expand the commit log of patch 1 to include the 
rationale of why we should add this feature to mutex as the information 
in the cover letter won't get included in the git log if this patch 
series is merged. You should also elaborate in a comment on under what
conditions this new mutex API should be used.

Cheers,
Longman

Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by David Laight 1 month, 1 week ago
On Wed, 4 Mar 2026 23:30:40 -0500
Waiman Long <longman@redhat.com> wrote:

> On 3/4/26 10:08 PM, Yafang Shao wrote:
> > On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt <rostedt@goodmis.org> wrote:  
> >> On Thu, 5 Mar 2026 10:33:00 +0800
> >> Yafang Shao <laoar.shao@gmail.com> wrote:
> >>  
> >>> Other tools may also read available_filter_functions, requiring each
> >>> one to be patched individually to avoid this flaw—a clearly
> >>> impractical solution.  
> >> What exactly is the issue?  
> > It makes no sense to spin unnecessarily when it can be avoided. We
> > continuously improve the kernel to do the right thing—and unnecessary
> > spinning is certainly not the right thing.
> >  
> >> If a task does a while 1 in user space, it
> >> wouldn't be much different.  
> > The while loop in user space performs actual work, whereas useless
> > spinning does nothing but burn CPU cycles. My point is simple: if this
> > unnecessary spinning isn't already considered an issue, it should
> > be—it's something that clearly needs improvement.  
> 
> The whole point of optimistic spinning is to reduce the lock acquisition 
> latency. If the waiter sleeps, the unlock operation will have to wake up 
> the waiter which can have a variable latency depending on how busy the 
> system is at the time. Yes, it is burning CPU cycles while spinning, 
> Most workloads will gain performance with this optimistic spinning 
> feature. You do have a point that for system monitoring tools that 
> observe the system behavior, they shouldn't burn that much CPU times 
> that affect performance of real workload that the tools are monitoring.
> 
> BTW, you should expand the commit log of patch 1 to include the 
> rationale of why we should add this feature to mutex as the information 
> in the cover letter won't get included in the git log if this patch 
> series is merged. You should also elaborate in comment on under what 
> conditions should this this new mutex API be used.

Isn't changing mutex_lock() the wrong place anyway?
What you need is for the code holding the lock to indicate that
it isn't worth waiters spinning because the lock will be held
for a long time.

	David

> 
> Cheers,
> Longman
> 
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Waiman Long 1 month, 1 week ago
On 3/5/26 4:32 AM, David Laight wrote:
> On Wed, 4 Mar 2026 23:30:40 -0500
> Waiman Long <longman@redhat.com> wrote:
>
>> On 3/4/26 10:08 PM, Yafang Shao wrote:
>>> On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>>>> On Thu, 5 Mar 2026 10:33:00 +0800
>>>> Yafang Shao <laoar.shao@gmail.com> wrote:
>>>>   
>>>>> Other tools may also read available_filter_functions, requiring each
>>>>> one to be patched individually to avoid this flaw—a clearly
>>>>> impractical solution.
>>>> What exactly is the issue?
>>> It makes no sense to spin unnecessarily when it can be avoided. We
>>> continuously improve the kernel to do the right thing—and unnecessary
>>> spinning is certainly not the right thing.
>>>   
>>>> If a task does a while 1 in user space, it
>>>> wouldn't be much different.
>>> The while loop in user space performs actual work, whereas useless
>>> spinning does nothing but burn CPU cycles. My point is simple: if this
>>> unnecessary spinning isn't already considered an issue, it should
>>> be—it's something that clearly needs improvement.
>> The whole point of optimistic spinning is to reduce the lock acquisition
>> latency. If the waiter sleeps, the unlock operation will have to wake up
>> the waiter which can have a variable latency depending on how busy the
>> system is at the time. Yes, it is burning CPU cycles while spinning,
>> Most workloads will gain performance with this optimistic spinning
>> feature. You do have a point that for system monitoring tools that
>> observe the system behavior, they shouldn't burn that much CPU times
>> that affect performance of real workload that the tools are monitoring.
>>
>> BTW, you should expand the commit log of patch 1 to include the
>> rationale of why we should add this feature to mutex as the information
>> in the cover letter won't get included in the git log if this patch
>> series is merged. You should also elaborate in comment on under what
>> conditions should this this new mutex API be used.
> Isn't changing mutex_lock() the wrong place anyway?
> What you need is for the code holding the lock to indicate that
> it isn't worth waiters spinning because the lock will be held
> for a long time.

I have actually thought about having a flag somewhere in the mutex 
itself to indicate that optimistic spinning isn't needed. However the 
owner field is running out of usable flag bits. The other option is to 
add it to osq as it doesn't really need to use the full 32 bits for the 
tail. In this case, we can just initialize the mutex to say that we 
don't need optimistic spinning and no new mutex_lock() API will be needed.

Cheers,
Longman

Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Fri, Mar 6, 2026 at 3:00 AM Waiman Long <longman@redhat.com> wrote:
>
> On 3/5/26 4:32 AM, David Laight wrote:
> > On Wed, 4 Mar 2026 23:30:40 -0500
> > Waiman Long <longman@redhat.com> wrote:
> >
> >> On 3/4/26 10:08 PM, Yafang Shao wrote:
> >>> On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >>>> On Thu, 5 Mar 2026 10:33:00 +0800
> >>>> Yafang Shao <laoar.shao@gmail.com> wrote:
> >>>>
> >>>>> Other tools may also read available_filter_functions, requiring each
> >>>>> one to be patched individually to avoid this flaw—a clearly
> >>>>> impractical solution.
> >>>> What exactly is the issue?
> >>> It makes no sense to spin unnecessarily when it can be avoided. We
> >>> continuously improve the kernel to do the right thing—and unnecessary
> >>> spinning is certainly not the right thing.
> >>>
> >>>> If a task does a while 1 in user space, it
> >>>> wouldn't be much different.
> >>> The while loop in user space performs actual work, whereas useless
> >>> spinning does nothing but burn CPU cycles. My point is simple: if this
> >>> unnecessary spinning isn't already considered an issue, it should
> >>> be—it's something that clearly needs improvement.
> >> The whole point of optimistic spinning is to reduce the lock acquisition
> >> latency. If the waiter sleeps, the unlock operation will have to wake up
> >> the waiter which can have a variable latency depending on how busy the
> >> system is at the time. Yes, it is burning CPU cycles while spinning,
> >> Most workloads will gain performance with this optimistic spinning
> >> feature. You do have a point that for system monitoring tools that
> >> observe the system behavior, they shouldn't burn that much CPU times
> >> that affect performance of real workload that the tools are monitoring.
> >>
> >> BTW, you should expand the commit log of patch 1 to include the
> >> rationale of why we should add this feature to mutex as the information
> >> in the cover letter won't get included in the git log if this patch
> >> series is merged. You should also elaborate in comment on under what
> >> conditions should this this new mutex API be used.
> > Isn't changing mutex_lock() the wrong place anyway?
> > What you need is for the code holding the lock to indicate that
> > it isn't worth waiters spinning because the lock will be held
> > for a long time.
>
> I have actually thought about having a flag somewhere in the mutex
> itself to indicate that optimistic spinning isn't needed. However the
> owner field is running out of usable flag bits.

True. Introducing a new MUTEX_FLAGS would likely require a substantial
refactor of the mutex code, which may not be worth it.

> The other option is to
> add it to osq as it doesn't really need to use the full 32 bits for the
> tail. In this case, we can just initialize the mutex to say that we
> don't need optimistic spinning and no new mutex_lock() API will be needed.

I believe a new mutex_lock() variant would be clearer and easier to
understand. It also has the advantage of requiring minimal changes to
the existing mutex code.

-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Thu, Mar 5, 2026 at 12:30 PM Waiman Long <longman@redhat.com> wrote:
>
> On 3/4/26 10:08 PM, Yafang Shao wrote:
> > On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >> On Thu, 5 Mar 2026 10:33:00 +0800
> >> Yafang Shao <laoar.shao@gmail.com> wrote:
> >>
> >>> Other tools may also read available_filter_functions, requiring each
> >>> one to be patched individually to avoid this flaw—a clearly
> >>> impractical solution.
> >> What exactly is the issue?
> > It makes no sense to spin unnecessarily when it can be avoided. We
> > continuously improve the kernel to do the right thing—and unnecessary
> > spinning is certainly not the right thing.
> >
> >> If a task does a while 1 in user space, it
> >> wouldn't be much different.
> > The while loop in user space performs actual work, whereas useless
> > spinning does nothing but burn CPU cycles. My point is simple: if this
> > unnecessary spinning isn't already considered an issue, it should
> > be—it's something that clearly needs improvement.
>
> The whole point of optimistic spinning is to reduce the lock acquisition
> latency. If the waiter sleeps, the unlock operation will have to wake up
> the waiter which can have a variable latency depending on how busy the
> system is at the time. Yes, it is burning CPU cycles while spinning,
> Most workloads will gain performance with this optimistic spinning
> feature. You do have a point that for system monitoring tools that
> observe the system behavior, they shouldn't burn that much CPU times
> that affect performance of real workload that the tools are monitoring.

Exactly. ftrace is intended for debugging and should not significantly
impact real workloads. Therefore, it's reasonable to make it sleep if
it cannot acquire the lock immediately, rather than spinning and
consuming CPU cycles.

>
> BTW, you should expand the commit log of patch 1 to include the
> rationale of why we should add this feature to mutex as the information
> in the cover letter won't get included in the git log if this patch
> series is merged. You should also elaborate in comment on under what
> conditions should this this new mutex API be used.

Sure.  I will update it.

BTW, these issues are notably hard to find. I suspect there are other
locks out there with the same problem.

-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Waiman Long 1 month, 1 week ago
On 3/5/26 12:40 AM, Yafang Shao wrote:
> On Thu, Mar 5, 2026 at 12:30 PM Waiman Long <longman@redhat.com> wrote:
>> On 3/4/26 10:08 PM, Yafang Shao wrote:
>>> On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>>>> On Thu, 5 Mar 2026 10:33:00 +0800
>>>> Yafang Shao <laoar.shao@gmail.com> wrote:
>>>>
>>>>> Other tools may also read available_filter_functions, requiring each
>>>>> one to be patched individually to avoid this flaw—a clearly
>>>>> impractical solution.
>>>> What exactly is the issue?
>>> It makes no sense to spin unnecessarily when it can be avoided. We
>>> continuously improve the kernel to do the right thing—and unnecessary
>>> spinning is certainly not the right thing.
>>>
>>>> If a task does a while 1 in user space, it
>>>> wouldn't be much different.
>>> The while loop in user space performs actual work, whereas useless
>>> spinning does nothing but burn CPU cycles. My point is simple: if this
>>> unnecessary spinning isn't already considered an issue, it should
>>> be—it's something that clearly needs improvement.
>> The whole point of optimistic spinning is to reduce the lock acquisition
>> latency. If the waiter sleeps, the unlock operation will have to wake up
>> the waiter which can have a variable latency depending on how busy the
>> system is at the time. Yes, it is burning CPU cycles while spinning,
>> Most workloads will gain performance with this optimistic spinning
>> feature. You do have a point that for system monitoring tools that
>> observe the system behavior, they shouldn't burn that much CPU times
>> that affect performance of real workload that the tools are monitoring.
> Exactly. ftrace is intended for debugging and should not significantly
> impact real workloads. Therefore, it's reasonable to make it sleep if
> it cannot acquire the lock immediately, rather than spinning and
> consuming CPU cycles.

Your patch series uses wording that gives a negative connotation about 
optimistic spinning, making it look like a bad thing. In fact, it is just 
a request for a new mutex API for use cases where they can suffer higher 
latency in order to minimize the system overhead they incur. So don't 
bad-mouth optimistic spinning and emphasize the use cases you want to 
support with the new API in your next version.

My 2 cents.

Cheers,
Longman

Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Waiman Long 1 month, 1 week ago
On 3/5/26 1:34 PM, Waiman Long wrote:
> On 3/5/26 12:40 AM, Yafang Shao wrote:
>> On Thu, Mar 5, 2026 at 12:30 PM Waiman Long <longman@redhat.com> wrote:
>>> On 3/4/26 10:08 PM, Yafang Shao wrote:
>>>> On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt 
>>>> <rostedt@goodmis.org> wrote:
>>>>> On Thu, 5 Mar 2026 10:33:00 +0800
>>>>> Yafang Shao <laoar.shao@gmail.com> wrote:
>>>>>
>>>>>> Other tools may also read available_filter_functions, requiring each
>>>>>> one to be patched individually to avoid this flaw—a clearly
>>>>>> impractical solution.
>>>>> What exactly is the issue?
>>>> It makes no sense to spin unnecessarily when it can be avoided. We
>>>> continuously improve the kernel to do the right thing—and unnecessary
>>>> spinning is certainly not the right thing.
>>>>
>>>>> If a task does a while 1 in user space, it
>>>>> wouldn't be much different.
>>>> The while loop in user space performs actual work, whereas useless
>>>> spinning does nothing but burn CPU cycles. My point is simple: if this
>>>> unnecessary spinning isn't already considered an issue, it should
>>>> be—it's something that clearly needs improvement.
>>> The whole point of optimistic spinning is to reduce the lock 
>>> acquisition
>>> latency. If the waiter sleeps, the unlock operation will have to 
>>> wake up
>>> the waiter which can have a variable latency depending on how busy the
>>> system is at the time. Yes, it is burning CPU cycles while spinning,
>>> Most workloads will gain performance with this optimistic spinning
>>> feature. You do have a point that for system monitoring tools that
>>> observe the system behavior, they shouldn't burn that much CPU times
>>> that affect performance of real workload that the tools are monitoring.
>> Exactly. ftrace is intended for debugging and should not significantly
>> impact real workloads. Therefore, it's reasonable to make it sleep if
>> it cannot acquire the lock immediately, rather than spinning and
>> consuming CPU cycles.
>
> Your patch series use wordings that give a negative connotation about 
> optimistic spinning making it look like a bad thing. In fact, it is 
> just a request for a new mutex API for use cases where they can suffer 
> higher latency in order to minimize the system overhead they incur. So 
> don't bad-mouth optimistic spinning and emphasize the use cases you 
> want to support with the new API in your next version. 

BTW, for any new mutex API introduced, you should also provide an 
equivalent version in kernel/locking/rtmutex_api.c for PREEMPT_RT kernel.

Cheers,
Longman

Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Fri, Mar 6, 2026 at 2:45 AM Waiman Long <longman@redhat.com> wrote:
>
> On 3/5/26 1:34 PM, Waiman Long wrote:
> > On 3/5/26 12:40 AM, Yafang Shao wrote:
> >> On Thu, Mar 5, 2026 at 12:30 PM Waiman Long <longman@redhat.com> wrote:
> >>> On 3/4/26 10:08 PM, Yafang Shao wrote:
> >>>> On Thu, Mar 5, 2026 at 11:00 AM Steven Rostedt
> >>>> <rostedt@goodmis.org> wrote:
> >>>>> On Thu, 5 Mar 2026 10:33:00 +0800
> >>>>> Yafang Shao <laoar.shao@gmail.com> wrote:
> >>>>>
> >>>>>> Other tools may also read available_filter_functions, requiring each
> >>>>>> one to be patched individually to avoid this flaw—a clearly
> >>>>>> impractical solution.
> >>>>> What exactly is the issue?
> >>>> It makes no sense to spin unnecessarily when it can be avoided. We
> >>>> continuously improve the kernel to do the right thing—and unnecessary
> >>>> spinning is certainly not the right thing.
> >>>>
> >>>>> If a task does a while 1 in user space, it
> >>>>> wouldn't be much different.
> >>>> The while loop in user space performs actual work, whereas useless
> >>>> spinning does nothing but burn CPU cycles. My point is simple: if this
> >>>> unnecessary spinning isn't already considered an issue, it should
> >>>> be—it's something that clearly needs improvement.
> >>> The whole point of optimistic spinning is to reduce the lock
> >>> acquisition
> >>> latency. If the waiter sleeps, the unlock operation will have to
> >>> wake up
> >>> the waiter which can have a variable latency depending on how busy the
> >>> system is at the time. Yes, it is burning CPU cycles while spinning,
> >>> Most workloads will gain performance with this optimistic spinning
> >>> feature. You do have a point that for system monitoring tools that
> >>> observe the system behavior, they shouldn't burn that much CPU times
> >>> that affect performance of real workload that the tools are monitoring.
> >> Exactly. ftrace is intended for debugging and should not significantly
> >> impact real workloads. Therefore, it's reasonable to make it sleep if
> >> it cannot acquire the lock immediately, rather than spinning and
> >> consuming CPU cycles.
> >
> > Your patch series use wordings that give a negative connotation about
> > optimistic spinning making it look like a bad thing.

Perhaps I didn't phrase that well. I do understand that optimistic
spinning is valuable in use cases where we wouldn't want to disable
CONFIG_MUTEX_SPIN_ON_OWNER.

> In fact, it is
> > just a request for a new mutex API for use cases where they can suffer
> > higher latency in order to minimize the system overhead they incur. So
> > don't bad-mouth optimistic spinning and emphasize the use cases you
> > want to support with the new API in your next version.
>
> BTW, for any new mutex API introduced, you should also provide an
> equivalent version in kernel/locking/rtmutex_api.c for PREEMPT_RT kernel.

Thanks for the suggestion.

-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Steven Rostedt 1 month, 1 week ago
On Thu, 5 Mar 2026 13:40:27 +0800
Yafang Shao <laoar.shao@gmail.com> wrote:

> Exactly. ftrace is intended for debugging and should not significantly
> impact real workloads. Therefore, it's reasonable to make it sleep if
> it cannot acquire the lock immediately, rather than spinning and
> consuming CPU cycles.

Actually, ftrace is more than just debugging. It is the infrastructure for
live kernel patching as well.

> 
> >
> > BTW, you should expand the commit log of patch 1 to include the
> > rationale of why we should add this feature to mutex as the information
> > in the cover letter won't get included in the git log if this patch
> > series is merged. You should also elaborate in comment on under what
> > conditions should this new mutex API be used.  
> 
> Sure.  I will update it.
> 
> BTW, these issues are notably hard to find. I suspect there are other
> locks out there with the same problem.

As I mentioned, I'm not against the change. I just want to make sure the
rationale is strong enough to make the change.

One thing that should be modified with your patch is the name. "nospin"
references the implementation of the mutex. Instead it should be called
something like: "noncritical" or "slowpath" stating that the grabbing of
this mutex is not of a critical section.

Maybe an entirely new interface should be defined:


struct slow_mutex;

slow_mutex_lock()
slow_mutex_unlock()

etc,

that makes it obvious that this mutex may be held for long periods of time.
In fact, this would be useful for RT workloads, as these mutexes could be
flagged to warn RT critical tasks if those tasks were to take one of them.

There has been some talk to mark paths in the kernel that RT tasks would
get a SIGKILL if they were to hit a path that is known to be non
deterministic.

-- Steve
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Thu, Mar 5, 2026 at 9:20 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 5 Mar 2026 13:40:27 +0800
> Yafang Shao <laoar.shao@gmail.com> wrote:
>
> > Exactly. ftrace is intended for debugging and should not significantly
> > impact real workloads. Therefore, it's reasonable to make it sleep if
> > it cannot acquire the lock immediately, rather than spinning and
> > consuming CPU cycles.
>
> Actually, ftrace is more than just debugging. It is the infrastructure for
> live kernel patching as well.

good to know.

>
> >
> > >
> > > BTW, you should expand the commit log of patch 1 to include the
> > > rationale of why we should add this feature to mutex as the information
> > > in the cover letter won't get included in the git log if this patch
> > > series is merged. You should also elaborate in comment on under what
> > > conditions should this new mutex API be used.
> >
> > Sure.  I will update it.
> >
> > BTW, these issues are notably hard to find. I suspect there are other
> > locks out there with the same problem.
>
> As I mentioned, I'm not against the change. I just want to make sure the
> rationale is strong enough to make the change.
>
> One thing that should be modified with your patch is the name. "nospin"
> references the implementation of the mutex. Instead it should be called
> something like: "noncritical" or "slowpath" stating that the grabbing of
> this mutex is not of a critical section.
>
> Maybe an entirely new interface should be defined:
>
>
> struct slow_mutex;

Is it necessary to define a new structure for this slow mutex? We
could simply reuse the existing struct mutex instead. Alternatively,
should we add some new flags to this slow_mutex for debugging
purposes?

>
> slow_mutex_lock()
> slow_mutex_unlock()

These two APIs appear sufficient to handle this use case.

>
> etc,
>
> that makes it obvious that this mutex may be held for long periods of time.
> In fact, this would be useful for RT workloads, as these mutexes could be
> flagged to warn RT critical tasks if those tasks were to take one of them.
>
> There has been some talk to mark paths in the kernel that RT tasks would
> get a SIGKILL if they were to hit a path that is known to be non
> deterministic.

Thanks for your information.

-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by David Laight 1 month, 1 week ago
On Fri, 6 Mar 2026 10:22:11 +0800
Yafang Shao <laoar.shao@gmail.com> wrote:

> On Thu, Mar 5, 2026 at 9:20 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Thu, 5 Mar 2026 13:40:27 +0800
> > Yafang Shao <laoar.shao@gmail.com> wrote:
> >  
> > > Exactly. ftrace is intended for debugging and should not significantly
> > > impact real workloads. Therefore, it's reasonable to make it sleep if
> > > it cannot acquire the lock immediately, rather than spinning and
> > > consuming CPU cycles.  
> >
> > Actually, ftrace is more than just debugging. It is the infrastructure for
> > live kernel patching as well.  
> 
> good to know.
> 
> >  
> > >  
> > > >
> > > > BTW, you should expand the commit log of patch 1 to include the
> > > > rationale of why we should add this feature to mutex as the information
> > > > in the cover letter won't get included in the git log if this patch
> > > > series is merged. You should also elaborate in comment on under what
> > > > conditions should this new mutex API be used.  
> > >
> > > Sure.  I will update it.
> > >
> > > BTW, these issues are notably hard to find. I suspect there are other
> > > locks out there with the same problem.  
> >
> > As I mentioned, I'm not against the change. I just want to make sure the
> > rationale is strong enough to make the change.
> >
> > One thing that should be modified with your patch is the name. "nospin"
> > references the implementation of the mutex. Instead it should be called
> > something like: "noncritical" or "slowpath" stating that the grabbing of
> > this mutex is not of a critical section.
> >
> > Maybe an entirely new interface should be defined:
> >
> >
> > struct slow_mutex;  
> 
> Is it necessary to define a new structure for this slow mutex? We
> could simply reuse the existing struct mutex instead. Alternatively,
> should we add some new flags to this slow_mutex for debugging
> purposes?
> 
> >
> > slow_mutex_lock()
> > slow_mutex_unlock()  
> 
> These two APIs appear sufficient to handle this use case.

Don't semaphores still exist?
IIRC they always sleep.

Although I wonder if the mutex needs to be held for as long as it is.
ISTR one of the tracebacks was on the 'address to name' lookup,
that code will be slow.
Since the mutex can't be held across the multiple reads that are done
to read the full list of tracepoints it must surely be possible to
release it across the name lookup?

	David

> 
> >
> > etc,
> >
> > that makes it obvious that this mutex may be held for long periods of time.
> > In fact, this would be useful for RT workloads, as these mutexes could be
> > flagged to warn RT critical tasks if those tasks were to take one of them.
> >
> > There has been some talk to mark paths in the kernel that RT tasks would
> > get a SIGKILL if they were to hit a path that is known to be non
> > deterministic.  
> 
> Thanks for your information.
> 
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month ago
On Fri, Mar 6, 2026 at 6:00 PM David Laight
<david.laight.linux@gmail.com> wrote:
>
> On Fri, 6 Mar 2026 10:22:11 +0800
> Yafang Shao <laoar.shao@gmail.com> wrote:
>
> > On Thu, Mar 5, 2026 at 9:20 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > > On Thu, 5 Mar 2026 13:40:27 +0800
> > > Yafang Shao <laoar.shao@gmail.com> wrote:
> > >
> > > > Exactly. ftrace is intended for debugging and should not significantly
> > > > impact real workloads. Therefore, it's reasonable to make it sleep if
> > > > it cannot acquire the lock immediately, rather than spinning and
> > > > consuming CPU cycles.
> > >
> > > Actually, ftrace is more than just debugging. It is the infrastructure for
> > > live kernel patching as well.
> >
> > good to know.
> >
> > >
> > > >
> > > > >
> > > > > BTW, you should expand the commit log of patch 1 to include the
> > > > > rationale of why we should add this feature to mutex as the information
> > > > > in the cover letter won't get included in the git log if this patch
> > > > > series is merged. You should also elaborate in comment on under what
> > > > > conditions should this new mutex API be used.
> > > >
> > > > Sure.  I will update it.
> > > >
> > > > BTW, these issues are notably hard to find. I suspect there are other
> > > > locks out there with the same problem.
> > >
> > > As I mentioned, I'm not against the change. I just want to make sure the
> > > rationale is strong enough to make the change.
> > >
> > > One thing that should be modified with your patch is the name. "nospin"
> > > references the implementation of the mutex. Instead it should be called
> > > something like: "noncritical" or "slowpath" stating that the grabbing of
> > > this mutex is not of a critical section.
> > >
> > > Maybe an entirely new interface should be defined:
> > >
> > >
> > > struct slow_mutex;
> >
> > Is it necessary to define a new structure for this slow mutex? We
> > could simply reuse the existing struct mutex instead. Alternatively,
> > should we add some new flags to this slow_mutex for debugging
> > purposes?
> >
> > >
> > > slow_mutex_lock()
> > > slow_mutex_unlock()
> >
> > These two APIs appear sufficient to handle this use case.
>
> Don't semaphores still exist?

While semaphores may present similar challenges, I'm not currently
aware of specific instances that share this exact issue. Should we
encounter any problematic semaphores in production workloads, we can
address them at that time.

> IIRC they always sleep.
>
> Although I wonder if the mutex needs to be held for as long as it is.
> ISTR one of the tracebacks was on the 'address to name' lookup,
> that code will be slow.
> Since the mutex can't be held across the multiple reads that are done
> to read the full list of tracepoints it must surely be possible to
> release it across the name lookup?

That's a great point, though I'm not entirely certain at the moment.
Perhaps Steven can provide further insight.

-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Wed, Mar 4, 2026 at 5:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Mar 04, 2026 at 03:46:49PM +0800, Yafang Shao wrote:
> > Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
> > on the owner for specific heavy locks. This prevents long spinning times
> > that can lead to latency spikes for other tasks on the same runqueue.
>
> This makes no sense; spinning stops on need_resched().

Hello Peter,

The condition to stop spinning on need_resched() relies on the mutex
owner remaining unchanged. However, when multiple tasks contend for
the same lock, the owner can change frequently. This creates a
potential TOCTOU (Time of Check to Time of Use) issue.

  mutex_optimistic_spin
      owner = __mutex_trylock_or_owner(lock);
      mutex_spin_on_owner
          // the __mutex_owner(lock) might get a new owner.
          while (__mutex_owner(lock) == owner)

We observed high CPU pressure in production when this scenario occurred.

Below are the benchmark results when running 10 concurrent tasks:

for i in `seq 0 9`; do
        time cat /sys/kernel/debug/tracing/available_filter_functions
> /dev/null &
done

- before this patch

real    0m4.636s    user 0m0.001s    sys 0m3.773s
real    0m5.157s    user 0m0.001s    sys 0m4.362s
real    0m5.205s    user 0m0.000s    sys 0m4.538s
real    0m5.212s    user 0m0.001s    sys 0m4.700s
real    0m5.246s    user 0m0.001s    sys 0m4.501s
real    0m5.254s    user 0m0.003s    sys 0m4.335s
real    0m5.260s    user 0m0.003s    sys 0m4.525s
real    0m5.267s    user 0m0.004s    sys 0m4.482s
real    0m5.273s    user 0m0.002s    sys 0m4.215s
real    0m5.285s    user 0m0.003s    sys 0m4.373s


- after this patch

real    0m4.733s    user 0m0.002s    sys 0m0.511s
real    0m4.740s    user 0m0.001s    sys 0m0.509s
real    0m4.862s    user 0m0.001s    sys 0m0.513s
real    0m4.884s    user 0m0.000s    sys 0m0.507s
real    0m4.888s    user 0m0.003s    sys 0m0.513s
real    0m4.888s    user 0m0.000s    sys 0m0.511s
real    0m4.886s    user 0m0.003s    sys 0m0.508s
real    0m4.952s    user 0m0.000s    sys 0m0.513s
real    0m4.973s    user 0m0.001s    sys 0m0.510s
real    0m5.042s    user 0m0.002s    sys 0m0.515s

The results show that system time dropped dramatically from ~4.5
seconds to ~0.5 seconds, confirming that the patch can help reduce the
issue.

Please correct me if I've misunderstood anything.

-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Peter Zijlstra 1 month, 1 week ago
On Wed, Mar 04, 2026 at 05:37:31PM +0800, Yafang Shao wrote:
> On Wed, Mar 4, 2026 at 5:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Mar 04, 2026 at 03:46:49PM +0800, Yafang Shao wrote:
> > > Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
> > > on the owner for specific heavy locks. This prevents long spinning times
> > > that can lead to latency spikes for other tasks on the same runqueue.
> >
> > This makes no sense; spinning stops on need_resched().
> 
> Hello Peter,
> 
> The condition to stop spinning on need_resched() relies on the mutex
> owner remaining unchanged. However, when multiple tasks contend for
> the same lock, the owner can change frequently. This creates a
> potential TOCTOU (Time of Check to Time of Use) issue.
> 
>   mutex_optimistic_spin
>       owner = __mutex_trylock_or_owner(lock);
>       mutex_spin_on_owner
>           // the __mutex_owner(lock) might get a new owner.
>           while (__mutex_owner(lock) == owner)
> 

How do these new owners become the owner? Are they succeeding the
__mutex_trylock() that sits before mutex_optimistic_spin() and
effectively starving the spinner?

Something like the below would make a difference if that were so.

---
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index c867f6c15530..0796e77a8c3b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -521,7 +521,7 @@ static __always_inline bool
 mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
 		      struct mutex_waiter *waiter)
 {
-	return false;
+	return __mutex_trylock(lock);
 }
 #endif
 
@@ -614,8 +614,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
 	trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
-	if (__mutex_trylock(lock) ||
-	    mutex_optimistic_spin(lock, ww_ctx, NULL)) {
+	if (mutex_optimistic_spin(lock, ww_ctx, NULL)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
 		if (ww_ctx)

Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Wed, Mar 4, 2026 at 6:11 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Mar 04, 2026 at 05:37:31PM +0800, Yafang Shao wrote:
> > On Wed, Mar 4, 2026 at 5:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Mar 04, 2026 at 03:46:49PM +0800, Yafang Shao wrote:
> > > > Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
> > > > on the owner for specific heavy locks. This prevents long spinning times
> > > > that can lead to latency spikes for other tasks on the same runqueue.
> > >
> > > This makes no sense; spinning stops on need_resched().
> >
> > Hello Peter,
> >
> > The condition to stop spinning on need_resched() relies on the mutex
> > owner remaining unchanged. However, when multiple tasks contend for
> > the same lock, the owner can change frequently. This creates a
> > potential TOCTOU (Time of Check to Time of Use) issue.
> >
> >   mutex_optimistic_spin
> >       owner = __mutex_trylock_or_owner(lock);
> >       mutex_spin_on_owner
> >           // the __mutex_owner(lock) might get a new owner.
> >           while (__mutex_owner(lock) == owner)
> >
>
> How do these new owners become the owner? Are they succeeding the
> __mutex_trylock() that sits before mutex_optimistic_spin() and
> effectively starving the spinner?
>
> Something like the below would make a difference if that were so.

The following change made no difference; concurrent runs still result
in prolonged system time.

real 0m5.265s user 0m0.000s sys 0m4.921s
real 0m5.295s user 0m0.002s sys 0m4.697s
real 0m5.293s user 0m0.003s sys 0m4.844s
real 0m5.303s user 0m0.001s sys 0m4.511s
real 0m5.303s user 0m0.000s sys 0m4.694s
real 0m5.302s user 0m0.002s sys 0m4.677s
real 0m5.313s user 0m0.000s sys 0m4.837s
real 0m5.327s user 0m0.000s sys 0m4.808s
real 0m5.330s user 0m0.001s sys 0m4.893s
real 0m5.358s user 0m0.005s sys 0m4.919s

Our kernel is not built with CONFIG_PREEMPT enabled, so prolonged
system time can lead to CPU pressure and potential latency spikes.
Since we can reliably reproduce this unnecessary spinning, why not
improve it to reduce the overhead?

-- 
Regards
Yafang
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Peter Zijlstra 1 month, 1 week ago
On Wed, Mar 04, 2026 at 07:52:06PM +0800, Yafang Shao wrote:
> On Wed, Mar 4, 2026 at 6:11 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Mar 04, 2026 at 05:37:31PM +0800, Yafang Shao wrote:
> > > On Wed, Mar 4, 2026 at 5:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Wed, Mar 04, 2026 at 03:46:49PM +0800, Yafang Shao wrote:
> > > > > Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
> > > > > on the owner for specific heavy locks. This prevents long spinning times
> > > > > that can lead to latency spikes for other tasks on the same runqueue.
> > > >
> > > > This makes no sense; spinning stops on need_resched().
> > >
> > > Hello Peter,
> > >
> > > The condition to stop spinning on need_resched() relies on the mutex
> > > owner remaining unchanged. However, when multiple tasks contend for
> > > the same lock, the owner can change frequently. This creates a
> > > potential TOCTOU (Time of Check to Time of Use) issue.
> > >
> > >   mutex_optimistic_spin
> > >       owner = __mutex_trylock_or_owner(lock);
> > >       mutex_spin_on_owner
> > >           // the __mutex_owner(lock) might get a new owner.
> > >           while (__mutex_owner(lock) == owner)
> > >
> >
> > How do these new owners become the owner? Are they succeeding the
> > __mutex_trylock() that sits before mutex_optimistic_spin() and
> > effectively starving the spinner?
> >
> > Something like the below would make a difference if that were so.
> 
> The following change made no difference; concurrent runs still result
> in prolonged system time.
> 
> real 0m5.265s user 0m0.000s sys 0m4.921s
> real 0m5.295s user 0m0.002s sys 0m4.697s
> real 0m5.293s user 0m0.003s sys 0m4.844s
> real 0m5.303s user 0m0.001s sys 0m4.511s
> real 0m5.303s user 0m0.000s sys 0m4.694s
> real 0m5.302s user 0m0.002s sys 0m4.677s
> real 0m5.313s user 0m0.000s sys 0m4.837s
> real 0m5.327s user 0m0.000s sys 0m4.808s
> real 0m5.330s user 0m0.001s sys 0m4.893s
> real 0m5.358s user 0m0.005s sys 0m4.919s
> 
> Our kernel is not built with CONFIG_PREEMPT enabled, so prolonged
> system time can lead to CPU pressure and potential latency spikes.
> Since we can reliably reproduce this unnecessary spinning, why not
> improve it to reduce the overhead?

If you cannot explain what the problem is (you haven't), there is
nothing to fix.

Also, current kernels cannot be built without PREEMPT; and if you care
about latency, running a PREEMPT=n kernel is daft. That said,
TIF_NEED_RESCHED should work irrespective of PREEMPT settings, the
PREEMPT settings just affect when and how you end up in schedule().

Even without PREEMPT, if there is a task waiting either the wakeup or
the tick will set TIF_NEED_RESCHED and it should stop spinning. If there
is no task waiting, there is no actual latency, just burning CPU time,
and that isn't a problem per-se.

What should happen is that the first spinner gets the lock next, the
next spinner is then promoted to first spinner and so on.

This chain continues, which means the lock owner is always
on-cpu and good progress is being made and there is no CPU contention,
or the spinner gets marked for preemption (as said, this does not
require PREEMPT=y) and will stop spinning and go sleep, or the owner
goes to sleep and all the spinners stop and also go sleep.

Again, you have not said anything specific enough to figure out what
happens on your end. You said the owner changes, this means there is
progress made. What isn't clear is if any one particular spinner is
starved (that would be a problem) or if this latency spike you observe
is worse than would be from running a while(1) loop, in which case,
that's just how it is.

What is not sane is marking random locks with random properties just
because of a random workload.
Re: [RFC PATCH 1/2] locking: add mutex_lock_nospin()
Posted by Yafang Shao 1 month, 1 week ago
On Wed, Mar 4, 2026 at 8:41 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Mar 04, 2026 at 07:52:06PM +0800, Yafang Shao wrote:
> > On Wed, Mar 4, 2026 at 6:11 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Mar 04, 2026 at 05:37:31PM +0800, Yafang Shao wrote:
> > > > On Wed, Mar 4, 2026 at 5:03 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > On Wed, Mar 04, 2026 at 03:46:49PM +0800, Yafang Shao wrote:
> > > > > > Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
> > > > > > on the owner for specific heavy locks. This prevents long spinning times
> > > > > > that can lead to latency spikes for other tasks on the same runqueue.
> > > > >
> > > > > This makes no sense; spinning stops on need_resched().
> > > >
> > > > Hello Peter,
> > > >
> > > > The condition to stop spinning on need_resched() relies on the mutex
> > > > owner remaining unchanged. However, when multiple tasks contend for
> > > > the same lock, the owner can change frequently. This creates a
> > > > potential TOCTOU (Time of Check to Time of Use) issue.
> > > >
> > > >   mutex_optimistic_spin
> > > >       owner = __mutex_trylock_or_owner(lock);
> > > >       mutex_spin_on_owner
> > > >           // the __mutex_owner(lock) might get a new owner.
> > > >           while (__mutex_owner(lock) == owner)
> > > >
> > >
> > > How do these new owners become the owner? Are they succeeding the
> > > __mutex_trylock() that sits before mutex_optimistic_spin() and
> > > effectively starving the spinner?
> > >
> > > Something like the below would make a difference if that were so.
> >
> > The following change made no difference; concurrent runs still result
> > in prolonged system time.
> >
> > real 0m5.265s user 0m0.000s sys 0m4.921s
> > real 0m5.295s user 0m0.002s sys 0m4.697s
> > real 0m5.293s user 0m0.003s sys 0m4.844s
> > real 0m5.303s user 0m0.001s sys 0m4.511s
> > real 0m5.303s user 0m0.000s sys 0m4.694s
> > real 0m5.302s user 0m0.002s sys 0m4.677s
> > real 0m5.313s user 0m0.000s sys 0m4.837s
> > real 0m5.327s user 0m0.000s sys 0m4.808s
> > real 0m5.330s user 0m0.001s sys 0m4.893s
> > real 0m5.358s user 0m0.005s sys 0m4.919s
> >
> > Our kernel is not built with CONFIG_PREEMPT enabled, so prolonged
> > system time can lead to CPU pressure and potential latency spikes.
> > Since we can reliably reproduce this unnecessary spinning, why not
> > improve it to reduce the overhead?
>
> If you cannot explain what the problem is (you haven't), there is
> nothing to fix.
>
> Also, current kernels cannot be built without PREEMPT; and if you care
> about latency running a PREEMPT=n kernel is daft. That said,
> TIF_NEED_RESCHED should work irrespective of PREEMPT settings, the
> PREEMPT settings just affect when and how you end up in schedule().
>
> Even without PREEMPT, if there is a task waiting either the wakeup or
> the tick will set TIF_NEED_RESCHED and it should stop spinning. If there
> is no task waiting, there is no actual latency, just burning CPU time,
> and that isn't a problem per-se.
>
> What should happen is that the first spinner gets the lock next, the
> next spinner is then promoted to first spinner and so on.
>
> This chain continues, which means the lock owner is always
> on-cpu and good progress is being made and there is no CPU contention,
> or the spinner gets marked for preemption (as said, this does not
> require PREEMPT=y) and will stop spinning and go sleep, or the owner
> goes to sleep and all the spinners stop and also go sleep.
>
> Again, you have not said anything specific enough to figure out what
> happens on your end. You said the owner changes, this means there is
> progress made. What isn't clear is if any one particular spinner is
> starved (that would be a problem)

As far as I can tell, no spinner is starved. The spinner and the
impacted task are interleaved, which is expected behavior.

> or if this latency spike you observe
> is worse than would be from running a while(1) loop, in which case,
> that's just how it is.

The latency spike occurs because the impacted task must wait for the
spinner to voluntarily yield the CPU. In effect, the spinner behaves
similarly to a while (1) {} loop.

So the real problem here is that we should avoid unnecessary spinning.
Is there any reason we have to spin to speed up the ftrace_lock? I
believe not.

>
> What is not sane is marking random locks with random properties just
> because of a random workload.



-- 
Regards
Yafang