[PATCH v2 8/8] locking/rwsem: Restore old write lock slow path behavior

Posted by Waiman Long 2 years, 5 months ago
An earlier commit ("locking/rwsem: Rework writer wakeup") moves writer
lock acquisition into the writer wakeup path only. This can result in
an almost immediate transfer of write lock ownership after an unlock,
leaving little time for lock stealing. That transfer can happen long
before the new write lock owner wakes up and runs its critical section.

As a result, write lock stealing via optimistic spinning is greatly
suppressed. By enabling CONFIG_LOCK_EVENT_COUNTS and running a rwsem
locking microbenchmark on a 2-socket x86-64 test machine for 15s, it
was found that the locking rate was reduced to about 30% of that
before the patch - 584,091 ops/s vs. 171,184 ops/s. The total number
of lock steals within the testing period was reduced to less than 1%
of that before the patch - 4,252,147 vs. 17,939 [1].

To restore the lost performance, this patch restores the old write lock
slow path behavior of acquiring the lock after the waiter has been woken
up, with the exception that lock transfer may still happen in the wakeup
path if the HANDOFF bit has been set. In addition, the waiter that sets
the HANDOFF bit will still try to spin on the lock owner if it is on CPU.

With this patch, the locking rate is now back up to 580,256 ops/s, which
is almost the same as before.
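
As a rough illustration (a simplified user-space model, not the kernel
code; the LOCKED/HANDOFF bits and helper names below are made-up
stand-ins for the real RWSEM_* definitions), the woken writer now has
to trylock for itself, which leaves a window in which an optimistic
spinner can steal the lock unless the HANDOFF bit has been set:

  /* Simplified model; the bit layout is illustrative, not the kernel's. */
  #include <stdatomic.h>
  #include <stdbool.h>

  #define LOCKED   1UL
  #define HANDOFF  2UL

  atomic_ulong sem_count;

  /* Optimistic spinner: may grab the lock before the woken waiter runs. */
  bool steal_write_lock(void)
  {
          unsigned long c = atomic_load(&sem_count);

          while (!(c & (LOCKED | HANDOFF))) {
                  if (atomic_compare_exchange_weak(&sem_count, &c, c | LOCKED))
                          return true;    /* stolen during the wakeup window */
          }
          return false;
  }

  /* Woken writer: acquires the lock itself instead of having it handed over. */
  bool woken_writer_trylock(void)
  {
          unsigned long c = atomic_load(&sem_count);

          do {
                  if (c & LOCKED)
                          return false;   /* lost the race; go back to waiting */
          } while (!atomic_compare_exchange_weak(&sem_count, &c,
                                                 (c | LOCKED) & ~HANDOFF));
          return true;
  }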

[1] https://lore.kernel.org/lkml/c126f079-88a2-4067-6f94-82f51cf5ff2b@redhat.com/

Signed-off-by: Waiman Long <longman@redhat.com>
---
 kernel/locking/rwsem.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 7bd26e64827a..cf9dc1e250e0 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -426,6 +426,7 @@ rwsem_waiter_wake(struct rwsem_waiter *waiter, struct wake_q_head *wake_q)
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 					struct rwsem_waiter *waiter)
 {
+	bool first = (rwsem_first_waiter(sem) == waiter);
 	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
@@ -434,6 +435,9 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	do {
 		new = count;
 
+		if (!first && (count & (RWSEM_FLAG_HANDOFF | RWSEM_LOCK_MASK)))
+			return false;
+
 		if (count & RWSEM_LOCK_MASK) {
 			/*
 			 * A waiter (first or not) can set the handoff bit
@@ -501,11 +505,18 @@ static void rwsem_writer_wake(struct rw_semaphore *sem,
 		 */
 		list_del(&waiter->list);
 		atomic_long_set(&sem->owner, (long)waiter->task);
-
-	} else if (!rwsem_try_write_lock(sem, waiter))
+		rwsem_waiter_wake(waiter, wake_q);
 		return;
+	}
 
-	rwsem_waiter_wake(waiter, wake_q);
+	/*
+	 * Mark writer at the front of the queue for wakeup.
+	 * Until the task is actually awoken later by the caller, other
+	 * writers are able to steal it. Readers, on the other hand, will
+	 * block as they will notice the queued writer.
+	 */
+	wake_q_add(wake_q, waiter->task);
+	lockevent_inc(rwsem_wake_writer);
 }
 
 static void rwsem_reader_wake(struct rw_semaphore *sem,
@@ -1038,6 +1049,23 @@ rwsem_waiter_wait(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
 			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
 			break;
 		}
+		if (!reader) {
+			/*
+			 * Writer still needs to do a trylock here
+			 */
+			raw_spin_lock_irq(&sem->wait_lock);
+			if (waiter->task && rwsem_try_write_lock(sem, waiter))
+				waiter->task = NULL;
+			raw_spin_unlock_irq(&sem->wait_lock);
+			if (!smp_load_acquire(&waiter->task))
+				break;
+
+			if (waiter->handoff_set) {
+				rwsem_spin_on_owner(sem);
+				if (!smp_load_acquire(&waiter->task))
+					break;
+			}
+		}
 		if (signal_pending_state(state, current)) {
 			raw_spin_lock_irq(&sem->wait_lock);
 			if (waiter->task)
-- 
2.31.1
Re: [PATCH v2 8/8] locking/rwsem: Restore old write lock slow path behavior
Posted by Peter Zijlstra 2 years, 5 months ago
On Mon, Mar 27, 2023 at 04:24:13PM -0400, Waiman Long wrote:

>  kernel/locking/rwsem.c | 34 +++++++++++++++++++++++++++++++---
>  1 file changed, 31 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> index 7bd26e64827a..cf9dc1e250e0 100644
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -426,6 +426,7 @@ rwsem_waiter_wake(struct rwsem_waiter *waiter, struct wake_q_head *wake_q)
>  static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
>  					struct rwsem_waiter *waiter)
>  {
> +	bool first = (rwsem_first_waiter(sem) == waiter);
>  	long count, new;
>  
>  	lockdep_assert_held(&sem->wait_lock);
> @@ -434,6 +435,9 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
>  	do {
>  		new = count;
>  
> +		if (!first && (count & (RWSEM_FLAG_HANDOFF | RWSEM_LOCK_MASK)))
> +			return false;
> +
>  		if (count & RWSEM_LOCK_MASK) {
>  			/*
>  			 * A waiter (first or not) can set the handoff bit

I couldn't immediately make sense of the above, and I think there's a
case where not-first would still want to set handoff that you're missing.

After a few detours, I ended up with the below; does that make sense or
did I just make a bigger mess? (entirely possible due to insufficient
sleep, etc.)


--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -426,12 +426,27 @@ rwsem_waiter_wake(struct rwsem_waiter *w
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 					struct rwsem_waiter *waiter)
 {
+	bool first = (rwsem_first_waiter(sem) == waiter);
 	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
 
 	count = atomic_long_read(&sem->count);
 	do {
+		/*
+		 * first handoff
+		 *
+		 *   0     0     | take
+		 *   0     1     | not take
+		 *   1     0     | take
+		 *   1     1     | take
+		 *
+		 */
+		bool handoff = count & RWSEM_FLAG_HANDOFF;
+
+		if (!first && handoff)
+			return false;
+
 		new = count;
 
 		if (count & RWSEM_LOCK_MASK) {
@@ -440,7 +455,7 @@ static inline bool rwsem_try_write_lock(
 			 * if it is an RT task or wait in the wait queue
 			 * for too long.
 			 */
-			if ((count & RWSEM_FLAG_HANDOFF) ||
+			if (handoff ||
 			    (!rt_task(waiter->task) &&
 			     !time_after(jiffies, waiter->timeout)))
 				return false;
@@ -501,11 +516,19 @@ static void rwsem_writer_wake(struct rw_
 		 */
 		list_del(&waiter->list);
 		atomic_long_set(&sem->owner, (long)waiter->task);
-
-	} else if (!rwsem_try_write_lock(sem, waiter))
+		rwsem_waiter_wake(waiter, wake_q);
 		return;
+	}
 
-	rwsem_waiter_wake(waiter, wake_q);
+	/*
+	 * Mark writer at the front of the queue for wakeup.
+	 *
+	 * Until the task is actually awoken later by the caller, other writers
+	 * are able to steal it. Readers, on the other hand, will block as they
+	 * will notice the queued writer.
+	 */
+	wake_q_add(wake_q, waiter->task);
+	lockevent_inc(rwsem_wake_writer);
 }
 
 static void rwsem_reader_wake(struct rw_semaphore *sem,
@@ -1038,6 +1061,25 @@ rwsem_waiter_wait(struct rw_semaphore *s
 			/* Matches rwsem_waiter_wake()'s smp_store_release(). */
 			break;
 		}
+		if (!reader) {
+			/*
+			 * If rwsem_writer_wake() did not take the lock, we must
+			 * rwsem_try_write_lock() here.
+			 */
+			raw_spin_lock_irq(&sem->wait_lock);
+			if (waiter->task && rwsem_try_write_lock(sem, waiter)) {
+				waiter->task = NULL;
+				raw_spin_unlock_irq(&sem->wait_lock);
+				break;
+			}
+			raw_spin_unlock_irq(&sem->wait_lock);
+
+			if (waiter->handoff_set)
+				rwsem_spin_on_owner(sem);
+
+			if (!smp_load_acquire(&waiter->task))
+				break;
+		}
 		if (signal_pending_state(state, current)) {
 			raw_spin_lock_irq(&sem->wait_lock);
 			if (waiter->task)
Re: [PATCH v2 8/8] locking/rwsem: Restore old write lock slow path behavior
Posted by Waiman Long 2 years, 5 months ago
On 3/28/23 10:02, Peter Zijlstra wrote:
> On Mon, Mar 27, 2023 at 04:24:13PM -0400, Waiman Long wrote:
>
>>   kernel/locking/rwsem.c | 34 +++++++++++++++++++++++++++++++---
>>   1 file changed, 31 insertions(+), 3 deletions(-)
>>
>> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
>> index 7bd26e64827a..cf9dc1e250e0 100644
>> --- a/kernel/locking/rwsem.c
>> +++ b/kernel/locking/rwsem.c
>> @@ -426,6 +426,7 @@ rwsem_waiter_wake(struct rwsem_waiter *waiter, struct wake_q_head *wake_q)
>>   static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
>>   					struct rwsem_waiter *waiter)
>>   {
>> +	bool first = (rwsem_first_waiter(sem) == waiter);
>>   	long count, new;
>>   
>>   	lockdep_assert_held(&sem->wait_lock);
>> @@ -434,6 +435,9 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
>>   	do {
>>   		new = count;
>>   
>> +		if (!first && (count & (RWSEM_FLAG_HANDOFF | RWSEM_LOCK_MASK)))
>> +			return false;
>> +
>>   		if (count & RWSEM_LOCK_MASK) {
>>   			/*
>>   			 * A waiter (first or not) can set the handoff bit
> I couldn't immediately make sense of the above, and I think there's case
> where not-first would still want to set handoff you're missing.

It is possible to do that, but we need a minor change to make sure that
you set the handoff_set flag of the first waiter instead of the current
waiter (which is what the current rwsem code does). Other than that, I
think the change is OK, though I need to take a further look at it
tomorrow as it is now late for me too.
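
Roughly, what I have in mind is something like the following inside
rwsem_try_write_lock(), reusing the existing rwsem_first_waiter() helper
and the handoff_set field (just an untested sketch of the idea):

	if (new & RWSEM_FLAG_HANDOFF) {
		/* Credit the handoff to the queue head, not to "waiter". */
		rwsem_first_waiter(sem)->handoff_set = true;
		lockevent_inc(rwsem_wlock_handoff);
		return false;
	}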

Cheers,
longman
