[PATCH v9 09/12] bpf/rqspinlock: switch check_timeout() to a clock interface

Posted by Ankur Arora 9 hours ago
check_timeout() reads the current time and, depending on how much time
has elapsed, checks for a deadlock or a timeout, returning 0 on success
or a negative errno when a deadlock or timeout is detected.

Switch this to a clock-style interface, where the function acts as a
clock in the "lock-domain", returning the current time until a deadlock
or timeout occurs. Once that happens, it stops functioning as a clock
and returns a negative errno instead.
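
To illustrate the new convention (a sketch only, not part of the patch;
the real callers go through RES_CHECK_TIMEOUT() as shown in the diff):

	s64 now;

	/* ts previously set up via RES_INIT_TIMEOUT()/RES_RESET_TIMEOUT() */
	now = clock_deadlock(lock, ~0u, &ts);
	if (now < 0)
		return (int)now;	/* -ETIMEDOUT or -EDEADLK */
	/* otherwise @now is the ktime_get_mono_fast_ns() value just sampled */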

Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: bpf@vger.kernel.org
Signed-off-by: Ankur Arora <ankur.a.arora@oracle.com>
---
 kernel/bpf/rqspinlock.c | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index f7d0c8d4644e..ac9b3572e42f 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -196,8 +196,12 @@ static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask)
 	return 0;
 }
 
-static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
-				  struct rqspinlock_timeout *ts)
+/*
+ * Returns the current monotonic time in ns on success, or a negative
+ * errno on failure due to timeout expiration or deadlock detection.
+ */
+static noinline s64 clock_deadlock(rqspinlock_t *lock, u32 mask,
+				   struct rqspinlock_timeout *ts)
 {
 	u64 prev = ts->cur;
 	u64 time;
@@ -207,7 +211,7 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 			return -EDEADLK;
 		ts->cur = ktime_get_mono_fast_ns();
 		ts->timeout_end = ts->cur + ts->duration;
-		return 0;
+		return (s64)ts->cur;
 	}
 
 	time = ktime_get_mono_fast_ns();
@@ -219,11 +223,15 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 	 * checks.
 	 */
 	if (prev + NSEC_PER_MSEC < time) {
+		int ret;
 		ts->cur = time;
-		return check_deadlock_ABBA(lock, mask);
+		ret = check_deadlock_ABBA(lock, mask);
+		if (ret)
+			return ret;
+
 	}
 
-	return 0;
+	return (s64)time;
 }
 
 /*
@@ -234,12 +242,12 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 #define RES_CHECK_TIMEOUT(ts, ret, mask)                              \
 	({                                                            \
 		if (!(ts).spin++)                                     \
-			(ret) = check_timeout((lock), (mask), &(ts)); \
+			(ret) = clock_deadlock((lock), (mask), &(ts));\
 		(ret);                                                \
 	})
 #else
 #define RES_CHECK_TIMEOUT(ts, ret, mask)			      \
-	({ (ret) = check_timeout((lock), (mask), &(ts)); })
+	({ (ret) = clock_deadlock((lock), (mask), &(ts)); })
 #endif
 
 /*
@@ -261,7 +269,8 @@ static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
 int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
 {
 	struct rqspinlock_timeout ts;
-	int val, ret = 0;
+	s64 ret = 0;
+	int val;
 
 	RES_INIT_TIMEOUT(ts);
 	/*
@@ -280,7 +289,7 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
 	val = atomic_read(&lock->val);
 
 	if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
-		if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
+		if (RES_CHECK_TIMEOUT(ts, ret, ~0u) < 0)
 			goto out;
 		cpu_relax();
 		goto retry;
@@ -339,6 +348,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 {
 	struct mcs_spinlock *prev, *next, *node;
 	struct rqspinlock_timeout ts;
+	s64 timeout_err = 0;
 	int idx, ret = 0;
 	u32 old, tail;
 
@@ -405,10 +415,10 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 */
 	if (val & _Q_LOCKED_MASK) {
 		RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
-		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+		res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, timeout_err, _Q_LOCKED_MASK) < 0);
 	}
 
-	if (ret) {
+	if (timeout_err < 0) {
 		/*
 		 * We waited for the locked bit to go back to 0, as the pending
 		 * waiter, but timed out. We need to clear the pending bit since
@@ -420,6 +430,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 		 */
 		clear_pending(lock);
 		lockevent_inc(rqspinlock_lock_timeout);
+		ret = timeout_err;
 		goto err_release_entry;
 	}
 
@@ -567,18 +578,19 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 	 */
 	RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
 	val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
-					   RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+					   RES_CHECK_TIMEOUT(ts, timeout_err, _Q_LOCKED_PENDING_MASK) < 0);
 
 	/* Disable queue destruction when we detect deadlocks. */
-	if (ret == -EDEADLK) {
+	if (timeout_err == -EDEADLK) {
 		if (!next)
 			next = smp_cond_load_relaxed(&node->next, (VAL));
 		arch_mcs_spin_unlock_contended(&next->locked);
+		ret = timeout_err;
 		goto err_release_node;
 	}
 
 waitq_timeout:
-	if (ret) {
+	if (timeout_err < 0) {
 		/*
 		 * If the tail is still pointing to us, then we are the final waiter,
 		 * and are responsible for resetting the tail back to 0. Otherwise, if
@@ -608,6 +620,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
 			WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
 		}
 		lockevent_inc(rqspinlock_lock_timeout);
+		ret = timeout_err;
 		goto err_release_node;
 	}
 
-- 
2.31.1
Re: [PATCH v9 09/12] bpf/rqspinlock: switch check_timeout() to a clock interface
Posted by bot+bpf-ci@kernel.org 9 hours ago
> diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
> index 2fdfa828e3d3..f02b1c5f0222 100644
> --- a/kernel/bpf/rqspinlock.c
> +++ b/kernel/bpf/rqspinlock.c

[ ... ]

> @@ -340,6 +349,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
> {
> 	struct mcs_spinlock *prev, *next, *node;
> 	struct rqspinlock_timeout ts;
> +	s64 timeout_err = 0;
> 	int idx, ret = 0;
> 	u32 old, tail;

[ ... ]

In resilient_queued_spin_lock_slowpath(), when a waiter in the MCS queue
receives a timeout signal (RES_TIMEOUT_VAL) from the previous waiter via
arch_mcs_spin_lock_contended():

> 	if (old & _Q_TAIL_MASK) {
> 		int val;
>
> 		prev = decode_tail(old, rqnodes);
>
> 		/* Link @node into the waitqueue. */
> 		WRITE_ONCE(prev->next, node);
>
> 		val = arch_mcs_spin_lock_contended(&node->locked);
> 		if (val == RES_TIMEOUT_VAL) {
> 			ret = -ETIMEDOUT;
> 			goto waitq_timeout;
> 		}

This sets ret = -ETIMEDOUT but does not set timeout_err. When execution
reaches the waitq_timeout label:

> waitq_timeout:
> -	if (ret) {
> +	if (timeout_err < 0) {

The condition evaluates to false since timeout_err is still 0. This causes
the timeout handling to be skipped entirely, and execution falls through
to the lock acquisition path. If the lock is acquired, the function
returns ret (which is -ETIMEDOUT), making the caller believe the lock was
not acquired when it actually was. This appears to be a lock leak.

Should the goto waitq_timeout path also set timeout_err = -ETIMEDOUT?
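
If so, a minimal fix could look like the sketch below (untested, based
only on the code quoted above):

	val = arch_mcs_spin_lock_contended(&node->locked);
	if (val == RES_TIMEOUT_VAL) {
		/* keep the waitq_timeout handling in sync with ret */
		timeout_err = -ETIMEDOUT;
		ret = -ETIMEDOUT;
		goto waitq_timeout;
	}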


---
AI reviewed your patch. Please fix the bug or reply by email explaining why it is not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/21810635323

AI-authorship-score: low
AI-authorship-explanation: The commit shows typical human developer patterns including a missed edge case during refactoring, which is a common human error rather than systematic AI-generated code issues.
issues-found: 1
issue-severity-score: high
issue-severity-explanation: Lock leak bug where the function may acquire a lock but return an error code, causing callers to think acquisition failed when it actually succeeded.