Currently, a raw_spinlock is taken when adding, deleting, and updating
entries in the BPF LRU map, which can lead to a deadlock when this is
done in NMI context, as described in [1].

Fix this by converting the raw_spinlock_t in bpf_lru_list and
bpf_lru_locallist to rqspinlock_t.
Link: https://lore.kernel.org/bpf/CAEf4BzbTJCUx0D=zjx6+5m5iiGhwLzaP94hnw36ZMDHAf4-U_w@mail.gmail.com/ [1]
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
kernel/bpf/bpf_lru_list.c | 47 +++++++++++++++++++++++----------------
kernel/bpf/bpf_lru_list.h | 5 +++--
2 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e7a2fc60523f..38fddcb1e28c 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -307,9 +307,10 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
return;
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags))
+ return;
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
}
static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
@@ -319,7 +320,8 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
struct bpf_lru_node *node, *tmp_node;
unsigned int nfree = 0;
- raw_spin_lock(&l->lock);
+ if (raw_res_spin_lock(&l->lock))
+ return;
__local_list_flush(l, loc_l);
@@ -338,7 +340,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
- raw_spin_unlock(&l->lock);
+ raw_res_spin_unlock(&l->lock);
}
static void __local_list_add_pending(struct bpf_lru *lru,
@@ -404,7 +406,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
l = per_cpu_ptr(lru->percpu_lru, cpu);
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags))
+ return NULL;
__bpf_lru_list_rotate(lru, l);
@@ -420,7 +423,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
return node;
}
@@ -437,7 +440,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
loc_l = per_cpu_ptr(clru->local_list, cpu);
- raw_spin_lock_irqsave(&loc_l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+ return NULL;
node = __local_list_pop_free(loc_l);
if (!node) {
@@ -448,7 +452,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
if (node)
__local_list_add_pending(lru, loc_l, cpu, node, hash);
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
if (node)
return node;
@@ -466,23 +470,26 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
do {
steal_loc_l = per_cpu_ptr(clru->local_list, steal);
- raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags))
+ goto out_next;
node = __local_list_pop_free(steal_loc_l);
if (!node)
node = __local_list_pop_pending(lru, steal_loc_l);
- raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+out_next:
steal = cpumask_next_wrap(steal, cpu_possible_mask);
} while (!node && steal != first_steal);
loc_l->next_steal = steal;
if (node) {
- raw_spin_lock_irqsave(&loc_l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+ return NULL;
__local_list_add_pending(lru, loc_l, cpu, node, hash);
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
}
return node;
@@ -511,10 +518,11 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
- raw_spin_lock_irqsave(&loc_l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+ return;
if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
goto check_lru_list;
}
@@ -522,7 +530,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
bpf_lru_node_clear_ref(node);
list_move(&node->list, local_free_list(loc_l));
- raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
return;
}
@@ -538,11 +546,12 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
l = per_cpu_ptr(lru->percpu_lru, node->cpu);
- raw_spin_lock_irqsave(&l->lock, flags);
+ if (raw_res_spin_lock_irqsave(&l->lock, flags))
+ return;
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
- raw_spin_unlock_irqrestore(&l->lock, flags);
+ raw_res_spin_unlock_irqrestore(&l->lock, flags);
}
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
@@ -625,7 +634,7 @@ static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
loc_l->next_steal = cpu;
- raw_spin_lock_init(&loc_l->lock);
+ raw_res_spin_lock_init(&loc_l->lock);
}
static void bpf_lru_list_init(struct bpf_lru_list *l)
@@ -640,7 +649,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
- raw_spin_lock_init(&l->lock);
+ raw_res_spin_lock_init(&l->lock);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index fe2661a58ea9..61fc7d7f9de1 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -7,6 +7,7 @@
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/spinlock_types.h>
+#include <asm-generic/rqspinlock.h>
#define NR_BPF_LRU_LIST_T (3)
#define NR_BPF_LRU_LIST_COUNT (2)
@@ -34,13 +35,13 @@ struct bpf_lru_list {
/* The next inactive list rotation starts from here */
struct list_head *next_inactive_rotation;
- raw_spinlock_t lock ____cacheline_aligned_in_smp;
+ rqspinlock_t lock ____cacheline_aligned_in_smp;
};
struct bpf_lru_locallist {
struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
u16 next_steal;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
};
struct bpf_common_lru {
--
2.51.2
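
[Editorial note: the behavioural difference the review below turns on is that
raw_spin_lock_irqsave() cannot fail, while raw_res_spin_lock_irqsave() returns
a non-zero error when the lock cannot be safely acquired, so every call site
grows an error path. A minimal sketch of the caller pattern, assuming only the
rqspinlock API as it appears in the diff above; the struct and function names
here are made up for illustration and are not part of the patch.]

```
/*
 * Illustration only -- not part of the patch. "example_list" and
 * "example_update" are made-up names; the lock API is the one used
 * in the diff above (<asm-generic/rqspinlock.h>). The lock is
 * assumed to have been set up with raw_res_spin_lock_init().
 */
#include <asm-generic/rqspinlock.h>

struct example_list {
	rqspinlock_t lock;
	/* ... state protected by @lock ... */
};

static int example_update(struct example_list *el)
{
	unsigned long flags;
	int err;

	/* May fail instead of spinning forever (e.g. re-entry from NMI). */
	err = raw_res_spin_lock_irqsave(&el->lock, flags);
	if (err)
		return err;	/* caller must handle the failure */

	/* ... update state protected by @lock ... */

	raw_res_spin_unlock_irqrestore(&el->lock, flags);
	return 0;
}
```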
```
commit 8ae9fd98fe53c45a1a89adf6d254c2db90d26544
Author: Menglong Dong <menglong8.dong@gmail.com>
bpf: use rqspinlock for lru map
This patch converts raw_spinlock_t to rqspinlock_t in BPF LRU maps to
prevent NMI context deadlocks. The new lock API can fail gracefully
instead of deadlocking, returning an error code.
Link: https://lore.kernel.org/bpf/CAEf4BzbTJCUx0D=zjx6+5m5iiGhwLzaP94hnw36ZMDHAf4-U_w@mail.gmail.com/
> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
> index e7a2fc605..38fddcb1e 100644
> --- a/kernel/bpf/bpf_lru_list.c
> +++ b/kernel/bpf/bpf_lru_list.c
[ ... ]
> @@ -466,23 +470,26 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
> do {
> steal_loc_l = per_cpu_ptr(clru->local_list, steal);
>
> - raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
> + if (raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags))
> + goto out_next;
>
> node = __local_list_pop_free(steal_loc_l);
> if (!node)
> node = __local_list_pop_pending(lru, steal_loc_l);
>
> - raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
> + raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
>
> +out_next:
> steal = cpumask_next_wrap(steal, cpu_possible_mask);
> } while (!node && steal != first_steal);
>
> loc_l->next_steal = steal;
>
> if (node) {
> - raw_spin_lock_irqsave(&loc_l->lock, flags);
> + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
> + return NULL;
^^^^^^^^^^^
Can this leak the node? At this point in bpf_common_lru_pop_free(),
a node was successfully stolen from another CPU's list. The
__local_list_pop_free() or __local_list_pop_pending() call removed
the node from steal_loc_l, and steal_loc_l was unlocked. The node
now exists but is not in any list.
When raw_res_spin_lock_irqsave() fails here, the function returns
NULL, but the stolen node is never added to the local pending list
and never returned to any list. The node becomes orphaned.
In NMI context (which this patch is designed to handle), lock
failures are expected to be common. Each failure would leak one
pre-allocated node from the BPF LRU pool, eventually exhausting
the available nodes.
> __local_list_add_pending(lru, loc_l, cpu, node, hash);
> - raw_spin_unlock_irqrestore(&loc_l->lock, flags);
> + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
> }
>
> return node;
[ ... ]
```
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
In-Reply-To-Subject: `bpf: use rqspinlock for lru map`
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/18928591681
On Wed, Oct 29, 2025 at 8:38 PM <bot+bpf-ci@kernel.org> wrote:
>
> [......]
>
AI is right. Here and in other places you can just leak the objects.
res_spin_lock() is not a drop-in replacement.
The whole thing needs to be thought through.
On Fri, Oct 31, 2025 at 6:38 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Oct 29, 2025 at 8:38 PM <bot+bpf-ci@kernel.org> wrote:
>
> [......]
>
> AI is right. Here and in other places you can just leak the objects.
> res_spin_lock() is not a drop-in replacement.
> The whole thing needs to be thought through.

Yeah, I see. I'll analyse how to handle the "return" path.

Thanks!
Menglong Dong
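
[Editorial note: to make the ownership issue raised in the review concrete,
here is a self-contained toy sketch in plain userspace C with pthreads. The
types and helpers are made up; this is NOT the BPF LRU code and not the
eventual fix. It only illustrates the pattern: once an object has been taken
off one list, the failure path of the next lock has to hand it back somewhere
rather than simply bail out.]

```
/*
 * Toy illustration of the ownership problem discussed above -- not
 * kernel code and not a proposed fix.
 */
#include <pthread.h>
#include <stddef.h>

struct node {
	struct node *next;
};

struct locked_list {
	pthread_mutex_t lock;
	struct node *head;
};

static struct node *pop(struct locked_list *l)
{
	struct node *n = l->head;

	if (n)
		l->head = n->next;
	return n;
}

static void push(struct locked_list *l, struct node *n)
{
	n->next = l->head;
	l->head = n;
}

/* Steal a node from @src and file it on @dst without leaking it. */
static struct node *steal_node(struct locked_list *src, struct locked_list *dst)
{
	struct node *n;

	pthread_mutex_lock(&src->lock);
	n = pop(src);
	pthread_mutex_unlock(&src->lock);
	if (!n)
		return NULL;

	/* Stand-in for a failable lock such as res_spin_lock(). */
	if (pthread_mutex_trylock(&dst->lock)) {
		/*
		 * Failure path: we still own @n. Returning NULL here
		 * without the give-back below is the leak the review
		 * points out. In the rqspinlock case the give-back
		 * lock itself can fail too, which is why a fallback
		 * that cannot fail has to be designed in.
		 */
		pthread_mutex_lock(&src->lock);
		push(src, n);
		pthread_mutex_unlock(&src->lock);
		return NULL;
	}
	push(dst, n);
	pthread_mutex_unlock(&dst->lock);
	return n;
}
```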