[PATCH bpf-next v3 1/5] bpf: lru: Tidy hash handling in LRU code

Posted by Leon Hwang 1 month ago
The hash field is not used by the LRU list itself.

Setting hash while manipulating the LRU list also obscures the intent
of the code and makes it harder to follow.

Tidy this up by moving the hash assignment to prealloc_lru_pop(),
where the element is prepared for insertion into the hash table.

Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 kernel/bpf/bpf_lru_list.c | 24 +++++++++---------------
 kernel/bpf/bpf_lru_list.h |  5 ++---
 kernel/bpf/hashtab.c      |  5 ++---
 3 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e7a2fc60523f..f4e183a9c28f 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -344,10 +344,8 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 static void __local_list_add_pending(struct bpf_lru *lru,
 				     struct bpf_lru_locallist *loc_l,
 				     int cpu,
-				     struct bpf_lru_node *node,
-				     u32 hash)
+				     struct bpf_lru_node *node)
 {
-	*(u32 *)((void *)node + lru->hash_offset) = hash;
 	node->cpu = cpu;
 	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
 	bpf_lru_node_clear_ref(node);
@@ -393,8 +391,7 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
 	return NULL;
 }
 
-static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
-						    u32 hash)
+static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru)
 {
 	struct list_head *free_list;
 	struct bpf_lru_node *node = NULL;
@@ -415,7 +412,6 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 
 	if (!list_empty(free_list)) {
 		node = list_first_entry(free_list, struct bpf_lru_node, list);
-		*(u32 *)((void *)node + lru->hash_offset) = hash;
 		bpf_lru_node_clear_ref(node);
 		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
 	}
@@ -425,8 +421,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 	return node;
 }
 
-static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
-						    u32 hash)
+static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru)
 {
 	struct bpf_lru_locallist *loc_l, *steal_loc_l;
 	struct bpf_common_lru *clru = &lru->common_lru;
@@ -446,7 +441,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	}
 
 	if (node)
-		__local_list_add_pending(lru, loc_l, cpu, node, hash);
+		__local_list_add_pending(lru, loc_l, cpu, node);
 
 	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
 
@@ -481,19 +476,19 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 
 	if (node) {
 		raw_spin_lock_irqsave(&loc_l->lock, flags);
-		__local_list_add_pending(lru, loc_l, cpu, node, hash);
+		__local_list_add_pending(lru, loc_l, cpu, node);
 		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
 	}
 
 	return node;
 }
 
-struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru)
 {
 	if (lru->percpu)
-		return bpf_percpu_lru_pop_free(lru, hash);
+		return bpf_percpu_lru_pop_free(lru);
 	else
-		return bpf_common_lru_pop_free(lru, hash);
+		return bpf_common_lru_pop_free(lru);
 }
 
 static void bpf_common_lru_push_free(struct bpf_lru *lru,
@@ -643,7 +638,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 	raw_spin_lock_init(&l->lock);
 }
 
-int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+int bpf_lru_init(struct bpf_lru *lru, bool percpu,
 		 del_from_htab_func del_from_htab, void *del_arg)
 {
 	int cpu;
@@ -681,7 +676,6 @@ int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
 	lru->percpu = percpu;
 	lru->del_from_htab = del_from_htab;
 	lru->del_arg = del_arg;
-	lru->hash_offset = hash_offset;
 
 	return 0;
 }
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index fe2661a58ea9..29e8300e0fd1 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -57,7 +57,6 @@ struct bpf_lru {
 	};
 	del_from_htab_func del_from_htab;
 	void *del_arg;
-	unsigned int hash_offset;
 	unsigned int target_free;
 	unsigned int nr_scans;
 	bool percpu;
@@ -69,12 +68,12 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
 		WRITE_ONCE(node->ref, 1);
 }
 
-int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+int bpf_lru_init(struct bpf_lru *lru, bool percpu,
 		 del_from_htab_func del_from_htab, void *delete_arg);
 void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
 		      u32 elem_size, u32 nr_elems);
 void bpf_lru_destroy(struct bpf_lru *lru);
-struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
+struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru);
 void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
 
 #endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 441ff5bc54ac..c2d12db9036a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -296,12 +296,13 @@ static void htab_free_elems(struct bpf_htab *htab)
 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 					  u32 hash)
 {
-	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
+	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru);
 	struct htab_elem *l;
 
 	if (node) {
 		bpf_map_inc_elem_count(&htab->map);
 		l = container_of(node, struct htab_elem, lru_node);
+		l->hash = hash;
 		memcpy(l->key, key, htab->map.key_size);
 		return l;
 	}
@@ -342,8 +343,6 @@ static int prealloc_init(struct bpf_htab *htab)
 	if (htab_is_lru(htab))
 		err = bpf_lru_init(&htab->lru,
 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
-				   offsetof(struct htab_elem, hash) -
-				   offsetof(struct htab_elem, lru_node),
 				   htab_lru_map_delete_node,
 				   htab);
 	else
-- 
2.52.0
Re: [PATCH bpf-next v3 1/5] bpf: lru: Tidy hash handling in LRU code
Posted by Martin KaFai Lau 3 weeks, 2 days ago

On 1/7/26 7:14 AM, Leon Hwang wrote:
> The hash field is not used by the LRU list itself.
> 
> Setting hash while manipulating the LRU list also obscures the intent
> of the code and makes it harder to follow.
> 
> Tidy this up by moving the hash assignment to prealloc_lru_pop(),
> where the element is prepared for insertion into the hash table.
> 
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
>   kernel/bpf/bpf_lru_list.c | 24 +++++++++---------------
>   kernel/bpf/bpf_lru_list.h |  5 ++---
>   kernel/bpf/hashtab.c      |  5 ++---
>   3 files changed, 13 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
> index e7a2fc60523f..f4e183a9c28f 100644
> --- a/kernel/bpf/bpf_lru_list.c
> +++ b/kernel/bpf/bpf_lru_list.c
> @@ -344,10 +344,8 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
>   static void __local_list_add_pending(struct bpf_lru *lru,
>   				     struct bpf_lru_locallist *loc_l,
>   				     int cpu,
> -				     struct bpf_lru_node *node,
> -				     u32 hash)
> +				     struct bpf_lru_node *node)
>   {
> -	*(u32 *)((void *)node + lru->hash_offset) = hash;
>   	node->cpu = cpu;
>   	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
>   	bpf_lru_node_clear_ref(node);
> @@ -393,8 +391,7 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
>   	return NULL;
>   }
>   
> -static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
> -						    u32 hash)
> +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru)
>   {
>   	struct list_head *free_list;
>   	struct bpf_lru_node *node = NULL;
> @@ -415,7 +412,6 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
>   
>   	if (!list_empty(free_list)) {
>   		node = list_first_entry(free_list, struct bpf_lru_node, list);
> -		*(u32 *)((void *)node + lru->hash_offset) = hash;
>   		bpf_lru_node_clear_ref(node);
>   		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);

Initializing the hash value later (after releasing l->lock) is not
correct. The node is now on the inactive list, and the inactive list is
one of the rotate and _evict_ candidates, meaning tgt_l->hash will be
used by htab_lru_map_delete_node(). In practice it does not matter much,
because htab_lru_map_delete_node() simply fails to find the node when it
looks in the wrong bucket. However, it still should not use an
uninitialized value to begin with.
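
Roughly, that callback does the following (simplified sketch; bucket
locking and element-count bookkeeping elided, helper names as in
kernel/bpf/hashtab.c):

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = arg;
	struct htab_elem *l = NULL, *tgt_l;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct bucket *b;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	/* The bucket is selected from the element's hash, so the hash
	 * must already be valid by the time the node can be picked for
	 * eviction.
	 */
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	/* ... lock the bucket ... */
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		if (l == tgt_l) {
			hlist_nulls_del_rcu(&l->hash_node);
			break;
		}
	/* ... unlock the bucket ... */

	return l == tgt_l;
}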

> index 441ff5bc54ac..c2d12db9036a 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -296,12 +296,13 @@ static void htab_free_elems(struct bpf_htab *htab)
>   static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
>   					  u32 hash)
>   {
> -	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
> +	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru);
>   	struct htab_elem *l;
>   
>   	if (node) {
>   		bpf_map_inc_elem_count(&htab->map);
>   		l = container_of(node, struct htab_elem, lru_node);
> +		l->hash = hash;
>   		memcpy(l->key, key, htab->map.key_size);
>   		return l;
>   	}
Re: [PATCH bpf-next v3 1/5] bpf: lru: Tidy hash handling in LRU code
Posted by Leon Hwang 3 weeks, 2 days ago

On 15/1/26 02:44, Martin KaFai Lau wrote:
> 
> 
> On 1/7/26 7:14 AM, Leon Hwang wrote:
>> The hash field is not used by the LRU list itself.
>>
>> Setting hash while manipulating the LRU list also obscures the intent
>> of the code and makes it harder to follow.
>>
>> Tidy this up by moving the hash assignment to prealloc_lru_pop(),
>> where the element is prepared for insertion into the hash table.
>>
>> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
>> ---
>>   kernel/bpf/bpf_lru_list.c | 24 +++++++++---------------
>>   kernel/bpf/bpf_lru_list.h |  5 ++---
>>   kernel/bpf/hashtab.c      |  5 ++---
>>   3 files changed, 13 insertions(+), 21 deletions(-)
>>
>> diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
>> index e7a2fc60523f..f4e183a9c28f 100644
>> --- a/kernel/bpf/bpf_lru_list.c
>> +++ b/kernel/bpf/bpf_lru_list.c
>> @@ -344,10 +344,8 @@ static void bpf_lru_list_pop_free_to_local(struct
>> bpf_lru *lru,
>>   static void __local_list_add_pending(struct bpf_lru *lru,
>>                        struct bpf_lru_locallist *loc_l,
>>                        int cpu,
>> -                     struct bpf_lru_node *node,
>> -                     u32 hash)
>> +                     struct bpf_lru_node *node)
>>   {
>> -    *(u32 *)((void *)node + lru->hash_offset) = hash;
>>       node->cpu = cpu;
>>       node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
>>       bpf_lru_node_clear_ref(node);
>> @@ -393,8 +391,7 @@ __local_list_pop_pending(struct bpf_lru *lru,
>> struct bpf_lru_locallist *loc_l)
>>       return NULL;
>>   }
>>   -static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru
>> *lru,
>> -                            u32 hash)
>> +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru)
>>   {
>>       struct list_head *free_list;
>>       struct bpf_lru_node *node = NULL;
>> @@ -415,7 +412,6 @@ static struct bpf_lru_node
>> *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
>>         if (!list_empty(free_list)) {
>>           node = list_first_entry(free_list, struct bpf_lru_node, list);
>> -        *(u32 *)((void *)node + lru->hash_offset) = hash;
>>           bpf_lru_node_clear_ref(node);
>>           __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
> 
> Initializing the hash value later (after releasing l->lock) is not
> correct. The node is now on the inactive list, and the inactive list
> is one of the rotate and _evict_ candidates, meaning tgt_l->hash will
> be used by htab_lru_map_delete_node(). In practice it does not matter
> much, because htab_lru_map_delete_node() simply fails to find the node
> when it looks in the wrong bucket. However, it still should not use an
> uninitialized value to begin with.
> 

Thanks for the explanation — this is the part I missed earlier.

Without additional context or comments in the code, it was not obvious
why the hash needs to be set at that point.

I’ll drop this change as-is. If you have suggestions for a clearer or
better way to handle the hash assignment while preserving the required
ordering, I’d appreciate your guidance.

Thanks,
Leon
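
One illustrative way to keep the required ordering while making the
intent explicit would be to leave the hash assignment where it is in
bpf_percpu_lru_pop_free() (and likewise in __local_list_add_pending())
and document why it must happen there, e.g. (a sketch only, not part of
this series):

	if (!list_empty(free_list)) {
		node = list_first_entry(free_list, struct bpf_lru_node, list);
		/* Must be written before the node is published on the
		 * inactive list: from that point on it is an eviction
		 * candidate and htab_lru_map_delete_node() reads this
		 * hash to locate the bucket.
		 */
		*(u32 *)((void *)node + lru->hash_offset) = hash;
		bpf_lru_node_clear_ref(node);
		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
	}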