[PATCH bpf-next v2 4/5] bpf: lru: Fix unintended eviction when updating lru hash maps

Posted by Leon Hwang 1 month ago
When updating an existing element in lru_[percpu_,]hash maps, the current
implementation always calls prealloc_lru_pop() to get a new node before
checking if the key already exists. If the map is full, this triggers
LRU eviction and removes an existing element, even though the update
operation only needs to modify the value of an existing key in-place.
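
For example, with a full BPF_MAP_TYPE_LRU_HASH and a key that is already
present, even a plain value rewrite from user space (illustrative snippet)
can end up evicting an unrelated entry:

	/* 'key' already exists in the map; this call only needs to rewrite
	 * its value, yet it may evict some other entry because a new node
	 * is popped from the LRU before the lookup.
	 */
	bpf_map_update_elem(map_fd, &key, &new_value, BPF_EXIST);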

This is problematic because:
1. Users may unexpectedly lose entries when doing simple value updates
2. The eviction overhead is unnecessary for existing key updates

Fix this by reserving one extra LRU node per possible CPU and trying an
in-place style update first: pop the extra node (which never triggers
eviction), then check under the bucket lock whether the key already
exists. If it does, update the value via the extra node without evicting
anything; otherwise, fall back to the existing allocation path.
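
In htab_lru_map_update_elem() and __htab_lru_percpu_map_update_elem() the
resulting flow is roughly (simplified from the diff below):

	ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags,
						b, head, hash, percpu, onallcpus);
	if (!ret)
		return 0;	/* key existed: updated via the extra node, no eviction */
	/* Key absent (or another error): fall back to the original
	 * prealloc_lru_pop() path, which may evict when the map is full.
	 */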

Fixes: 29ba732acbee ("bpf: Add BPF_MAP_TYPE_LRU_HASH")
Fixes: 8f8449384ec3 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH")
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 kernel/bpf/bpf_lru_list.c | 164 +++++++++++++++++++++++++++++++++++---
 kernel/bpf/bpf_lru_list.h |   5 +-
 kernel/bpf/hashtab.c      |  85 ++++++++++++++++++--
 3 files changed, 239 insertions(+), 15 deletions(-)

diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 563707af8035..142b0f10b011 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -124,6 +124,41 @@ static void __bpf_lru_node_move(struct bpf_lru_list *l,
 	list_move(&node->list, &l->lists[tgt_type]);
 }
 
+static struct bpf_lru_node *__bpf_lru_node_move_from_extra(struct bpf_lru_list *l,
+							   enum bpf_lru_list_type tgt_type)
+{
+	struct bpf_lru_node *node;
+
+	node = list_first_entry_or_null(&l->extra, struct bpf_lru_node, list);
+	if (!node)
+		return NULL;
+
+	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
+		return NULL;
+
+	bpf_lru_list_count_inc(l, tgt_type);
+	bpf_lru_node_reset_state(node, tgt_type);
+	list_move(&node->list, &l->lists[tgt_type]);
+	return node;
+}
+
+static bool __bpf_lru_node_move_to_extra(struct bpf_lru_list *l,
+					 struct bpf_lru_node *node)
+{
+	if (!list_empty(&l->extra))
+		return false;
+
+	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
+		return false;
+
+	bpf_lru_move_next_inactive_rotation(l, node);
+
+	bpf_lru_list_count_dec(l, node->type);
+	bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
+	list_move(&node->list, &l->extra);
+	return true;
+}
+
 static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
 {
 	return l->counts[BPF_LRU_LIST_T_INACTIVE] <
@@ -305,6 +340,69 @@ static void __local_list_flush(struct bpf_lru_list *l,
 	}
 }
 
+static struct bpf_lru_node *bpf_percpu_lru_pop_extra(struct bpf_lru *lru)
+{
+	int cpu = raw_smp_processor_id();
+	struct bpf_lru_node *node;
+	struct bpf_lru_list *l;
+	unsigned long flags;
+
+	l = per_cpu_ptr(lru->percpu_lru, cpu);
+
+	raw_spin_lock_irqsave(&l->lock, flags);
+
+	node = __bpf_lru_node_move_from_extra(l, BPF_LRU_LIST_T_ACTIVE);
+
+	raw_spin_unlock_irqrestore(&l->lock, flags);
+
+	return node;
+}
+
+static struct bpf_lru_node *bpf_lru_locallist_extra_pop(struct bpf_lru_locallist *l)
+{
+	struct bpf_lru_node *node;
+
+	node = list_first_entry_or_null(&l->extra, struct bpf_lru_node, list);
+	if (node)
+		list_del(&node->list);
+
+	return node;
+}
+
+static void __local_list_add_pending(struct bpf_lru *lru,
+				     struct bpf_lru_locallist *loc_l,
+				     int cpu,
+				     struct bpf_lru_node *node);
+
+static struct bpf_lru_node *bpf_common_lru_pop_extra(struct bpf_lru *lru)
+{
+	struct bpf_common_lru *clru = &lru->common_lru;
+	int cpu = raw_smp_processor_id();
+	struct bpf_lru_locallist *loc_l;
+	struct bpf_lru_node *node;
+	unsigned long flags;
+
+	loc_l = per_cpu_ptr(clru->local_list, cpu);
+
+	raw_spin_lock_irqsave(&loc_l->lock, flags);
+
+	node = bpf_lru_locallist_extra_pop(loc_l);
+	if (node)
+		__local_list_add_pending(lru, loc_l, cpu, node);
+
+	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+
+	return node;
+}
+
+struct bpf_lru_node *bpf_lru_pop_extra(struct bpf_lru *lru)
+{
+	if (lru->percpu)
+		return bpf_percpu_lru_pop_extra(lru);
+	else
+		return bpf_common_lru_pop_extra(lru);
+}
+
 static void bpf_lru_list_push_free(struct bpf_lru_list *l,
 				   struct bpf_lru_node *node)
 {
@@ -496,6 +594,16 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru)
 		return bpf_common_lru_pop_free(lru);
 }
 
+static bool bpf_lru_locallist_extra_push(struct bpf_lru_locallist *loc_l, struct bpf_lru_node *node)
+{
+	if (!list_empty(&loc_l->extra))
+		return false;
+
+	bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
+	list_move(&node->list, &loc_l->extra);
+	return true;
+}
+
 static void bpf_common_lru_push_free(struct bpf_lru *lru,
 				     struct bpf_lru_node *node)
 {
@@ -518,8 +626,10 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 			goto check_lru_list;
 		}
 
-		bpf_lru_node_reset_state(node, BPF_LRU_LOCAL_LIST_T_FREE);
-		list_move(&node->list, local_free_list(loc_l));
+		if (!bpf_lru_locallist_extra_push(loc_l, node)) {
+			bpf_lru_node_reset_state(node, BPF_LRU_LOCAL_LIST_T_FREE);
+			list_move(&node->list, local_free_list(loc_l));
+		}
 
 		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
 		return;
@@ -539,7 +649,8 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
 
 	raw_spin_lock_irqsave(&l->lock, flags);
 
-	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
+	if (!__bpf_lru_node_move_to_extra(l, node))
+		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
 
 	raw_spin_unlock_irqrestore(&l->lock, flags);
 }
@@ -554,9 +665,11 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
 
 static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 				    u32 node_offset, u32 elem_size,
-				    u32 nr_elems)
+				    u32 nr_elems, u32 nr_extra_elems)
 {
-	struct bpf_lru_list *l = &lru->common_lru.lru_list;
+	struct bpf_common_lru *clru = &lru->common_lru;
+	struct bpf_lru_list *l = &clru->lru_list;
+	int cpu;
 	u32 i;
 
 	for (i = 0; i < nr_elems; i++) {
@@ -570,11 +683,26 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 
 	lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
 				 1, LOCAL_FREE_TARGET);
+
+	if (WARN_ON_ONCE(nr_extra_elems != num_possible_cpus()))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct bpf_lru_locallist *loc_l;
+		struct bpf_lru_node *node;
+
+		loc_l = per_cpu_ptr(clru->local_list, cpu);
+		node = (struct bpf_lru_node *)(buf + node_offset);
+		node->cpu = cpu;
+		bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
+		list_add(&node->list, &loc_l->extra);
+		buf += elem_size;
+	}
 }
 
 static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
 				    u32 node_offset, u32 elem_size,
-				    u32 nr_elems)
+				    u32 nr_elems, u32 nr_extra_elems)
 {
 	u32 i, pcpu_entries;
 	int cpu;
@@ -600,17 +728,31 @@ static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
 		if (i % pcpu_entries)
 			goto again;
 	}
+
+	if (WARN_ON_ONCE(nr_extra_elems != num_possible_cpus()))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct bpf_lru_node *node;
+
+		l = per_cpu_ptr(lru->percpu_lru, cpu);
+		node = (struct bpf_lru_node *)(buf + node_offset);
+		node->cpu = cpu;
+		bpf_lru_node_reset_state(node, BPF_LRU_LIST_T_FREE);
+		list_add(&node->list, &l->extra);
+		buf += elem_size;
+	}
 }
 
 void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
-		      u32 elem_size, u32 nr_elems)
+		      u32 elem_size, u32 nr_elems, u32 nr_extra_elems)
 {
 	if (lru->percpu)
 		bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
-					nr_elems);
+					nr_elems, nr_extra_elems);
 	else
 		bpf_common_lru_populate(lru, buf, node_offset, elem_size,
-					nr_elems);
+					nr_elems, nr_extra_elems);
 }
 
 static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
@@ -620,6 +762,8 @@ static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
 	for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
 		INIT_LIST_HEAD(&loc_l->lists[i]);
 
+	INIT_LIST_HEAD(&loc_l->extra);
+
 	loc_l->next_steal = cpu;
 
 	raw_spin_lock_init(&loc_l->lock);
@@ -637,6 +781,8 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 
 	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 
+	INIT_LIST_HEAD(&l->extra);
+
 	raw_spin_lock_init(&l->lock);
 }
 
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 29e8300e0fd1..446779341b34 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -33,12 +33,14 @@ struct bpf_lru_list {
 	unsigned int counts[NR_BPF_LRU_LIST_COUNT];
 	/* The next inactive list rotation starts from here */
 	struct list_head *next_inactive_rotation;
+	struct list_head extra; /* for percpu lru */
 
 	raw_spinlock_t lock ____cacheline_aligned_in_smp;
 };
 
 struct bpf_lru_locallist {
 	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
+	struct list_head extra; /* for common lru */
 	u16 next_steal;
 	raw_spinlock_t lock;
 };
@@ -71,9 +73,10 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
 int bpf_lru_init(struct bpf_lru *lru, bool percpu,
 		 del_from_htab_func del_from_htab, void *delete_arg);
 void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
-		      u32 elem_size, u32 nr_elems);
+		      u32 elem_size, u32 nr_elems, u32 nr_extra_elems);
 void bpf_lru_destroy(struct bpf_lru *lru);
 struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru);
 void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
+struct bpf_lru_node *bpf_lru_pop_extra(struct bpf_lru *lru);
 
 #endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d029690246f8..8665eb6b8a7d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -207,12 +207,12 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
 }
 
 /* Both percpu and fd htab support in-place update, so no need for
- * extra elem. LRU itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
+ * extra elem. LRU requires extra elems to avoid unintended eviction when
+ * updating the existing elems.
  */
 static bool htab_has_extra_elems(struct bpf_htab *htab)
 {
-	return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
+	return htab_is_lru(htab) || (!htab_is_percpu(htab) && !is_fd_htab(htab));
 }
 
 static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
@@ -313,6 +313,7 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 static int prealloc_init(struct bpf_htab *htab)
 {
 	u32 num_entries = htab->map.max_entries;
+	u32 lru_num_entries = num_entries;
 	int err = -ENOMEM, i;
 
 	if (htab_has_extra_elems(htab))
@@ -354,7 +355,8 @@ static int prealloc_init(struct bpf_htab *htab)
 	if (htab_is_lru(htab))
 		bpf_lru_populate(&htab->lru, htab->elems,
 				 offsetof(struct htab_elem, lru_node),
-				 htab->elem_size, num_entries);
+				 htab->elem_size, lru_num_entries,
+				 num_entries - lru_num_entries);
 	else
 		pcpu_freelist_populate(&htab->freelist,
 				       htab->elems + offsetof(struct htab_elem, fnode),
@@ -557,7 +559,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		if (err)
 			goto free_map_locked;
 
-		if (htab_has_extra_elems(htab)) {
+		if (htab_has_extra_elems(htab) && !htab_is_lru(htab)) {
 			err = alloc_extra_elems(htab);
 			if (err)
 				goto free_prealloc;
@@ -1182,6 +1184,69 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
 }
 
+static int htab_lru_map_update_elem_in_place(struct bpf_htab *htab, void *key, void *value,
+					     u64 map_flags, struct bucket *b,
+					     struct hlist_nulls_head *head, u32 hash,
+					     bool percpu, bool onallcpus)
+{
+	struct htab_elem *l_new, *l_old, *l_free;
+	struct bpf_map *map = &htab->map;
+	u32 key_size = map->key_size;
+	struct bpf_lru_node *node;
+	unsigned long flags;
+	void *l_val;
+	int ret;
+
+	node = bpf_lru_pop_extra(&htab->lru);
+	if (!node)
+		return -ENOENT;
+
+	l_new = container_of(node, struct htab_elem, lru_node);
+	l_new->hash = hash;
+	memcpy(l_new->key, key, key_size);
+	if (!percpu) {
+		l_val = htab_elem_value(l_new, map->key_size);
+		copy_map_value(map, l_val, value);
+		bpf_obj_free_fields(map->record, l_val);
+	}
+
+	ret = htab_lock_bucket(b, &flags);
+	if (ret)
+		goto err_lock_bucket;
+
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
+		goto err;
+
+	if (l_old) {
+		bpf_lru_node_set_ref(&l_new->lru_node);
+		if (percpu) {
+			/* per-cpu hash map can update value in-place.
+			 * Keep the same logic in __htab_lru_percpu_map_update_elem().
+			 */
+			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+					value, onallcpus);
+			l_free = l_new;
+		} else {
+			hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+			hlist_nulls_del_rcu(&l_old->hash_node);
+			l_free = l_old;
+		}
+	} else {
+		ret = -ENOENT;
+	}
+
+err:
+	htab_unlock_bucket(b, flags);
+
+err_lock_bucket:
+	bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);
+
+	return ret;
+}
+
 static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 				     u64 map_flags)
 {
@@ -1206,6 +1271,11 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
+	ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags, b, head, hash, false,
+						false);
+	if (!ret)
+		return 0;
+
 	/* For LRU, we need to alloc before taking bucket's
 	 * spinlock because getting free nodes from LRU may need
 	 * to remove older elements from htab and this removal
@@ -1336,6 +1406,11 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
+	ret = htab_lru_map_update_elem_in_place(htab, key, value, map_flags, b, head, hash, true,
+						onallcpus);
+	if (!ret)
+		return 0;
+
 	/* For LRU, we need to alloc before taking bucket's
 	 * spinlock because LRU's elem alloc may need
 	 * to remove older elem from htab and this removal
-- 
2.52.0
Re: [PATCH bpf-next v2 4/5] bpf: lru: Fix unintended eviction when updating lru hash maps
Posted by bot+bpf-ci@kernel.org 1 month ago
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index d029690246f8..8665eb6b8a7d 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c

[ ... ]

> +static int htab_lru_map_update_elem_in_place(struct bpf_htab *htab, void *key, void *value,
> +					     u64 map_flags, struct bucket *b,
> +					     struct hlist_nulls_head *head, u32 hash,
> +					     bool percpu, bool onallcpus)
> +{
> +	struct htab_elem *l_new, *l_old, *l_free;
> +	struct bpf_map *map = &htab->map;
> +	u32 key_size = map->key_size;
> +	struct bpf_lru_node *node;
> +	unsigned long flags;
> +	void *l_val;
> +	int ret;
> +
> +	node = bpf_lru_pop_extra(&htab->lru);
> +	if (!node)
> +		return -ENOENT;
> +
> +	l_new = container_of(node, struct htab_elem, lru_node);
> +	l_new->hash = hash;
> +	memcpy(l_new->key, key, key_size);
> +	if (!percpu) {
> +		l_val = htab_elem_value(l_new, map->key_size);
> +		copy_map_value(map, l_val, value);
> +		bpf_obj_free_fields(map->record, l_val);
> +	}

Should bpf_obj_free_fields() be called here on l_new? The value was just
copied into l_new and l_new will be inserted into the hash table when
l_old is found. Calling bpf_obj_free_fields() on l_new's value frees the
special fields (kptr, bpf_list_head, etc.) that were just copied, then
l_new is inserted with freed fields. This appears to be a use-after-free
for maps with special fields.

Looking at htab_lru_map_update_elem(), the pattern is to copy the value
and NOT call bpf_obj_free_fields() on the new element. The fields are
only freed when an element is removed from the hash table.

> +
> +	ret = htab_lock_bucket(b, &flags);
> +	if (ret)
> +		goto err_lock_bucket;
> +
> +	l_old = lookup_elem_raw(head, hash, key, key_size);
> +
> +	ret = check_flags(htab, l_old, map_flags);
> +	if (ret)
> +		goto err;
> +
> +	if (l_old) {
> +		bpf_lru_node_set_ref(&l_new->lru_node);
> +		if (percpu) {
> +			/* per-cpu hash map can update value in-place.
> +			 * Keep the same logic in __htab_lru_percpu_map_update_elem().
> +			 */
> +			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
> +					value, onallcpus);
> +			l_free = l_new;
> +		} else {
> +			hlist_nulls_add_head_rcu(&l_new->hash_node, head);
> +			hlist_nulls_del_rcu(&l_old->hash_node);
> +			l_free = l_old;
> +		}
> +	} else {
> +		ret = -ENOENT;
> +	}
> +
> +err:
> +	htab_unlock_bucket(b, flags);
> +
> +err_lock_bucket:
> +	bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);
                                              ^^^^^^^^^^^^^^^^^^

When ret == 0 and !percpu, l_free points to l_old which is being removed
from the hash table. The code calls bpf_lru_push_free() directly instead
of htab_lru_push_free(). Looking at htab_lru_push_free():

    static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
    {
        check_and_free_fields(htab, elem);
        bpf_map_dec_elem_count(&htab->map);
        bpf_lru_push_free(&htab->lru, &elem->lru_node);
    }

It calls check_and_free_fields() to free special fields in the element
before pushing to the free list. The code here bypasses that, which would
leak special fields (kptr, etc.) in l_old when it has them.

Should this use htab_lru_push_free() instead of bpf_lru_push_free() for
the non-error path where l_old is being freed?

> +
> +	return ret;
> +}


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/20720201621
Re: [PATCH bpf-next v2 4/5] bpf: lru: Fix unintended eviction when updating lru hash maps
Posted by Leon Hwang 1 month ago

On 2026/1/5 23:43, bot+bpf-ci@kernel.org wrote:
>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>> index d029690246f8..8665eb6b8a7d 100644
>> --- a/kernel/bpf/hashtab.c
>> +++ b/kernel/bpf/hashtab.c
> 
> [ ... ]
> 
>> +static int htab_lru_map_update_elem_in_place(struct bpf_htab *htab, void *key, void *value,
>> +					     u64 map_flags, struct bucket *b,
>> +					     struct hlist_nulls_head *head, u32 hash,
>> +					     bool percpu, bool onallcpus)
>> +{
>> +	struct htab_elem *l_new, *l_old, *l_free;
>> +	struct bpf_map *map = &htab->map;
>> +	u32 key_size = map->key_size;
>> +	struct bpf_lru_node *node;
>> +	unsigned long flags;
>> +	void *l_val;
>> +	int ret;
>> +
>> +	node = bpf_lru_pop_extra(&htab->lru);
>> +	if (!node)
>> +		return -ENOENT;
>> +
>> +	l_new = container_of(node, struct htab_elem, lru_node);
>> +	l_new->hash = hash;
>> +	memcpy(l_new->key, key, key_size);
>> +	if (!percpu) {
>> +		l_val = htab_elem_value(l_new, map->key_size);
>> +		copy_map_value(map, l_val, value);
>> +		bpf_obj_free_fields(map->record, l_val);
>> +	}
> 
> Should bpf_obj_free_fields() be called here on l_new? The value was just
> copied into l_new and l_new will be inserted into the hash table when
> l_old is found. Calling bpf_obj_free_fields() on l_new's value frees the
> special fields (kptr, bpf_list_head, etc.) that were just copied, then
> l_new is inserted with freed fields. This appears to be a use-after-free
> for maps with special fields.
> 
> Looking at htab_lru_map_update_elem(), the pattern is to copy the value
> and NOT call bpf_obj_free_fields() on the new element. The fields are
> only freed when an element is removed from the hash table.
> 

You're wrong here: the copied value must be cleaned up with
bpf_obj_free_fields().

>> +
>> +	ret = htab_lock_bucket(b, &flags);
>> +	if (ret)
>> +		goto err_lock_bucket;
>> +
>> +	l_old = lookup_elem_raw(head, hash, key, key_size);
>> +
>> +	ret = check_flags(htab, l_old, map_flags);
>> +	if (ret)
>> +		goto err;
>> +
>> +	if (l_old) {
>> +		bpf_lru_node_set_ref(&l_new->lru_node);
>> +		if (percpu) {
>> +			/* per-cpu hash map can update value in-place.
>> +			 * Keep the same logic in __htab_lru_percpu_map_update_elem().
>> +			 */
>> +			pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
>> +					value, onallcpus);
>> +			l_free = l_new;
>> +		} else {
>> +			hlist_nulls_add_head_rcu(&l_new->hash_node, head);
>> +			hlist_nulls_del_rcu(&l_old->hash_node);
>> +			l_free = l_old;
>> +		}
>> +	} else {
>> +		ret = -ENOENT;
>> +	}
>> +
>> +err:
>> +	htab_unlock_bucket(b, flags);
>> +
>> +err_lock_bucket:
>> +	bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);
>                                               ^^^^^^^^^^^^^^^^^^
> 
> When ret == 0 and !percpu, l_free points to l_old which is being removed
> from the hash table. The code calls bpf_lru_push_free() directly instead
> of htab_lru_push_free(). Looking at htab_lru_push_free():
> 
>     static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
>     {
>         check_and_free_fields(htab, elem);
>         bpf_map_dec_elem_count(&htab->map);
>         bpf_lru_push_free(&htab->lru, &elem->lru_node);
>     }
> 
> It calls check_and_free_fields() to free special fields in the element
> before pushing to the free list. The code here bypasses that, which would
> leak special fields (kptr, etc.) in l_old when it has them.
> 
> Should this use htab_lru_push_free() instead of bpf_lru_push_free() for
> the non-error path where l_old is being freed?
> 

You're right here: I did not free the special fields of l_old. But
htab_lru_push_free() shouldn't be used for the non-error path, because
bpf_map_dec_elem_count() is not required there.

I'll fix it in the next revision.
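
Maybe something like this (untested sketch, keeping the element count
untouched):

err_lock_bucket:
	/* When l_old is being removed (!ret && !percpu, so l_free == l_old),
	 * free its special fields before recycling the node, but keep the
	 * element count as-is because the map still holds the same key.
	 */
	if (!ret && !percpu)
		check_and_free_fields(htab, l_free);
	bpf_lru_push_free(&htab->lru, ret ? node : &l_free->lru_node);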

Thanks,
Leon

>> +
>> +	return ret;
>> +}
> 
> 
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
> 
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/20720201621