[PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table

Posted by Scott Mitchell 2 months, 2 weeks ago
From: Scott Mitchell <scott.k.mitch1@gmail.com>

The current implementation uses a linear list to find queued packets by
ID when processing verdicts from userspace. With large queue depths and
out-of-order verdicting, this O(n) lookup becomes a significant
bottleneck, causing userspace verdict processing to dominate CPU time.

Replace the linear search with a hash table for O(1) average-case
packet lookup by ID. The hash table size is configurable via the new
NFQA_CFG_HASH_SIZE netlink attribute (default 1024 buckets, matching
NFQNL_QMAX_DEFAULT; max 131072). The size is normalized to a power of
two to enable efficient bitwise masking instead of modulo operations.
Unpatched kernels silently ignore the new attribute, maintaining
backward compatibility.
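
For illustration, a bind request carrying the new attribute could look
roughly like the following libmnl sketch (error handling trimmed; the
queue number and the 8192-bucket value are arbitrary examples):

  #include <arpa/inet.h>
  #include <sys/socket.h>
  #include <libmnl/libmnl.h>
  #include <linux/netfilter/nfnetlink.h>
  #include <linux/netfilter/nfnetlink_queue.h>

  int main(void)
  {
          char buf[MNL_SOCKET_BUFFER_SIZE];
          struct nfqnl_msg_config_cmd cmd = { .command = NFQNL_CFG_CMD_BIND };
          struct mnl_socket *nl = mnl_socket_open(NETLINK_NETFILTER);
          struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
          struct nfgenmsg *nfg;

          nlh->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
          nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

          nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
          nfg->nfgen_family = AF_UNSPEC;
          nfg->version = NFNETLINK_V0;
          nfg->res_id = htons(0);                 /* queue number 0 */

          mnl_attr_put(nlh, NFQA_CFG_CMD, sizeof(cmd), &cmd);
          /* the kernel reads this with ntohl(nla_get_be32()): send big endian */
          mnl_attr_put_u32(nlh, NFQA_CFG_HASH_SIZE, htonl(8192));

          mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID);
          mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
          mnl_socket_close(nl);
          return 0;
  }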

The existing list data structure is retained for operations requiring
linear iteration (e.g. flush, device down events). Hot fields
(queue_hash_mask, queue_hash pointer) are placed in the same cache line
as the spinlock and packet counters for optimal memory access patterns.

Signed-off-by: Scott Mitchell <scott.k.mitch1@gmail.com>
Tested-by: syzbot@syzkaller.appspotmail.com
---
Changes in v5:
- Use GFP_ATOMIC with kvmalloc_array instead of GFP_KERNEL_ACCOUNT due to
  rcu_read_lock held in nfqnl_recv_config. Add comment explaining that
  GFP_KERNEL_ACCOUNT would require lock refactoring (Florian Westphal)

Changes in v4:
- Fix sleeping while atomic bug: allocate hash table before taking
  spinlock in instance_create() (syzbot)

Changes in v3:
- Simplify hash function to use direct masking (id & mask) instead of
  hash_32() for better cache locality with sequential IDs (Eric Dumazet)

Changes in v2:
- Use kvcalloc/kvfree with GFP_KERNEL_ACCOUNT to support larger hash
  tables with vmalloc fallback (Florian Westphal)
- Remove incorrect comment about concurrent resizes - nfnetlink subsystem
  mutex already serializes config operations (Florian Westphal)
- Fix style: remove unnecessary braces around single-line if (Florian Westphal)

 include/net/netfilter/nf_queue.h              |   1 +
 .../uapi/linux/netfilter/nfnetlink_queue.h    |   1 +
 net/netfilter/nfnetlink_queue.c               | 133 ++++++++++++++++--
 3 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 4aeffddb7586..3d0def310523 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -11,6 +11,7 @@
 /* Each queued (to userspace) skbuff has one of these. */
 struct nf_queue_entry {
 	struct list_head	list;
+	struct hlist_node	hash_node;
 	struct sk_buff		*skb;
 	unsigned int		id;
 	unsigned int		hook_index;	/* index in hook_entries->hook[] */
diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index efcb7c044a74..bc296a17e5aa 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -107,6 +107,7 @@ enum nfqnl_attr_config {
 	NFQA_CFG_QUEUE_MAXLEN,		/* __u32 */
 	NFQA_CFG_MASK,			/* identify which flags to change */
 	NFQA_CFG_FLAGS,			/* value of these flags (__u32) */
+	NFQA_CFG_HASH_SIZE,		/* __u32 hash table size (rounded to power of 2) */
 	__NFQA_CFG_MAX
 };
 #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 8b7b39d8a109..b142fac70ed9 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -46,7 +46,10 @@
 #include <net/netfilter/nf_conntrack.h>
 #endif
 
-#define NFQNL_QMAX_DEFAULT 1024
+#define NFQNL_QMAX_DEFAULT      1024
+#define NFQNL_MIN_HASH_SIZE     16
+#define NFQNL_DEFAULT_HASH_SIZE 1024
+#define NFQNL_MAX_HASH_SIZE     131072
 
 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len
  * includes the header length. Thus, the maximum packet length that we
@@ -65,6 +68,7 @@ struct nfqnl_instance {
 	unsigned int copy_range;
 	unsigned int queue_dropped;
 	unsigned int queue_user_dropped;
+	unsigned int queue_hash_size;
 
 
 	u_int16_t queue_num;			/* number of this queue */
@@ -77,6 +81,8 @@ struct nfqnl_instance {
 	spinlock_t	lock	____cacheline_aligned_in_smp;
 	unsigned int	queue_total;
 	unsigned int	id_sequence;		/* 'sequence' of pkt ids */
+	unsigned int	queue_hash_mask;
+	struct hlist_head *queue_hash;
 	struct list_head queue_list;		/* packets in queue */
 };
 
@@ -95,6 +101,39 @@ static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
 	return net_generic(net, nfnl_queue_net_id);
 }
 
+static inline unsigned int
+nfqnl_packet_hash(u32 id, unsigned int mask)
+{
+	return id & mask;
+}
+
+static inline u32
+nfqnl_normalize_hash_size(u32 hash_size)
+{
+	/* Must be power of two for queue_hash_mask to work correctly.
+	 * Avoid overflow of is_power_of_2 by bounding NFQNL_MAX_HASH_SIZE.
+	 */
+	BUILD_BUG_ON(!is_power_of_2(NFQNL_MIN_HASH_SIZE) ||
+		     !is_power_of_2(NFQNL_DEFAULT_HASH_SIZE) ||
+		     !is_power_of_2(NFQNL_MAX_HASH_SIZE) ||
+		     NFQNL_MAX_HASH_SIZE > 1U << 31);
+
+	if (!hash_size)
+		return NFQNL_DEFAULT_HASH_SIZE;
+
+	/* Clamp to valid range before power of two to avoid overflow */
+	if (hash_size <= NFQNL_MIN_HASH_SIZE)
+		return NFQNL_MIN_HASH_SIZE;
+
+	if (hash_size >= NFQNL_MAX_HASH_SIZE)
+		return NFQNL_MAX_HASH_SIZE;
+
+	if (!is_power_of_2(hash_size))
+		hash_size = roundup_pow_of_two(hash_size);
+
+	return hash_size;
+}
+
 static inline u_int8_t instance_hashfn(u_int16_t queue_num)
 {
 	return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
@@ -114,13 +153,70 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
 	return NULL;
 }
 
+static int
+nfqnl_hash_resize(struct nfqnl_instance *inst, u32 hash_size)
+{
+	struct hlist_head *new_hash, *old_hash;
+	struct nf_queue_entry *entry;
+	unsigned int h, hash_mask;
+
+	hash_size = nfqnl_normalize_hash_size(hash_size);
+	if (hash_size == inst->queue_hash_size)
+		return 0;
+
+	/* GFP_ATOMIC required: called under rcu_read_lock in nfqnl_recv_config.
+	 * Using GFP_KERNEL_ACCOUNT would require refactoring lock placement.
+	 */
+	new_hash = kvmalloc_array(hash_size, sizeof(*new_hash), GFP_ATOMIC);
+	if (!new_hash)
+		return -ENOMEM;
+
+	hash_mask = hash_size - 1;
+
+	for (h = 0; h < hash_size; h++)
+		INIT_HLIST_HEAD(&new_hash[h]);
+
+	spin_lock_bh(&inst->lock);
+
+	list_for_each_entry(entry, &inst->queue_list, list) {
+		/* No hlist_del() since old_hash will be freed and we hold lock */
+		h = nfqnl_packet_hash(entry->id, hash_mask);
+		hlist_add_head(&entry->hash_node, &new_hash[h]);
+	}
+
+	old_hash = inst->queue_hash;
+	inst->queue_hash_size = hash_size;
+	inst->queue_hash_mask = hash_mask;
+	inst->queue_hash = new_hash;
+
+	spin_unlock_bh(&inst->lock);
+
+	kvfree(old_hash);
+
+	return 0;
+}
+
 static struct nfqnl_instance *
-instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
+instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid,
+		u32 hash_size)
 {
 	struct nfqnl_instance *inst;
+	struct hlist_head *queue_hash;
 	unsigned int h;
 	int err;
 
+	hash_size = nfqnl_normalize_hash_size(hash_size);
+
+	/* GFP_ATOMIC required: called under rcu_read_lock in nfqnl_recv_config.
+	 * Using GFP_KERNEL_ACCOUNT would require refactoring lock placement.
+	 */
+	queue_hash = kvmalloc_array(hash_size, sizeof(*queue_hash), GFP_ATOMIC);
+	if (!queue_hash)
+		return ERR_PTR(-ENOMEM);
+
+	for (h = 0; h < hash_size; h++)
+		INIT_HLIST_HEAD(&queue_hash[h]);
+
 	spin_lock(&q->instances_lock);
 	if (instance_lookup(q, queue_num)) {
 		err = -EEXIST;
@@ -133,11 +229,14 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
 		goto out_unlock;
 	}
 
+	inst->queue_hash = queue_hash;
 	inst->queue_num = queue_num;
 	inst->peer_portid = portid;
 	inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
 	inst->copy_range = NFQNL_MAX_COPY_RANGE;
 	inst->copy_mode = NFQNL_COPY_NONE;
+	inst->queue_hash_size = hash_size;
+	inst->queue_hash_mask = hash_size - 1;
 	spin_lock_init(&inst->lock);
 	INIT_LIST_HEAD(&inst->queue_list);
 
@@ -157,6 +256,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
 	kfree(inst);
 out_unlock:
 	spin_unlock(&q->instances_lock);
+	kvfree(queue_hash);
 	return ERR_PTR(err);
 }
 
@@ -172,6 +272,7 @@ instance_destroy_rcu(struct rcu_head *head)
 	rcu_read_lock();
 	nfqnl_flush(inst, NULL, 0);
 	rcu_read_unlock();
+	kvfree(inst->queue_hash);
 	kfree(inst);
 	module_put(THIS_MODULE);
 }
@@ -194,13 +295,17 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
 static inline void
 __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
 {
-       list_add_tail(&entry->list, &queue->queue_list);
-       queue->queue_total++;
+	unsigned int hash = nfqnl_packet_hash(entry->id, queue->queue_hash_mask);
+
+	hlist_add_head(&entry->hash_node, &queue->queue_hash[hash]);
+	list_add_tail(&entry->list, &queue->queue_list);
+	queue->queue_total++;
 }
 
 static void
 __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
 {
+	hlist_del(&entry->hash_node);
 	list_del(&entry->list);
 	queue->queue_total--;
 }
@@ -209,10 +314,11 @@ static struct nf_queue_entry *
 find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
 {
 	struct nf_queue_entry *entry = NULL, *i;
+	unsigned int hash = nfqnl_packet_hash(id, queue->queue_hash_mask);
 
 	spin_lock_bh(&queue->lock);
 
-	list_for_each_entry(i, &queue->queue_list, list) {
+	hlist_for_each_entry(i, &queue->queue_hash[hash], hash_node) {
 		if (i->id == id) {
 			entry = i;
 			break;
@@ -407,8 +513,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
 	spin_lock_bh(&queue->lock);
 	list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
 		if (!cmpfn || cmpfn(entry, data)) {
-			list_del(&entry->list);
-			queue->queue_total--;
+			__dequeue_entry(queue, entry);
 			nfqnl_reinject(entry, NF_DROP);
 		}
 	}
@@ -1483,6 +1588,7 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
 	[NFQA_CFG_QUEUE_MAXLEN]	= { .type = NLA_U32 },
 	[NFQA_CFG_MASK]		= { .type = NLA_U32 },
 	[NFQA_CFG_FLAGS]	= { .type = NLA_U32 },
+	[NFQA_CFG_HASH_SIZE]    = { .type = NLA_U32 },
 };
 
 static const struct nf_queue_handler nfqh = {
@@ -1495,11 +1601,15 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
 {
 	struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
 	u_int16_t queue_num = ntohs(info->nfmsg->res_id);
+	u32 hash_size = 0;
 	struct nfqnl_msg_config_cmd *cmd = NULL;
 	struct nfqnl_instance *queue;
 	__u32 flags = 0, mask = 0;
 	int ret = 0;
 
+	if (nfqa[NFQA_CFG_HASH_SIZE])
+		hash_size = ntohl(nla_get_be32(nfqa[NFQA_CFG_HASH_SIZE]));
+
 	if (nfqa[NFQA_CFG_CMD]) {
 		cmd = nla_data(nfqa[NFQA_CFG_CMD]);
 
@@ -1559,11 +1669,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
 				goto err_out_unlock;
 			}
 			queue = instance_create(q, queue_num,
-						NETLINK_CB(skb).portid);
+						NETLINK_CB(skb).portid, hash_size);
 			if (IS_ERR(queue)) {
 				ret = PTR_ERR(queue);
 				goto err_out_unlock;
 			}
+			hash_size = 0; /* avoid resize later in this function */
 			break;
 		case NFQNL_CFG_CMD_UNBIND:
 			if (!queue) {
@@ -1586,6 +1697,12 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
 		goto err_out_unlock;
 	}
 
+	if (hash_size > 0) {
+		ret = nfqnl_hash_resize(queue, hash_size);
+		if (ret)
+			goto err_out_unlock;
+	}
+
 	if (nfqa[NFQA_CFG_PARAMS]) {
 		struct nfqnl_msg_config_params *params =
 			nla_data(nfqa[NFQA_CFG_PARAMS]);
-- 
2.39.5 (Apple Git-154)
Re: [PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Posted by Pablo Neira Ayuso 3 weeks, 4 days ago
Hi Scott,

> diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
> index 4aeffddb7586..3d0def310523 100644
> --- a/include/net/netfilter/nf_queue.h
> +++ b/include/net/netfilter/nf_queue.h
> @@ -11,6 +11,7 @@
>  /* Each queued (to userspace) skbuff has one of these. */
>  struct nf_queue_entry {
>  	struct list_head	list;
> +	struct hlist_node	hash_node;
>  	struct sk_buff		*skb;
>  	unsigned int		id;
>  	unsigned int		hook_index;	/* index in hook_entries->hook[] */
> diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
> index efcb7c044a74..bc296a17e5aa 100644
> --- a/include/uapi/linux/netfilter/nfnetlink_queue.h
> +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
> @@ -107,6 +107,7 @@ enum nfqnl_attr_config {
>  	NFQA_CFG_QUEUE_MAXLEN,		/* __u32 */
>  	NFQA_CFG_MASK,			/* identify which flags to change */
>  	NFQA_CFG_FLAGS,			/* value of these flags (__u32) */
> +	NFQA_CFG_HASH_SIZE,		/* __u32 hash table size (rounded to power of 2) */

This should use the rhashtable implementation; I don't find a good
reason why it is not used in the first place for this enhancement.

>  	__NFQA_CFG_MAX
>  };
>  #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> index 8b7b39d8a109..b142fac70ed9 100644
> --- a/net/netfilter/nfnetlink_queue.c
> +++ b/net/netfilter/nfnetlink_queue.c
> @@ -46,7 +46,10 @@
>  #include <net/netfilter/nf_conntrack.h>
>  #endif
>  
> -#define NFQNL_QMAX_DEFAULT 1024
> +#define NFQNL_QMAX_DEFAULT      1024
> +#define NFQNL_MIN_HASH_SIZE     16
> +#define NFQNL_DEFAULT_HASH_SIZE 1024
> +#define NFQNL_MAX_HASH_SIZE     131072
>  
>  /* We're using struct nlattr which has 16bit nla_len. Note that nla_len
>   * includes the header length. Thus, the maximum packet length that we
> @@ -65,6 +68,7 @@ struct nfqnl_instance {
>  	unsigned int copy_range;
>  	unsigned int queue_dropped;
>  	unsigned int queue_user_dropped;
> +	unsigned int queue_hash_size;
>  
>  
>  	u_int16_t queue_num;			/* number of this queue */
> @@ -77,6 +81,8 @@ struct nfqnl_instance {
>  	spinlock_t	lock	____cacheline_aligned_in_smp;
>  	unsigned int	queue_total;
>  	unsigned int	id_sequence;		/* 'sequence' of pkt ids */
> +	unsigned int	queue_hash_mask;
> +	struct hlist_head *queue_hash;
>  	struct list_head queue_list;		/* packets in queue */
>  };
>  
> @@ -95,6 +101,39 @@ static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
>  	return net_generic(net, nfnl_queue_net_id);
>  }
>  
> +static inline unsigned int
> +nfqnl_packet_hash(u32 id, unsigned int mask)
> +{
> +	return id & mask;
> +}
> +
> +static inline u32
> +nfqnl_normalize_hash_size(u32 hash_size)
> +{
> +	/* Must be power of two for queue_hash_mask to work correctly.
> +	 * Avoid overflow of is_power_of_2 by bounding NFQNL_MAX_HASH_SIZE.
> +	 */
> +	BUILD_BUG_ON(!is_power_of_2(NFQNL_MIN_HASH_SIZE) ||
> +		     !is_power_of_2(NFQNL_DEFAULT_HASH_SIZE) ||
> +		     !is_power_of_2(NFQNL_MAX_HASH_SIZE) ||
> +		     NFQNL_MAX_HASH_SIZE > 1U << 31);
> +
> +	if (!hash_size)
> +		return NFQNL_DEFAULT_HASH_SIZE;
> +
> +	/* Clamp to valid range before power of two to avoid overflow */
> +	if (hash_size <= NFQNL_MIN_HASH_SIZE)
> +		return NFQNL_MIN_HASH_SIZE;
> +
> +	if (hash_size >= NFQNL_MAX_HASH_SIZE)
> +		return NFQNL_MAX_HASH_SIZE;
> +
> +	if (!is_power_of_2(hash_size))
> +		hash_size = roundup_pow_of_two(hash_size);
> +
> +	return hash_size;
> +}
> +
>  static inline u_int8_t instance_hashfn(u_int16_t queue_num)
>  {
>  	return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
> @@ -114,13 +153,70 @@ instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
>  	return NULL;
>  }
>  
> +static int
> +nfqnl_hash_resize(struct nfqnl_instance *inst, u32 hash_size)

rhashtable can just handle this for you, so users do not need
to tune this hash_size parameter.

> +{
> +	struct hlist_head *new_hash, *old_hash;
> +	struct nf_queue_entry *entry;
> +	unsigned int h, hash_mask;
> +
> +	hash_size = nfqnl_normalize_hash_size(hash_size);
> +	if (hash_size == inst->queue_hash_size)
> +		return 0;
> +
> +	/* GFP_ATOMIC required: called under rcu_read_lock in nfqnl_recv_config.
> +	 * Using GFP_KERNEL_ACCOUNT would require refactoring lock placement.
> +	 */
> +	new_hash = kvmalloc_array(hash_size, sizeof(*new_hash), GFP_ATOMIC);
> +	if (!new_hash)
> +		return -ENOMEM;
> +
> +	hash_mask = hash_size - 1;
> +
> +	for (h = 0; h < hash_size; h++)
> +		INIT_HLIST_HEAD(&new_hash[h]);
> +
> +	spin_lock_bh(&inst->lock);
> +
> +	list_for_each_entry(entry, &inst->queue_list, list) {
> +		/* No hlist_del() since old_hash will be freed and we hold lock */
> +		h = nfqnl_packet_hash(entry->id, hash_mask);
> +		hlist_add_head(&entry->hash_node, &new_hash[h]);
> +	}
> +
> +	old_hash = inst->queue_hash;
> +	inst->queue_hash_size = hash_size;
> +	inst->queue_hash_mask = hash_mask;
> +	inst->queue_hash = new_hash;
> +
> +	spin_unlock_bh(&inst->lock);
> +
> +	kvfree(old_hash);
> +
> +	return 0;
> +}
> +
>  static struct nfqnl_instance *
> -instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
> +instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid,
> +		u32 hash_size)
>  {
>  	struct nfqnl_instance *inst;
> +	struct hlist_head *queue_hash;
>  	unsigned int h;
>  	int err;
>  
> +	hash_size = nfqnl_normalize_hash_size(hash_size);
> +
> +	/* GFP_ATOMIC required: called under rcu_read_lock in nfqnl_recv_config.
> +	 * Using GFP_KERNEL_ACCOUNT would require refactoring lock placement.
> +	 */
> +	queue_hash = kvmalloc_array(hash_size, sizeof(*queue_hash), GFP_ATOMIC);

If rhashtable is used, this can be allocated perns (per netns) and then
you avoid this GFP_ATOMIC for each instance.

> +	if (!queue_hash)
> +		return ERR_PTR(-ENOMEM);
> +
> +	for (h = 0; h < hash_size; h++)
> +		INIT_HLIST_HEAD(&queue_hash[h]);
> +
>  	spin_lock(&q->instances_lock);
>  	if (instance_lookup(q, queue_num)) {
>  		err = -EEXIST;
Re: [PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Posted by Scott Mitchell 3 weeks, 3 days ago
> > +     NFQA_CFG_HASH_SIZE,             /* __u32 hash table size (rounded to power of 2) */
>
> This should use the rhashtable implementation, I don't find a good
> reason why this is not used in first place for this enhancement.

Thank you for the review! I can make the changes. Before implementing,
I have a few questions to ensure I understand the preferred approach:

1. For the "perns" allocation comment - which approach did you have in mind:
  a) Shared rhashtable in nfnl_queue_net (initialized in
nfnl_queue_net_init) with key={queue_num, packet_id}
  b) Per-instance rhashtable in nfqnl_instance, with lock refactoring
so initialization happens outside rcu_read_lock
2. The lock refactoring (GFP_ATOMIC → GFP_KERNEL) is independent of
the hash structure choice, correct? We could fix that separately?
3. Can you help me understand the trade-offs you considered for
rhashtable vs hlist_head? Removing the API makes sense, and I want to
better understand how to weigh that against runtime overhead (RCU,
locks, atomic ops) for future design decisions.

I'll use a custom hashfn to preserve the current mask-based hashing
for the incrementing IDs.
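
Something along these lines is what I have in mind (rough sketch only;
the rht_node member and the params name are made up, not part of the v5
patch):

#include <linux/rhashtable.h>
#include <net/netfilter/nf_queue.h>

/* Return the raw id: rhashtable masks the result with the current table
 * size, so sequential ids stay spread one-per-bucket, matching the
 * mask-based placement in v5.  The seed is deliberately ignored.
 */
static u32 nfqnl_id_hashfn(const void *data, u32 len, u32 seed)
{
        const u32 *id = data;

        return *id;
}

static const struct rhashtable_params nfqnl_rht_params = {
        /* rht_node: hypothetical struct rhash_head added to nf_queue_entry */
        .head_offset = offsetof(struct nf_queue_entry, rht_node),
        .key_offset = offsetof(struct nf_queue_entry, id),
        .key_len = sizeof(u32),
        .hashfn = nfqnl_id_hashfn,
        .automatic_shrinking = true,
};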
Re: [PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Posted by Florian Westphal 3 weeks, 1 day ago
Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
> > > +     NFQA_CFG_HASH_SIZE,             /* __u32 hash table size (rounded to power of 2) */
> >
> > This should use the rhashtable implementation, I don't find a good
> > reason why this is not used in first place for this enhancement.
> 
> Thank you for the review! I can make the changes. Before implementing,
> I have a few questions to ensure I understand the preferred approach:
> 
> 1. For the "perns" allocation comment - which approach did you have in mind:
>   a) Shared rhashtable in nfnl_queue_net (initialized in
> nfnl_queue_net_init) with key={queue_num, packet_id}
>   b) Per-instance rhashtable in nfqnl_instance, with lock refactoring

You could also go with c), single rhashtable created at module init
time, like what af_netlink.c is doing.

The hash and compare functions would then have to include struct net *
in the hash and the compare.
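
Roughly like this (all names illustrative; net_hash_mix() comes from
<net/netns/hash.h>, and a real key would also have to carry the queue
number since packet ids are only unique per instance):

struct nfqnl_lookup_key {
        const struct net *net;
        u32 id;
};

static u32 nfqnl_key_hashfn(const void *data, u32 len, u32 seed)
{
        const struct nfqnl_lookup_key *key = data;

        return net_hash_mix(key->net) ^ key->id ^ seed;
}

/* returns 0 on match, as rhashtable expects */
static int nfqnl_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
{
        const struct nfqnl_lookup_key *key = arg->key;
        const struct nf_queue_entry *entry = obj;

        return entry->id != key->id || entry->state.net != key->net;
}

The insert side would need a matching obj_hashfn computing the same
hash from the entry itself.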

b) makes no sense; if you do the lock refactoring to also allow
   GFP_ACCOUNT you could also keep the existing hashtable approach,
   I think.

> 2. The lock refactoring (GFP_ATOMIC → GFP_KERNEL) is independent of
> the hash structure choice, correct? We could fix that separately?

Not needed if you go with a) or c).

> 3. Can you help me understand the trade-offs you considered for
> rhashtable vs hlist_head? Removing the API makes sense, and I want to
> better understand how to weigh that against runtime overhead (RCU,
> locks, atomic ops) for future design decisions.

I think for this, not using rhashtable is fine, but as-is the patch would
allow almost unlimited memory consumption due to the ability to create 64k
queues.
Re: [PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Posted by Scott Mitchell 2 weeks, 6 days ago
Thanks for the clarification on the options. I'd like to propose going
with a refined version of the v5 approach (per-instance hlist_head
with auto-resize). The approach is to submit a v6 series with 2 commits:
1. Refactor instance_create() locking to enable GFP_KERNEL_ACCOUNT
instead of GFP_ATOMIC
2. Add per-instance hash table with auto-resize, no uapi changes (a rough
sketch is at the end of this mail)

Rationale for per-instance approach over shared rhashtable:
1. Resize algorithm matched to nfqueue behavior: outstanding packet
count depends on verdict/traffic rate. rhashtable resizes based on
element count, which for nfqueue means resize-up during bursts
followed by resize-down as verdicts drain the queue to zero. This
burst-drain pattern would cause repeated resize operations. The resize
approach can be tailored to nfqueue use case to reduce resize
thrashing.
2. Per-queue memory attribution/limits: With GFP_KERNEL_ACCOUNT, hash
table allocations are charged to the cgroup that triggered the resize,
so memory consumption is bounded by cgroup limits rather than
requiring an additional kernel-internal limit.
3. Simpler key structure: Avoids composite keys (net, queue_num,
packet_id) needed for a shared hash table.

I'm open to reconsidering the shared rhashtable approach if you feel
the benefits outweigh these tradeoffs.
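
For completeness, a rough sketch of the "no uapi changes" sizing I have
in mind (illustrative only, helper name made up, not final code): derive
the bucket count from the configured queue length, since queue_maxlen
already bounds the number of outstanding packets:

/* called when NFQA_CFG_QUEUE_MAXLEN is set / the instance is created */
static u32 nfqnl_hash_size_for_maxlen(u32 queue_maxlen)
{
        u32 len = clamp_t(u32, queue_maxlen ?: NFQNL_QMAX_DEFAULT,
                          NFQNL_MIN_HASH_SIZE, NFQNL_MAX_HASH_SIZE);

        return roundup_pow_of_two(len);
}

That keeps the table sized for the worst case instead of chasing the
burst/drain pattern with element-count based resizes.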
Re: [PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Posted by Pablo Neira Ayuso 3 weeks, 2 days ago
Hi Scott,

On Tue, Jan 13, 2026 at 08:32:56PM -0500, Scott Mitchell wrote:
> > > +     NFQA_CFG_HASH_SIZE,             /* __u32 hash table size (rounded to power of 2) */
> >
> > This should use the rhashtable implementation, I don't find a good
> > reason why this is not used in first place for this enhancement.
> 
> Thank you for the review! I can make the changes. Before implementing,
> I have a few questions to ensure I understand the preferred approach:
> 
> 1. For the "perns" allocation comment - which approach did you have in mind:
>   a) Shared rhashtable in nfnl_queue_net (initialized in
> nfnl_queue_net_init) with key={queue_num, packet_id}
>   b) Per-instance rhashtable in nfqnl_instance, with lock refactoring
> so initialization happens outside rcu_read_lock

Yes, but...

Florian suggests a single rhashtable for all netns should be good
enough; you only have to include net_hash_mix(net) in the hash.

> 2. The lock refactoring (GFP_ATOMIC → GFP_KERNEL) is independent of
> the hash structure choice, correct? We could fix that separately?

No lock refactoring is needed anymore since the rhashtable would be
initialized only once for all netns, as Florian suggests.

> 3. Can you help me understand the trade-offs you considered for
> rhashtable vs hlist_head? Removing the API makes sense, and I want to
> better understand how to weigh that against runtime overhead (RCU,
> locks, atomic ops) for future design decisions.

Your approach consumes ~1Mbyte per queue instance, and we could end
up with 64k queues per-netns.
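(At the 131072-bucket cap that is 131072 * 8 bytes = 1 MiB of hlist_head
pointers per instance on 64-bit, so with 64k queues up to ~64 GiB in a
single netns.)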

This is exposed to unprivileged containers, which allows userspace
to deplete the atomic reserves since GFP_ATOMIC is used, and since
there is no GFP_ATOMIC_ACCOUNT flag, accounting does not apply in
this case.

While rhashtable is a bit heavyweight, it should consume a lot less
memory and users do not have to do any hashtable bucket tuning.

> I'll use a custom hashfn to preserve the current mask-based hashing
> for the incrementing IDs.

OK.

Thanks.
Re: [PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Posted by Scott Mitchell 2 months ago
Hello folks, friendly ping :) Please let me know if any other changes
are required before merging.

Re: [PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Posted by Florian Westphal 2 months ago
Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
> Hello folks, friendly ping :) Please let me know if any other changes
> are required before merging.

net-next is closed and I don't think that it will re-open before new
year, so this patch (like all others) has to wait.

I could place it in nf-next:testing but it won't speed up the
required pull request.
Re: [PATCH v5] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
Posted by Scott Mitchell 2 months ago
Thanks for the timely response! If you are satisfied I'm happy to have
it placed in nf-next:testing whenever convenient.
