Allow the guest to reuse the hash value to make receive steering
consistent between the host and guest, and to save hash computation.
Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
---
Documentation/networking/tuntap.rst | 7 +++
drivers/net/Kconfig | 1 +
drivers/net/tap.c | 45 ++++++++++++++--
drivers/net/tun.c | 46 ++++++++++++----
drivers/net/tun_vnet.h | 102 +++++++++++++++++++++++++++++++-----
include/linux/if_tap.h | 2 +
include/uapi/linux/if_tun.h | 48 +++++++++++++++++
7 files changed, 223 insertions(+), 28 deletions(-)
diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst
index 4d7087f727be..86b4ae8caa8a 100644
--- a/Documentation/networking/tuntap.rst
+++ b/Documentation/networking/tuntap.rst
@@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it::
return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
}
+3.4 Reference
+-------------
+
+``linux/if_tun.h`` defines the interface described below:
+
+.. kernel-doc:: include/uapi/linux/if_tun.h
+
Universal TUN/TAP device driver Frequently Asked Question
=========================================================
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 9920b3a68ed1..e2a7bd703550 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -395,6 +395,7 @@ config TUN
tristate "Universal TUN/TAP device driver support"
depends on INET
select CRC32
+ select SKB_EXTENSIONS
help
TUN/TAP provides packet reception and transmission for user space
programs. It can be viewed as a simple Point-to-Point or Ethernet
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 9a34ceed0c2c..5e2fbe63ca47 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -179,6 +179,16 @@ static void tap_put_queue(struct tap_queue *q)
sock_put(&q->sk);
}
+static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb)
+{
+ return (struct virtio_net_hash *)skb->cb;
+}
+
+static const struct virtio_net_hash *tap_find_hash(const struct sk_buff *skb)
+{
+ return (const struct virtio_net_hash *)skb->cb;
+}
+
/*
* Select a queue based on the rxq of the device on which this packet
* arrived. If the incoming device is not mq, calculate a flow hash
@@ -189,6 +199,7 @@ static void tap_put_queue(struct tap_queue *q)
static struct tap_queue *tap_get_queue(struct tap_dev *tap,
struct sk_buff *skb)
{
+ struct flow_keys_basic keys_basic;
struct tap_queue *queue = NULL;
/* Access to taps array is protected by rcu, but access to numvtaps
* isn't. Below we use it to lookup a queue, but treat it as a hint
@@ -198,15 +209,32 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
int numvtaps = READ_ONCE(tap->numvtaps);
__u32 rxq;
+ *tap_add_hash(skb) = (struct virtio_net_hash) { .report = VIRTIO_NET_HASH_REPORT_NONE };
+
if (!numvtaps)
goto out;
if (numvtaps == 1)
goto single;
+ if (!skb->l4_hash && !skb->sw_hash) {
+ struct flow_keys keys;
+
+ skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+ rxq = flow_hash_from_keys(&keys);
+ keys_basic = (struct flow_keys_basic) {
+ .control = keys.control,
+ .basic = keys.basic
+ };
+ } else {
+ skb_flow_dissect_flow_keys_basic(NULL, skb, &keys_basic, NULL, 0, 0, 0,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+ rxq = skb->hash;
+ }
+
/* Check if we can use flow to select a queue */
- rxq = skb_get_hash(skb);
if (rxq) {
+ tun_vnet_hash_report(&tap->vnet_hash, skb, &keys_basic, rxq, tap_add_hash);
queue = rcu_dereference(tap->taps[rxq % numvtaps]);
goto out;
}
@@ -713,15 +741,16 @@ static ssize_t tap_put_user(struct tap_queue *q,
int total;
if (q->flags & IFF_VNET_HDR) {
- struct virtio_net_hdr vnet_hdr;
+ struct virtio_net_hdr_v1_hash vnet_hdr;
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
- ret = tun_vnet_hdr_from_skb(q->flags, NULL, skb, &vnet_hdr);
+ ret = tun_vnet_hdr_from_skb(vnet_hdr_len, q->flags, NULL, skb,
+ tap_find_hash, &vnet_hdr);
if (ret < 0)
goto done;
- ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr);
+ ret = tun_vnet_hdr_put(vnet_hdr_len, iter, &vnet_hdr, ret);
if (ret < 0)
goto done;
}
@@ -1025,7 +1054,13 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
return ret;
default:
- return tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags, cmd, sp);
+ rtnl_lock();
+ tap = rtnl_dereference(q->tap);
+ ret = tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags,
+ tap ? &tap->vnet_hash : NULL, -EINVAL,
+ cmd, sp);
+ rtnl_unlock();
+ return ret;
}
}
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index dd8799d19518..27308417b834 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -209,6 +209,7 @@ struct tun_struct {
struct bpf_prog __rcu *xdp_prog;
struct tun_prog __rcu *steering_prog;
struct tun_prog __rcu *filter_prog;
+ struct tun_vnet_hash vnet_hash;
struct ethtool_link_ksettings link_ksettings;
/* init args */
struct file *file;
@@ -451,6 +452,16 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
e->rps_rxhash = hash;
}
+static struct virtio_net_hash *tun_add_hash(struct sk_buff *skb)
+{
+ return skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
+}
+
+static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb)
+{
+ return skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH);
+}
+
/* We try to identify a flow through its rxhash. The reason that
* we do not check rxq no. is because some cards(e.g 82599), chooses
* the rxq based on the txq where the last packet of the flow comes. As
@@ -459,12 +470,17 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
*/
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
+ struct flow_keys keys;
+ struct flow_keys_basic keys_basic;
struct tun_flow_entry *e;
u32 txq, numqueues;
numqueues = READ_ONCE(tun->numqueues);
- txq = __skb_get_hash_symmetric(skb);
+ memset(&keys, 0, sizeof(keys));
+ skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0);
+
+ txq = flow_hash_from_keys(&keys);
e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
if (e) {
tun_flow_save_rps_rxhash(e, txq);
@@ -473,6 +489,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
txq = reciprocal_scale(txq, numqueues);
}
+ keys_basic = (struct flow_keys_basic) {
+ .control = keys.control,
+ .basic = keys.basic
+ };
+ tun_vnet_hash_report(&tun->vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
+ tun_add_hash);
+
return txq;
}
@@ -1990,10 +2013,8 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
size_t total;
if (tun->flags & IFF_VNET_HDR) {
- struct virtio_net_hdr gso = { 0 };
-
vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
- ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
+ ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, NULL, 0);
if (ret < 0)
return ret;
}
@@ -2018,7 +2039,6 @@ static ssize_t tun_put_user(struct tun_struct *tun,
int vlan_offset = 0;
int vlan_hlen = 0;
int vnet_hdr_sz = 0;
- int ret;
if (skb_vlan_tag_present(skb))
vlan_hlen = VLAN_HLEN;
@@ -2043,13 +2063,15 @@ static ssize_t tun_put_user(struct tun_struct *tun,
}
if (vnet_hdr_sz) {
- struct virtio_net_hdr gso;
+ struct virtio_net_hdr_v1_hash gso;
+ int ret;
- ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso);
+ ret = tun_vnet_hdr_from_skb(vnet_hdr_sz, tun->flags, tun->dev, skb,
+ tun_find_hash, &gso);
if (ret < 0)
goto done;
- ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
+ ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso, ret);
if (ret < 0)
goto done;
}
@@ -3055,9 +3077,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
goto unlock;
}
- ret = -EBADFD;
- if (!tun)
+ if (!tun) {
+ ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, cmd, argp);
goto unlock;
+ }
netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);
@@ -3256,7 +3279,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
break;
default:
- ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp);
+ ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags,
+ &tun->vnet_hash, -EINVAL, cmd, argp);
}
if (do_notify)
diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
index c40bde0fdf8c..589a97dd7d02 100644
--- a/drivers/net/tun_vnet.h
+++ b/drivers/net/tun_vnet.h
@@ -6,6 +6,9 @@
#define TUN_VNET_LE 0x80000000
#define TUN_VNET_BE 0x40000000
+typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *);
+typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *);
+
static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags)
{
return !(IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && (flags & TUN_VNET_BE)) &&
@@ -59,18 +62,31 @@ static inline __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val)
}
static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
- unsigned int cmd, int __user *sp)
+ struct tun_vnet_hash *hash, long fallback,
+ unsigned int cmd, void __user *argp)
{
+ static const struct tun_vnet_hash cap = {
+ .flags = TUN_VNET_HASH_REPORT,
+ .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
+ };
+ struct tun_vnet_hash hash_buf;
+ int __user *sp = argp;
int s;
switch (cmd) {
case TUNGETVNETHDRSZ:
+ if (!sz)
+ return -EBADFD;
+
s = *sz;
if (put_user(s, sp))
return -EFAULT;
return 0;
case TUNSETVNETHDRSZ:
+ if (!sz)
+ return -EBADFD;
+
if (get_user(s, sp))
return -EFAULT;
if (s < (int)sizeof(struct virtio_net_hdr))
@@ -80,12 +96,18 @@ static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
return 0;
case TUNGETVNETLE:
+ if (!flags)
+ return -EBADFD;
+
s = !!(*flags & TUN_VNET_LE);
if (put_user(s, sp))
return -EFAULT;
return 0;
case TUNSETVNETLE:
+ if (!flags)
+ return -EBADFD;
+
if (get_user(s, sp))
return -EFAULT;
if (s)
@@ -95,16 +117,56 @@ static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
return 0;
case TUNGETVNETBE:
+ if (!flags)
+ return -EBADFD;
+
return tun_vnet_get_be(*flags, sp);
case TUNSETVNETBE:
+ if (!flags)
+ return -EBADFD;
+
return tun_vnet_set_be(flags, sp);
+ case TUNGETVNETHASHCAP:
+ return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0;
+
+ case TUNSETVNETHASH:
+ if (!hash)
+ return -EBADFD;
+
+ if (copy_from_user(&hash_buf, argp, sizeof(hash_buf)))
+ return -EFAULT;
+
+ *hash = hash_buf;
+ return 0;
+
default:
- return -EINVAL;
+ return fallback;
}
}
+static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash,
+ struct sk_buff *skb,
+ const struct flow_keys_basic *keys,
+ u32 value,
+ tun_vnet_hash_add vnet_hash_add)
+{
+ struct virtio_net_hash *report;
+
+ if (!(hash->flags & TUN_VNET_HASH_REPORT))
+ return;
+
+ report = vnet_hash_add(skb);
+ if (!report)
+ return;
+
+ *report = (struct virtio_net_hash) {
+ .report = virtio_net_hash_report(hash->types, keys),
+ .value = value
+ };
+}
+
static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
struct iov_iter *from,
struct virtio_net_hdr *hdr)
@@ -130,15 +192,15 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
}
static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
- const struct virtio_net_hdr *hdr)
+ const void *hdr, int content_sz)
{
if (iov_iter_count(iter) < sz)
return -EINVAL;
- if (copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))
+ if (copy_to_iter(hdr, content_sz, iter) != content_sz)
return -EFAULT;
- if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
+ if (iov_iter_zero(sz - content_sz, iter) != sz - content_sz)
return -EFAULT;
return 0;
@@ -151,32 +213,48 @@ static inline int tun_vnet_hdr_to_skb(unsigned int flags,
return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags));
}
-static inline int tun_vnet_hdr_from_skb(unsigned int flags,
+static inline int tun_vnet_hdr_from_skb(int sz, unsigned int flags,
const struct net_device *dev,
const struct sk_buff *skb,
- struct virtio_net_hdr *hdr)
+ tun_vnet_hash_find vnet_hash_find,
+ struct virtio_net_hdr_v1_hash *hdr)
{
int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
+ const struct virtio_net_hash *report = sz < sizeof(struct virtio_net_hdr_v1_hash) ?
+ NULL : vnet_hash_find(skb);
+ int content_sz;
+
+ if (report) {
+ content_sz = sizeof(struct virtio_net_hdr_v1_hash);
+
+ *hdr = (struct virtio_net_hdr_v1_hash) {
+ .hdr = { .num_buffers = __cpu_to_virtio16(true, 1) },
+ .hash_value = cpu_to_le32(report->value),
+ .hash_report = cpu_to_le16(report->report)
+ };
+ } else {
+ content_sz = sizeof(struct virtio_net_hdr);
+ }
- if (virtio_net_hdr_from_skb(skb, hdr,
+ if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)hdr,
tun_vnet_is_little_endian(flags), true,
vlan_hlen)) {
struct skb_shared_info *sinfo = skb_shinfo(skb);
if (net_ratelimit()) {
netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
- sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size),
- tun_vnet16_to_cpu(flags, hdr->hdr_len));
+ sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->hdr.gso_size),
+ tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len));
print_hex_dump(KERN_ERR, "tun: ",
DUMP_PREFIX_NONE,
16, 1, skb->head,
- min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true);
+ min(tun_vnet16_to_cpu(flags, hdr->hdr.hdr_len), 64), true);
}
WARN_ON_ONCE(1);
return -EINVAL;
}
- return 0;
+ return content_sz;
}
#endif /* TUN_VNET_H */
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 553552fa635c..5bbb343a6dba 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -4,6 +4,7 @@
#include <net/sock.h>
#include <linux/skb_array.h>
+#include <uapi/linux/if_tun.h>
struct file;
struct socket;
@@ -43,6 +44,7 @@ struct tap_dev {
int numqueues;
netdev_features_t tap_features;
int minor;
+ struct tun_vnet_hash vnet_hash;
void (*update_features)(struct tap_dev *tap, netdev_features_t features);
void (*count_tx_dropped)(struct tap_dev *tap);
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 287cdc81c939..d11e79b4e0dc 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -62,6 +62,34 @@
#define TUNSETCARRIER _IOW('T', 226, int)
#define TUNGETDEVNETNS _IO('T', 227)
+/**
+ * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability.
+ *
+ * The argument is a pointer to &struct tun_vnet_hash which will store the
+ * maximal virtio_net hashing configuration.
+ */
+#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash)
+
+/**
+ * define TUNSETVNETHASH - ioctl to configure virtio_net hashing
+ *
+ * The argument is a pointer to &struct tun_vnet_hash.
+ *
+ * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only
+ * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal
+ * to the size of &struct virtio_net_hdr_v1_hash.
+ *
+ * The members added to the legacy header by %TUN_VNET_HASH_REPORT flag will
+ * always be little-endian.
+ *
+ * This ioctl results in %EBADFD if the underlying device is deleted. It affects
+ * all queues attached to the same device.
+ *
+ * This ioctl currently has no effect on XDP packets and packets with
+ * queue_mapping set by TC.
+ */
+#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash)
+
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
#define IFF_TAP 0x0002
@@ -115,4 +143,24 @@ struct tun_filter {
__u8 addr[][ETH_ALEN];
};
+/**
+ * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost
+ */
+#define TUN_VNET_HASH_REPORT 0x0001
+
+/**
+ * struct tun_vnet_hash - virtio_net hashing configuration
+ * @flags:
+ * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS
+ * @pad:
+ * Should be filled with zero before passing to %TUNSETVNETHASH
+ * @types:
+ * Bitmask of allowed hash types
+ */
+struct tun_vnet_hash {
+ __u16 flags;
+ __u8 pad[2];
+ __u32 types;
+};
+
#endif /* _UAPI__IF_TUN_H */
--
2.46.2
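Putting the new uAPI together, userspace would query the capability, enlarge the vnet header, and then enable reporting. A minimal sketch, assuming a kernel and headers that carry this series and sufficient privileges; the device name, the absence of error reporting, and requesting every supported hash type are illustrative choices only:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <linux/virtio_net.h>

int main(void)
{
	struct tun_vnet_hash cap, cfg = { .flags = TUN_VNET_HASH_REPORT };
	int hdr_sz = sizeof(struct virtio_net_hdr_v1_hash);
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
	strncpy(ifr.ifr_name, "tap0", IFNAMSIZ - 1);
	if (fd < 0 || ioctl(fd, TUNSETIFF, &ifr) < 0)
		return 1;

	/* What is the kernel able to report? */
	if (ioctl(fd, TUNGETVNETHASHCAP, &cap) < 0)
		return 1;

	/* TUN_VNET_HASH_REPORT only takes effect with the larger header... */
	if (ioctl(fd, TUNSETVNETHDRSZ, &hdr_sz) < 0)
		return 1;

	/* ...then request reporting for everything the kernel offers. */
	cfg.types = cap.types;
	if (ioctl(fd, TUNSETVNETHASH, &cfg) < 0)
		return 1;

	printf("hash reporting enabled, types 0x%x\n", cfg.types);
	close(fd);
	return 0;
}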
Akihiko Odaki wrote:
> Allow the guest to reuse the hash value to make receive steering
> consistent between the host and guest, and to save hash computation.
>
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> ---
[...]
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index 9a34ceed0c2c..5e2fbe63ca47 100644

Merge the earlier tiny patch 2 into this one.
On Tue, Oct 8, 2024 at 2:55 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> Allow the guest to reuse the hash value to make receive steering
> consistent between the host and guest, and to save hash computation.
>
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>

I wonder if this would cause overhead when hash reporting is not enabled?

> ---
>  Documentation/networking/tuntap.rst | 7 +++
>  drivers/net/Kconfig | 1 +
>  drivers/net/tap.c | 45 ++++++++++++++--

Title should be for tap as well, or is this just for tun?

>  drivers/net/tun.c | 46 ++++++++++++----
>  drivers/net/tun_vnet.h | 102 +++++++++++++++++++++++++++++++-----
>  include/linux/if_tap.h | 2 +
>  include/uapi/linux/if_tun.h | 48 +++++++++++++++++
>  7 files changed, 223 insertions(+), 28 deletions(-)
>
[...]
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -395,6 +395,7 @@ config TUN
>  	tristate "Universal TUN/TAP device driver support"
>  	depends on INET
>  	select CRC32
> +	select SKB_EXTENSIONS

Then we need this for macvtap at least as well?

[...]
> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb)
> +{
> +	return (struct virtio_net_hash *)skb->cb;

Any reason that tap uses skb->cb but not skb extensions? (And is it
safe to use that without cloning?)

> +}
[...]
> +	keys_basic = (struct flow_keys_basic) {
> +		.control = keys.control,
> +		.basic = keys.basic
> +	};
> +	tun_vnet_hash_report(&tun->vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
> +			     tun_add_hash);

Is using txq when not l4_hash required by the virtio spec?

[...]
> -	ret = -EBADFD;
> -	if (!tun)
> +	if (!tun) {
> +		ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, cmd, argp);

This seems not elegant (passing three NULL pointers). Any reason we
can't just modify __tun_chr_ioctl() instead of introducing things like
tun_vnet_ioctl()?

[...]
>  static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
> -				  unsigned int cmd, int __user *sp)
> +				  struct tun_vnet_hash *hash, long fallback,
> +				  unsigned int cmd, void __user *argp)
>  {
> +	static const struct tun_vnet_hash cap = {
> +		.flags = TUN_VNET_HASH_REPORT,
> +		.types = VIRTIO_NET_SUPPORTED_HASH_TYPES
> +	};

Let's find a way to reuse the virtio-net uAPI instead of introducing new
stuff that stresses the management layer.

[...]
> +/**
> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing
> + *
> + * The argument is a pointer to &struct tun_vnet_hash.
> + *
> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only
> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal
> + * to the size of &struct virtio_net_hdr_v1_hash.

I think we don't need & here.

[...]
> + * This ioctl currently has no effect on XDP packets and packets with
> + * queue_mapping set by TC.

This needs to be fixed?

[...]
> +/**
> + * struct tun_vnet_hash - virtio_net hashing configuration
> + * @flags:
> + *	Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS

Could we reuse TUNGETIFF by introducing new IFF_XXX flags?

> + * @pad:
> + *	Should be filled with zero before passing to %TUNSETVNETHASH
> + * @types:
> + *	Bitmask of allowed hash types

What are they?

> + */
> +struct tun_vnet_hash {
> +	__u16 flags;
> +	__u8 pad[2];
> +	__u32 types;
> +};
[...]

Thanks
On 2024/10/09 17:05, Jason Wang wrote:
> On Tue, Oct 8, 2024 at 2:55 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>
>> Allow the guest to reuse the hash value to make receive steering
>> consistent between the host and guest, and to save hash computation.
>>
>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>
> I wonder if this would cause overhead when hash reporting is not enabled?

It only adds two branches to the data path. The first one is in
tun_vnet_hash_report(), which decides whether to add the hash value to the
sk_buff. The second one is in tun_vnet_hdr_from_skb(), which decides whether
to report the added hash value.

>> ---
>>  Documentation/networking/tuntap.rst | 7 +++
>>  drivers/net/Kconfig | 1 +
>>  drivers/net/tap.c | 45 ++++++++++++++--
>
> Title should be for tap as well, or is this just for tun?

It is also for tap. I will update the title in v6.

[...]
>> +static struct virtio_net_hash *tap_add_hash(struct sk_buff *skb)
>> +{
>> +	return (struct virtio_net_hash *)skb->cb;
>
> Any reason that tap uses skb->cb but not skb extensions? (And is it
> safe to use that without cloning?)

tun adds virtio_net_hash to a skb in ndo_select_queue(), but it does not
immediately put the skb into its ptr_ring; ndo_start_xmit() does that later.
It is hard to ensure that nobody modifies skb->cb between the two calls.

The situation is different for tap: tap_handle_frame() adds virtio_net_hash
to a skb and immediately puts the skb into its ptr_ring, so nobody should
touch skb->cb in between.

[...]
>> +	keys_basic = (struct flow_keys_basic) {
>> +		.control = keys.control,
>> +		.basic = keys.basic
>> +	};
>> +	tun_vnet_hash_report(&tun->vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
>> +			     tun_add_hash);
>
> Is using txq when not l4_hash required by the virtio spec?

It is a limitation of the implementation. A hardware driver may set a hash
value with skb_set_hash(), which takes enum pkt_hash_types. The enum is
defined as follows:

enum pkt_hash_types {
	PKT_HASH_TYPE_NONE,	/* Undefined type */
	PKT_HASH_TYPE_L2,	/* Input: src_MAC, dest_MAC */
	PKT_HASH_TYPE_L3,	/* Input: src_IP, dst_IP */
	PKT_HASH_TYPE_L4,	/* Input: src_IP, dst_IP, src_port, dst_port */
};

A hash value with PKT_HASH_TYPE_L2 must be ignored because the virtio spec
does not have a corresponding hash type. The virtio spec does have
corresponding hash types for PKT_HASH_TYPE_L3 and PKT_HASH_TYPE_L4, so we
should report them in the virtio_net header. However, the skb only tells
whether the hash is PKT_HASH_TYPE_L4 or not. So tun reports skb->hash with an
L4 hash type if the hash is PKT_HASH_TYPE_L4; otherwise it ignores skb->hash
and uses the hash value it computed itself, because it cannot tell whether
skb->hash is PKT_HASH_TYPE_L2 or PKT_HASH_TYPE_L3.

[...]
>> -	ret = -EBADFD;
>> -	if (!tun)
>> +	if (!tun) {
>> +		ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, cmd, argp);
>
> This seems not elegant (passing three NULL pointers). Any reason we
> can't just modify __tun_chr_ioctl() instead of introducing things like
> tun_vnet_ioctl()?

tun_vnet_ioctl() is introduced with the patch "tun: Unify vnet implementation".
We can abandon unifying the ioctl handling if the interface looks too awkward.

[...]
>> +	static const struct tun_vnet_hash cap = {
>> +		.flags = TUN_VNET_HASH_REPORT,
>> +		.types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>> +	};
>
> Let's find a way to reuse the virtio-net uAPI instead of introducing new
> stuff that stresses the management layer.

I found include/uapi/linux/virtio_net.h inappropriate for this ioctl.

It has the following structure for hash reporting:

struct virtio_net_hash_config {
	__le32 hash_types;
	/* for compatibility with virtio_net_rss_config */
	__le16 reserved[4];
	__u8 hash_key_length;
	__u8 hash_key_data[/* hash_key_length */];
};

We do not care about hash_key_length and hash_key_data.

It has the following structure for RSS, and for hash reporting when RSS is
enabled:

struct virtio_net_rss_config {
	__le32 hash_types;
	__le16 indirection_table_mask;
	__le16 unclassified_queue;
	__le16 indirection_table[1/* + indirection_table_mask */];
	__le16 max_tx_vq;
	__u8 hash_key_length;
	__u8 hash_key_data[/* hash_key_length */];
};

This structure is hard to use as it has data members after the
indirection_table flexible array. max_tx_vq is not of interest here either.

I tested the usability of the ioctl by actually using it in QEMU. The RFC
patch series is available at:
https://patchew.org/QEMU/20240915-hash-v3-0-79cb08d28647@daynix.com/
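To make the layout problem concrete: because everything after
indirection_table[] has no fixed position, a user of struct
virtio_net_rss_config has to compute field offsets by hand. A sketch of that
arithmetic follows; the helper name is made up for illustration:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <linux/virtio_net.h>

/* Hypothetical helper: byte offset of max_tx_vq for a given indirection
 * table size. The offset implied by the struct definition is only right
 * when indirection_table_mask == 0, i.e. the table really has one entry.
 */
static size_t rss_max_tx_vq_offset(uint16_t indirection_table_mask)
{
	return offsetof(struct virtio_net_rss_config, indirection_table) +
	       ((size_t)indirection_table_mask + 1) * sizeof(__le16);
}

int main(void)
{
	printf("mask 0:   max_tx_vq at byte %zu\n", rss_max_tx_vq_offset(0));
	printf("mask 127: max_tx_vq at byte %zu\n", rss_max_tx_vq_offset(127));
	return 0;
}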
[...]
>> +/**
>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing
>> + *
>> + * The argument is a pointer to &struct tun_vnet_hash.
>> + *
>> + * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only
>> + * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal
>> + * to the size of &struct virtio_net_hdr_v1_hash.
>
> I think we don't need & here.

Documentation/doc-guide/kernel-doc.rst says &struct is a token for a struct
cross-reference.

[...]
>> + * This ioctl currently has no effect on XDP packets and packets with
>> + * queue_mapping set by TC.
>
> This needs to be fixed?

We don't use a hash value to select a queue in such a case, so there is no
point in reporting one.

[...]
>> +/**
>> + * struct tun_vnet_hash - virtio_net hashing configuration
>> + * @flags:
>> + *	Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS
>
> Could we reuse TUNGETIFF by introducing new IFF_XXX flags?

That's certainly doable, though I'm a bit worried about exhausting the bits
of IFF_XXX.

>> + * @pad:
>> + *	Should be filled with zero before passing to %TUNSETVNETHASH
>> + * @types:
>> + *	Bitmask of allowed hash types
>
> What are they?

They are defined in the virtio spec, and include/uapi/linux/virtio_net.h
contains them:

#define VIRTIO_NET_RSS_HASH_TYPE_IPv4          (1 << 0)
#define VIRTIO_NET_RSS_HASH_TYPE_TCPv4         (1 << 1)
#define VIRTIO_NET_RSS_HASH_TYPE_UDPv4         (1 << 2)
#define VIRTIO_NET_RSS_HASH_TYPE_IPv6          (1 << 3)
#define VIRTIO_NET_RSS_HASH_TYPE_TCPv6         (1 << 4)
#define VIRTIO_NET_RSS_HASH_TYPE_UDPv6         (1 << 5)
#define VIRTIO_NET_RSS_HASH_TYPE_IP_EX         (1 << 6)
#define VIRTIO_NET_RSS_HASH_TYPE_TCP_EX        (1 << 7)
#define VIRTIO_NET_RSS_HASH_TYPE_UDP_EX        (1 << 8)

[...]
>
> Thanks
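On the read side, once %TUN_VNET_HASH_REPORT is effective, each frame read
from the fd starts with struct virtio_net_hdr_v1_hash, and hash_value and
hash_report are always little-endian; hash_report carries the per-packet
counterpart (VIRTIO_NET_HASH_REPORT_*) of the type bits above. A decoding
sketch, meant as a follow-on fragment to the configuration sketch shown
after the patch:

#include <endian.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/virtio_net.h>

/* Names indexed by the VIRTIO_NET_HASH_REPORT_* values from
 * linux/virtio_net.h; unknown values are printed as numbers only.
 */
static const char *const report_names[] = {
	[VIRTIO_NET_HASH_REPORT_NONE]     = "none",
	[VIRTIO_NET_HASH_REPORT_IPv4]     = "IPv4",
	[VIRTIO_NET_HASH_REPORT_TCPv4]    = "TCPv4",
	[VIRTIO_NET_HASH_REPORT_UDPv4]    = "UDPv4",
	[VIRTIO_NET_HASH_REPORT_IPv6]     = "IPv6",
	[VIRTIO_NET_HASH_REPORT_TCPv6]    = "TCPv6",
	[VIRTIO_NET_HASH_REPORT_UDPv6]    = "UDPv6",
	[VIRTIO_NET_HASH_REPORT_IPv6_EX]  = "IPv6 ex",
	[VIRTIO_NET_HASH_REPORT_TCPv6_EX] = "TCPv6 ex",
	[VIRTIO_NET_HASH_REPORT_UDPv6_EX] = "UDPv6 ex",
};

/* Read one frame from a fd configured as above and print the reported hash. */
void dump_one_frame(int fd)
{
	unsigned char buf[65536 + sizeof(struct virtio_net_hdr_v1_hash)];
	struct virtio_net_hdr_v1_hash hdr;
	ssize_t len = read(fd, buf, sizeof(buf));
	unsigned int report;

	if (len < (ssize_t)sizeof(hdr))
		return;

	memcpy(&hdr, buf, sizeof(hdr));
	report = le16toh(hdr.hash_report);

	printf("payload %zd bytes, hash 0x%08x, report %u (%s)\n",
	       len - (ssize_t)sizeof(hdr), le32toh(hdr.hash_value), report,
	       report < sizeof(report_names) / sizeof(report_names[0]) ?
	       report_names[report] : "?");
}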