[PATCH net-next 1/6] net: devmem: support TX through netkit leased queues

Bobby Eshleman posted 6 patches 3 weeks ago
[PATCH net-next 1/6] net: devmem: support TX through netkit leased queues
Posted by Bobby Eshleman 3 weeks ago
From: Bobby Eshleman <bobbyeshleman@meta.com>

When a netkit virtual device leases queues from a physical NIC, devmem
TX bindings created on the netkit device should use the physical NIC
for DMA operations rather than the virtual device, which has no DMA
capability.

In bind_tx_doit, walk the device's rx queues to find one backed by a
queue lease, and take the lease owner as the underlying physical device.
Verify that this device supports netmem_tx, use it for the DMA device
lookup, and pass it as the real_tx_dev in the binding. When real_tx_dev
is set, it is also used for NUMA-local allocations.

Extend validate_xmit_unreadable_skb() to support the netkit case, where
the skb is validated twice: once on the netkit guest device and again on
the physical NIC after a BPF redirect or IP forwarding. Both invocations
must pass for the skb to be transmitted.

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
---
 net/core/dev.c         | 26 +++++++++++++++++++-------
 net/core/devmem.c      | 16 ++++++++++------
 net/core/devmem.h      |  6 ++++--
 net/core/netdev-genl.c | 38 +++++++++++++++++++++++++++++++++-----
 4 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ca4b26dfb1bd..105bd27be024 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3981,24 +3981,36 @@ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
 static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
 						    struct net_device *dev)
 {
+	struct net_devmem_dmabuf_binding *binding;
 	struct skb_shared_info *shinfo;
+	struct net_device *real_tx_dev;
 	struct net_iov *niov;
 
 	if (likely(skb_frags_readable(skb)))
 		goto out;
 
-	if (!dev->netmem_tx)
-		goto out_free;
-
 	shinfo = skb_shinfo(skb);
+	if (shinfo->nr_frags == 0)
+		goto out;
 
-	if (shinfo->nr_frags > 0) {
-		niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
-		if (net_is_devmem_iov(niov) &&
-		    READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev)
+	niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+	if (!net_is_devmem_iov(niov))
+		goto out;
+
+	binding = net_devmem_iov_binding(niov);
+	real_tx_dev = READ_ONCE(binding->real_tx_dev);
+
+	if (real_tx_dev) {
+		if (!real_tx_dev->netmem_tx)
+			goto out_free;
+		if (READ_ONCE(binding->dev) != dev && real_tx_dev != dev)
 			goto out_free;
+		goto out;
 	}
 
+	if (READ_ONCE(binding->dev) != dev || !dev->netmem_tx)
+		goto out_free;
+
 out:
 	return skb;
 
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 7ede81509968..a4148cba5b5f 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -181,12 +181,13 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 }
 
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
 		       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
 		       struct netlink_ext_ack *extack)
 {
+	struct net_device *node_dev = real_tx_dev ?: dev;
 	struct net_devmem_dmabuf_binding *binding;
 	static u32 id_alloc_next;
 	struct scatterlist *sg;
@@ -205,13 +206,14 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 		return ERR_CAST(dmabuf);
 
 	binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
-			       dev_to_node(&dev->dev));
+			       dev_to_node(&node_dev->dev));
 	if (!binding) {
 		err = -ENOMEM;
 		goto err_put_dmabuf;
 	}
 
 	binding->dev = dev;
+	binding->real_tx_dev = real_tx_dev;
 	xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
 
 	err = percpu_ref_init(&binding->ref,
@@ -254,7 +256,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 	 * allocate MTU sized chunks here. Leave that for future work...
 	 */
 	binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
-					      dev_to_node(&dev->dev));
+					      dev_to_node(&node_dev->dev));
 	if (!binding->chunk_pool) {
 		err = -ENOMEM;
 		goto err_tx_vec;
@@ -268,7 +270,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 		struct net_iov *niov;
 
 		owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
-				     dev_to_node(&dev->dev));
+				     dev_to_node(&node_dev->dev));
 		if (!owner) {
 			err = -ENOMEM;
 			goto err_free_chunks;
@@ -280,7 +282,8 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 		owner->binding = binding;
 
 		err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
-					 dma_addr, len, dev_to_node(&dev->dev),
+					 dma_addr, len,
+					 dev_to_node(&node_dev->dev),
 					 owner);
 		if (err) {
 			kfree(owner);
@@ -397,7 +400,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
 	 */
 	dst_dev = dst_dev_rcu(dst);
 	if (unlikely(!dst_dev) ||
-	    unlikely(dst_dev != READ_ONCE(binding->dev))) {
+	    unlikely(dst_dev != READ_ONCE(binding->dev) &&
+		     dst_dev != READ_ONCE(binding->real_tx_dev))) {
 		err = -ENODEV;
 		goto out_unlock;
 	}
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 1c5c18581fcb..ffcf97a33633 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -20,6 +20,8 @@ struct net_devmem_dmabuf_binding {
 	struct dma_buf_attachment *attachment;
 	struct sg_table *sgt;
 	struct net_device *dev;
+	/* Phys dev behind a virtual dev (e.g. netkit) with a queue lease. */
+	struct net_device *real_tx_dev;
 	struct gen_pool *chunk_pool;
 	/* Protect dev */
 	struct mutex lock;
@@ -84,7 +86,7 @@ struct dmabuf_genpool_chunk_owner {
 
 void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
 		       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
@@ -165,7 +167,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
 }
 
 static inline struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *real_tx_dev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
 		       unsigned int dmabuf_fd,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 7d073894ca74..2b34924dc30f 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -1037,7 +1037,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 		goto err_rxq_bitmap;
 	}
 
-	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+	binding = net_devmem_bind_dmabuf(netdev, NULL, dma_dev, DMA_FROM_DEVICE,
 					 dmabuf_fd, priv, info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
@@ -1082,6 +1082,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 {
 	struct net_devmem_dmabuf_binding *binding;
+	struct net_device *real_tx_dev = NULL;
+	struct netdev_rx_queue *lease_rxq;
 	struct netdev_nl_sock *priv;
 	struct net_device *netdev;
 	struct device *dma_dev;
@@ -1089,6 +1091,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 	struct sk_buff *rsp;
 	int err = 0;
 	void *hdr;
+	int i;
 
 	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
 	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
@@ -1124,16 +1127,41 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock_netdev;
 	}
 
-	if (!netdev->netmem_tx) {
+	for (i = 0; i < netdev->real_num_rx_queues; i++) {
+		lease_rxq = READ_ONCE(__netif_get_rx_queue(netdev, i)->lease);
+
+		if (!lease_rxq)
+			continue;
+
+		real_tx_dev = lease_rxq->dev;
+		break;
+	}
+
+	if (real_tx_dev) {
+		if (!netif_device_present(real_tx_dev)) {
+			err = -ENODEV;
+			goto err_unlock_netdev;
+		}
+
+		if (!real_tx_dev->netmem_tx) {
+			err = -EOPNOTSUPP;
+			NL_SET_ERR_MSG(info->extack,
+				       "Driver for queue lease device does not support netmem TX");
+			goto err_unlock_netdev;
+		}
+	}
+
+	if (!real_tx_dev && !netdev->netmem_tx) {
 		err = -EOPNOTSUPP;
 		NL_SET_ERR_MSG(info->extack,
 			       "Driver does not support netmem TX");
 		goto err_unlock_netdev;
 	}
 
-	dma_dev = netdev_queue_get_dma_dev(netdev, 0);
-	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
-					 dmabuf_fd, priv, info->extack);
+	dma_dev = netdev_queue_get_dma_dev(real_tx_dev ?: netdev, 0);
+	binding = net_devmem_bind_dmabuf(netdev, real_tx_dev, dma_dev,
+					 DMA_TO_DEVICE, dmabuf_fd, priv,
+					 info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
 		goto err_unlock_netdev;

-- 
2.52.0