[PATCH net-next v2 11/18] net: macb: single dma_alloc_coherent() for DMA descriptors

Posted by Théo Lebrun 3 months, 1 week ago
Move from two (Tx/Rx) dma_alloc_coherent() calls for DMA descriptor rings
*per queue* to two dma_alloc_coherent() calls overall.

The issue is that all queues share the same register for configuring the
upper 32 bits of the Tx/Rx descriptor rings. For example, with Tx, notice
how TBQPH does *not* depend on the queue index:

	#define GEM_TBQP(hw_q)		(0x0440 + ((hw_q) << 2))
	#define GEM_TBQPH(hw_q)		(0x04C8)

	queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
	if (bp->hw_dma_cap & HW_DMA_CAP_64B)
		queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma));
	#endif

To maximize our chances of getting valid DMA addresses, we do a single
dma_alloc_coherent() across queues. This improves the odds because
alloc_pages() guarantees natural alignment. It cannot ensure valid DMA
addresses in every case because of IOMMUs or codepaths that don't go
through alloc_pages().

We error out if the rings don't all share the same upper 32 bits, which is
better than the current (theoretical, not reproduced) silent corruption
caused by the hardware accessing invalid addresses.

Two considerations:
 - dma_alloc_coherent() gives us page alignment. Here we remove this
   constraint, meaning each queue's ring won't be page-aligned anymore.
 - This can save some memory. Fewer allocations mean less overhead
   (constant cost per alloc) and fewer wasted bytes due to alignment
   constraints. A rough worked example follows below.
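
A rough illustration of the memory angle (descriptor size, ring size and
prefetch length below are hypothetical, not taken from the driver): assume
16-byte descriptors, 512-entry rings, a 64-byte prefetch tail and 4 KiB
pages.

	per queue:         512 * 16 B + 64 B = 8256 B
	4 separate allocs: 4 * (8256 B rounded up to 12 KiB) = 48 KiB
	1 combined alloc:  4 * 8256 B = 33024 B, rounded up to 36 KiB

The prefetch tail is what pushes an otherwise page-multiple ring just past
a page boundary, so per-queue allocations waste the most with the default
power-of-two ring sizes.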

Fixes: 02c958dd3446 ("net/macb: add TX multiqueue support for gem")
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 83 ++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index d3b3635998cad095246edf8a75faebbcf7115355..48b75d95861317b9925b366446c7572c7e186628 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2445,33 +2445,32 @@ static void macb_free_rx_buffers(struct macb *bp)
 
 static void macb_free_consistent(struct macb *bp)
 {
-	struct macb_queue *queue;
+	size_t size, tx_size_per_queue, rx_size_per_queue;
+	struct macb_queue *queue, *queue0 = bp->queues;
+	struct device *dev = &bp->pdev->dev;
 	unsigned int q;
-	int size;
 
 	if (bp->rx_ring_tieoff) {
-		dma_free_coherent(&bp->pdev->dev, macb_dma_desc_get_size(bp),
+		dma_free_coherent(dev, macb_dma_desc_get_size(bp),
 				  bp->rx_ring_tieoff, bp->rx_ring_tieoff_dma);
 		bp->rx_ring_tieoff = NULL;
 	}
 
 	bp->macbgem_ops.mog_free_rx_buffers(bp);
 
+	tx_size_per_queue = TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch;
+	size = bp->num_queues * tx_size_per_queue;
+	dma_free_coherent(dev, size, queue0->tx_ring, queue0->tx_ring_dma);
+
+	rx_size_per_queue = RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch;
+	size = bp->num_queues * rx_size_per_queue;
+	dma_free_coherent(dev, size, queue0->rx_ring, queue0->rx_ring_dma);
+
 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
 		kfree(queue->tx_skb);
 		queue->tx_skb = NULL;
-		if (queue->tx_ring) {
-			size = TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch;
-			dma_free_coherent(&bp->pdev->dev, size,
-					  queue->tx_ring, queue->tx_ring_dma);
-			queue->tx_ring = NULL;
-		}
-		if (queue->rx_ring) {
-			size = RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch;
-			dma_free_coherent(&bp->pdev->dev, size,
-					  queue->rx_ring, queue->rx_ring_dma);
-			queue->rx_ring = NULL;
-		}
+		queue->tx_ring = NULL; /* Single buffer owned by queue0 */
+		queue->rx_ring = NULL; /* Single buffer owned by queue0 */
 	}
 }
 
@@ -2513,37 +2512,47 @@ static int macb_alloc_rx_buffers(struct macb *bp)
 
 static int macb_alloc_consistent(struct macb *bp)
 {
+	size_t size, tx_size_per_queue, rx_size_per_queue;
+	dma_addr_t tx_dma, rx_dma;
+	struct device *dev = &bp->pdev->dev;
 	struct macb_queue *queue;
 	unsigned int q;
-	int size;
+	void *tx, *rx;
+
+	/*
+	 * The upper 32 bits of the Tx/Rx DMA descriptor rings must match across
+	 * queues! We cannot enforce this guarantee; the best we can do is a
+	 * single allocation and hope it lands in alloc_pages(), which guarantees
+	 * natural alignment of physical addresses.
+	 */
+
+	tx_size_per_queue = TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch;
+	size = bp->num_queues * tx_size_per_queue;
+	tx = dma_alloc_coherent(dev, size, &tx_dma, GFP_KERNEL);
+	if (!tx || upper_32_bits(tx_dma) != upper_32_bits(tx_dma + size - 1))
+		goto out_err;
+	netdev_dbg(bp->dev, "Allocated %zu bytes for %u TX rings at %08lx (mapped %p)\n",
+		   size, bp->num_queues, (unsigned long)tx_dma, tx);
+
+	rx_size_per_queue = RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch;
+	size = bp->num_queues * rx_size_per_queue;
+	rx = dma_alloc_coherent(dev, size, &rx_dma, GFP_KERNEL);
+	if (!rx || upper_32_bits(rx_dma) != upper_32_bits(rx_dma + size - 1))
+		goto out_err;
+	netdev_dbg(bp->dev, "Allocated %zu bytes for %u RX rings at %08lx (mapped %p)\n",
+		   size, bp->num_queues, (unsigned long)rx_dma, rx);
 
 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
-		size = TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch;
-		queue->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
-						    &queue->tx_ring_dma,
-						    GFP_KERNEL);
-		if (!queue->tx_ring ||
-		    upper_32_bits(queue->tx_ring_dma) != upper_32_bits(bp->queues->tx_ring_dma))
-			goto out_err;
-		netdev_dbg(bp->dev,
-			   "Allocated TX ring for queue %u of %d bytes at %08lx (mapped %p)\n",
-			   q, size, (unsigned long)queue->tx_ring_dma,
-			   queue->tx_ring);
+		queue->tx_ring = tx + tx_size_per_queue * q;
+		queue->tx_ring_dma = tx_dma + tx_size_per_queue * q;
+
+		queue->rx_ring = rx + rx_size_per_queue * q;
+		queue->rx_ring_dma = rx_dma + rx_size_per_queue * q;
 
 		size = bp->tx_ring_size * sizeof(struct macb_tx_skb);
 		queue->tx_skb = kmalloc(size, GFP_KERNEL);
 		if (!queue->tx_skb)
 			goto out_err;
-
-		size = RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch;
-		queue->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
-						 &queue->rx_ring_dma, GFP_KERNEL);
-		if (!queue->rx_ring ||
-		    upper_32_bits(queue->rx_ring_dma) != upper_32_bits(bp->queues->rx_ring_dma))
-			goto out_err;
-		netdev_dbg(bp->dev,
-			   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
-			   size, (unsigned long)queue->rx_ring_dma, queue->rx_ring);
 	}
 	if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
 		goto out_err;

-- 
2.50.0

Re: [PATCH net-next v2 11/18] net: macb: single dma_alloc_coherent() for DMA descriptors
Posted by Sean Anderson 3 months, 1 week ago
On 6/27/25 05:08, Théo Lebrun wrote:
> Move from two (Tx/Rx) dma_alloc_coherent() for DMA descriptor rings *per
> queue* to two dma_alloc_coherent() overall.
> 
> Issue is with how all queues share the same register for configuring the
> upper 32-bits of Tx/Rx descriptor rings. For example, with Tx, notice
> how TBQPH does *not* depend on the queue index:
> 
> 	#define GEM_TBQP(hw_q)		(0x0440 + ((hw_q) << 2))
> 	#define GEM_TBQPH(hw_q)		(0x04C8)
> 
> 	queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
> 	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
> 	if (bp->hw_dma_cap & HW_DMA_CAP_64B)
> 		queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma));
> 	#endif
> 
> To maxime our chances of getting valid DMA addresses, we do a single

maximize

> dma_alloc_coherent() across queues.

Is there really any chance involved (other than avoiding ENOMEM)?

> This improves the odds because
> alloc_pages() guarantees natural alignment. It cannot ensure valid DMA
> addresses because of IOMMU or codepaths that don't go through
> alloc_pages().
> 
> We error out if all rings don't have the same upper 32 bits, which is
> better than the current (theoretical, not reproduced) silent corruption
> caused by hardware that accesses invalid addresses.

I think this is addressed by the previous patch.

> Two considerations:
>  - dma_alloc_coherent() gives us page alignment. Here we remove this
>    containst meaning each queue's ring won't be page-aligned anymore.

constraint

>  - This can save some memory. Less allocations means less overhead

fewer

>    (constant cost per alloc) and less wasted bytes due to alignment
>    constraints.

I think it's probably a bit of a wash with reasonably-sized rings.
Although the prefetch probably interacts poorly with the default "round"
power-of-two ring sizes.

> Fixes: 02c958dd3446 ("net/macb: add TX multiqueue support for gem")
> Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
> ---
>  drivers/net/ethernet/cadence/macb_main.c | 83 ++++++++++++++++++--------------
>  1 file changed, 46 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
> index d3b3635998cad095246edf8a75faebbcf7115355..48b75d95861317b9925b366446c7572c7e186628 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -2445,33 +2445,32 @@ static void macb_free_rx_buffers(struct macb *bp)
>  
>  static void macb_free_consistent(struct macb *bp)
>  {
> -	struct macb_queue *queue;
> +	size_t size, tx_size_per_queue, rx_size_per_queue;
> +	struct macb_queue *queue, *queue0 = bp->queues;
> +	struct device *dev = &bp->pdev->dev;
>  	unsigned int q;
> -	int size;
>  
>  	if (bp->rx_ring_tieoff) {
> -		dma_free_coherent(&bp->pdev->dev, macb_dma_desc_get_size(bp),
> +		dma_free_coherent(dev, macb_dma_desc_get_size(bp),
>  				  bp->rx_ring_tieoff, bp->rx_ring_tieoff_dma);
>  		bp->rx_ring_tieoff = NULL;
>  	}
>  
>  	bp->macbgem_ops.mog_free_rx_buffers(bp);
>  
> +	tx_size_per_queue = TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch;
> +	size = bp->num_queues * tx_size_per_queue;

Can you refactor the size calculation into a helper function?
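Something like this, for instance (a sketch, with illustrative names):

	static size_t macb_tx_rings_size(struct macb *bp)
	{
		return bp->num_queues * (TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch);
	}

	static size_t macb_rx_rings_size(struct macb *bp)
	{
		return bp->num_queues * (RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch);
	}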

> +	dma_free_coherent(dev, size, queue0->tx_ring, queue0->tx_ring_dma);
> +
> +	rx_size_per_queue = RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch;
> +	size = bp->num_queues * rx_size_per_queue;
> +	dma_free_coherent(dev, size, queue0->rx_ring, queue0->rx_ring_dma);
> +
>  	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
>  		kfree(queue->tx_skb);
>  		queue->tx_skb = NULL;
> -		if (queue->tx_ring) {
> -			size = TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch;
> -			dma_free_coherent(&bp->pdev->dev, size,
> -					  queue->tx_ring, queue->tx_ring_dma);
> -			queue->tx_ring = NULL;
> -		}
> -		if (queue->rx_ring) {
> -			size = RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch;
> -			dma_free_coherent(&bp->pdev->dev, size,
> -					  queue->rx_ring, queue->rx_ring_dma);
> -			queue->rx_ring = NULL;
> -		}
> +		queue->tx_ring = NULL; /* Single buffer owned by queue0 */
> +		queue->rx_ring = NULL; /* Single buffer owned by queue0 */

OK, but queue0 doesn't own the ring either at this point (it's free'd).

>  	}
>  }
>  
> @@ -2513,37 +2512,47 @@ static int macb_alloc_rx_buffers(struct macb *bp)
>  
>  static int macb_alloc_consistent(struct macb *bp)
>  {
> +	size_t size, tx_size_per_queue, rx_size_per_queue;
> +	dma_addr_t tx_dma, rx_dma;
> +	struct device *dev = &bp->pdev->dev;
>  	struct macb_queue *queue;
>  	unsigned int q;
> -	int size;
> +	void *tx, *rx;
> +
> +	/*
> +	 * Upper 32-bits of Tx/Rx DMA descriptor for each queues much match!
> +	 * We cannot enforce this guarantee, the best we can do is do a single
> +	 * allocation and hope it will land into alloc_pages() that guarantees
> +	 * natural alignment of physical addresses.
> +	 */
> +
> +	tx_size_per_queue = TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch;
> +	size = bp->num_queues * tx_size_per_queue;
> +	tx = dma_alloc_coherent(dev, size, &tx_dma, GFP_KERNEL);
> +	if (!tx || upper_32_bits(tx_dma) != upper_32_bits(tx_dma + size - 1))
> +		goto out_err;
> +	netdev_dbg(bp->dev, "Allocated %zu bytes for %u TX rings at %08lx (mapped %p)\n",
> +		   size, bp->num_queues, (unsigned long)tx_dma, tx);
> +
> +	rx_size_per_queue = RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch;
> +	size = bp->num_queues * rx_size_per_queue;
> +	rx = dma_alloc_coherent(dev, size, &rx_dma, GFP_KERNEL);
> +	if (!rx || upper_32_bits(rx_dma) != upper_32_bits(rx_dma + size - 1))
> +		goto out_err;
> +	netdev_dbg(bp->dev, "Allocated %zu bytes for %u RX rings at %08lx (mapped %p)\n",
> +		   size, bp->num_queues, (unsigned long)rx_dma, rx);
>  
>  	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
> -		size = TX_RING_BYTES(bp) + bp->tx_bd_rd_prefetch;
> -		queue->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
> -						    &queue->tx_ring_dma,
> -						    GFP_KERNEL);
> -		if (!queue->tx_ring ||
> -		    upper_32_bits(queue->tx_ring_dma) != upper_32_bits(bp->queues->tx_ring_dma))
> -			goto out_err;
> -		netdev_dbg(bp->dev,
> -			   "Allocated TX ring for queue %u of %d bytes at %08lx (mapped %p)\n",
> -			   q, size, (unsigned long)queue->tx_ring_dma,
> -			   queue->tx_ring);
> +		queue->tx_ring = tx + tx_size_per_queue * q;
> +		queue->tx_ring_dma = tx_dma + tx_size_per_queue * q;
> +
> +		queue->rx_ring = rx + rx_size_per_queue * q;
> +		queue->rx_ring_dma = rx_dma + rx_size_per_queue * q;
>  
>  		size = bp->tx_ring_size * sizeof(struct macb_tx_skb);
>  		queue->tx_skb = kmalloc(size, GFP_KERNEL);
>  		if (!queue->tx_skb)
>  			goto out_err;
> -
> -		size = RX_RING_BYTES(bp) + bp->rx_bd_rd_prefetch;
> -		queue->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
> -						 &queue->rx_ring_dma, GFP_KERNEL);
> -		if (!queue->rx_ring ||
> -		    upper_32_bits(queue->rx_ring_dma) != upper_32_bits(bp->queues->rx_ring_dma))
> -			goto out_err;
> -		netdev_dbg(bp->dev,
> -			   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
> -			   size, (unsigned long)queue->rx_ring_dma, queue->rx_ring);
>  	}
>  	if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
>  		goto out_err;
> 

Reviewed-by: Sean Anderson <sean.anderson@linux.dev>
Re: [PATCH net-next v2 11/18] net: macb: single dma_alloc_coherent() for DMA descriptors
Posted by Théo Lebrun 2 months ago
Hello Sean,

Thanks for the review! I'll reply only to questions (or comments about
which I have questions).

On Tue Jul 1, 2025 at 6:32 PM CEST, Sean Anderson wrote:
> On 6/27/25 05:08, Théo Lebrun wrote:
>> Move from two (Tx/Rx) dma_alloc_coherent() for DMA descriptor rings *per
>> queue* to two dma_alloc_coherent() overall.
>> 
>> Issue is with how all queues share the same register for configuring the
>> upper 32-bits of Tx/Rx descriptor rings. For example, with Tx, notice
>> how TBQPH does *not* depend on the queue index:
>> 
>> 	#define GEM_TBQP(hw_q)		(0x0440 + ((hw_q) << 2))
>> 	#define GEM_TBQPH(hw_q)		(0x04C8)
>> 
>> 	queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
>> 	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
>> 	if (bp->hw_dma_cap & HW_DMA_CAP_64B)
>> 		queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma));
>> 	#endif
>> 
>> To maxime our chances of getting valid DMA addresses, we do a single
>
> maximize
>
>> dma_alloc_coherent() across queues.
>
> Is there really any chance involved (other than avoiding ENOMEM)?

If we land in the page allocator codepath of dma_alloc_coherent(), then
we get natural alignment guarantees; see the alloc_pages() comment [0].

[0]: https://elixir.bootlin.com/linux/v6.16/source/mm/mempolicy.c#L2499-L2502

However, we cannot be certain we land in that path. If we have an
IOMMU, then I don't think the API provides strong enough guarantees.

Same for custom `struct dma_map_ops`, be it per-device or arch-specific.
I am not aware of any alignment guarantees there (is anything documented
on that?).

Even if those give us page-aligned allocations, that isn't enough. For
example, let's say we want 256 KiB and get 0xFFFF0000 from an allocator.
That is page-aligned, but:

   upper_32_bits(START)      != upper_32_bits(START + SIZE - 1)
   upper_32_bits(0xFFFF0000) != upper_32_bits(0xFFFF0000 + 0x40000 - 1)
   0x0                       != 0x1
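
Conversely, when the allocation does land in alloc_pages(), natural
alignment rules this mismatch out for any realistic ring size (far below
4 GiB). Informally, with `align` the block's natural alignment:

   base % align == 0, align a power of two, align >= size
   => every 4 GiB boundary is a multiple of align, so none falls strictly
      inside [base, base + align)
   => base and base + size - 1 sit in the same 4 GiB window
   => upper_32_bits(base) == upper_32_bits(base + size - 1)

The 0xFFFF0000 example above fails precisely because it is only 4KiB
aligned, not 256KiB aligned.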

Thanks!

--
Théo Lebrun, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com