[PATCH net v4 4/5] net: macb: single dma_alloc_coherent() for DMA descriptors

Posted by Théo Lebrun 1 month, 2 weeks ago
Move from 2*NUM_QUEUES dma_alloc_coherent() calls for DMA descriptor rings
to 2 calls overall.

The issue is that all queues share the same register for configuring the
upper 32 bits of their Tx/Rx descriptor ring addresses. Taking Tx as an
example, notice how TBQPH does *not* depend on the queue index:

	#define GEM_TBQP(hw_q)		(0x0440 + ((hw_q) << 2))
	#define GEM_TBQPH(hw_q)		(0x04C8)

	queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
	if (bp->hw_dma_cap & HW_DMA_CAP_64B)
		queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma));
	#endif

To maximise our chances of getting valid DMA addresses, we do a single
dma_alloc_coherent() across queues. This improves the odds because
alloc_pages() guarantees natural alignment. Other codepaths (IOMMU or
dev/arch dma_map_ops) don't provide strong enough guarantees
(even page alignment isn't enough).
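
For illustration (the addresses below are made up, not taken from any
real platform), a page-aligned buffer can still straddle a 4 GiB
boundary; that is exactly what the upper_32_bits() check added below
rejects:

	/* Sketch only: page aligned, yet straddling a 4 GiB boundary. */
	dma_addr_t dma = 0xfffff000;	/* page aligned, just below 4 GiB */
	size_t size = 4 * SZ_8K;	/* e.g. 4 queues x 8K of descriptors */

	/* upper_32_bits(dma) == 0x0 but upper_32_bits(dma + size - 1) == 0x1,
	 * so a single TBQPH/RBQPH value cannot describe all rings.
	 */
	if (upper_32_bits(dma) != upper_32_bits(dma + size - 1))
		return -ENOMEM;	/* same outcome as the out_err path */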

Two considerations:

 - dma_alloc_coherent() gives us page alignment. Here we remove this
   constraint meaning each queue's ring won't be page-aligned anymore.

 - This can save a small amount of memory. Fewer allocations means
   (1) less overhead (constant cost per alloc) and (2) fewer wasted bytes
   due to alignment constraints.

   Example for (2), worked through in the sketch after this list:
   4 queues, default ring size (512), 64-bit DMA descriptors, 16K pages:
    - Before: 8 allocs of 8K, each rounded to 16K => 64K wasted.
    - After:  2 allocs of 32K => 0K wasted.
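
A quick sketch of that arithmetic (assuming 16-byte descriptors for the
64-bit DMA case, without the optional PTP extension):

	/* Back-of-the-envelope only, not driver code. */
	size_t desc_sz  = 16;			/* 64-bit DMA descriptor */
	size_t per_ring = 512 * desc_sz;	/* 8K per queue, per direction */

	/* Before: 2 * 4 allocations of 8K, each rounded up to a 16K page
	 *         => 8 * (16K - 8K) = 64K wasted.
	 * After:  2 allocations of 4 * 8K = 32K, a multiple of 16K
	 *         => 0K wasted.
	 */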

Fixes: 02c958dd3446 ("net/macb: add TX multiqueue support for gem")
Reviewed-by: Sean Anderson <sean.anderson@linux.dev>
Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 80 ++++++++++++++++----------------
 1 file changed, 41 insertions(+), 39 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index d413e8bd4977187fd73f7cc48268baf933aab051..7f31f264a6d342ea01e2f61944b12c9b9a3fe66e 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -2474,32 +2474,30 @@ static unsigned int macb_rx_ring_size_per_queue(struct macb *bp)
 
 static void macb_free_consistent(struct macb *bp)
 {
+	struct device *dev = &bp->pdev->dev;
 	struct macb_queue *queue;
 	unsigned int q;
+	size_t size;
 
 	if (bp->rx_ring_tieoff) {
-		dma_free_coherent(&bp->pdev->dev, macb_dma_desc_get_size(bp),
+		dma_free_coherent(dev, macb_dma_desc_get_size(bp),
 				  bp->rx_ring_tieoff, bp->rx_ring_tieoff_dma);
 		bp->rx_ring_tieoff = NULL;
 	}
 
 	bp->macbgem_ops.mog_free_rx_buffers(bp);
 
+	size = bp->num_queues * macb_tx_ring_size_per_queue(bp);
+	dma_free_coherent(dev, size, bp->queues[0].tx_ring, bp->queues[0].tx_ring_dma);
+
+	size = bp->num_queues * macb_rx_ring_size_per_queue(bp);
+	dma_free_coherent(dev, size, bp->queues[0].rx_ring, bp->queues[0].rx_ring_dma);
+
 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
 		kfree(queue->tx_skb);
 		queue->tx_skb = NULL;
-		if (queue->tx_ring) {
-			dma_free_coherent(&bp->pdev->dev,
-					  macb_tx_ring_size_per_queue(bp),
-					  queue->tx_ring, queue->tx_ring_dma);
-			queue->tx_ring = NULL;
-		}
-		if (queue->rx_ring) {
-			dma_free_coherent(&bp->pdev->dev,
-					  macb_rx_ring_size_per_queue(bp),
-					  queue->rx_ring, queue->rx_ring_dma);
-			queue->rx_ring = NULL;
-		}
+		queue->tx_ring = NULL;
+		queue->rx_ring = NULL;
 	}
 }
 
@@ -2541,41 +2539,45 @@ static int macb_alloc_rx_buffers(struct macb *bp)
 
 static int macb_alloc_consistent(struct macb *bp)
 {
+	struct device *dev = &bp->pdev->dev;
+	dma_addr_t tx_dma, rx_dma;
 	struct macb_queue *queue;
 	unsigned int q;
-	u32 upper;
-	int size;
+	void *tx, *rx;
+	size_t size;
+
+	/*
+	 * The upper 32 bits of the Tx/Rx DMA descriptor rings must match
+	 * across all queues! We cannot enforce this guarantee; the best we
+	 * can do is a single allocation and hope it lands in alloc_pages(),
+	 * which guarantees natural alignment of physical addresses.
+	 */
+
+	size = bp->num_queues * macb_tx_ring_size_per_queue(bp);
+	tx = dma_alloc_coherent(dev, size, &tx_dma, GFP_KERNEL);
+	if (!tx || upper_32_bits(tx_dma) != upper_32_bits(tx_dma + size - 1))
+		goto out_err;
+	netdev_dbg(bp->dev, "Allocated %zu bytes for %u TX rings at %08lx (mapped %p)\n",
+		   size, bp->num_queues, (unsigned long)tx_dma, tx);
+
+	size = bp->num_queues * macb_rx_ring_size_per_queue(bp);
+	rx = dma_alloc_coherent(dev, size, &rx_dma, GFP_KERNEL);
+	if (!rx || upper_32_bits(rx_dma) != upper_32_bits(rx_dma + size - 1))
+		goto out_err;
+	netdev_dbg(bp->dev, "Allocated %zu bytes for %u RX rings at %08lx (mapped %p)\n",
+		   size, bp->num_queues, (unsigned long)rx_dma, rx);
 
 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
-		size = macb_tx_ring_size_per_queue(bp);
-		queue->tx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
-						    &queue->tx_ring_dma,
-						    GFP_KERNEL);
-		upper = upper_32_bits(queue->tx_ring_dma);
-		if (!queue->tx_ring ||
-		    upper != upper_32_bits(bp->queues[0].tx_ring_dma))
-			goto out_err;
-		netdev_dbg(bp->dev,
-			   "Allocated TX ring for queue %u of %d bytes at %08lx (mapped %p)\n",
-			   q, size, (unsigned long)queue->tx_ring_dma,
-			   queue->tx_ring);
+		queue->tx_ring = tx + macb_tx_ring_size_per_queue(bp) * q;
+		queue->tx_ring_dma = tx_dma + macb_tx_ring_size_per_queue(bp) * q;
+
+		queue->rx_ring = rx + macb_rx_ring_size_per_queue(bp) * q;
+		queue->rx_ring_dma = rx_dma + macb_rx_ring_size_per_queue(bp) * q;
 
 		size = bp->tx_ring_size * sizeof(struct macb_tx_skb);
 		queue->tx_skb = kmalloc(size, GFP_KERNEL);
 		if (!queue->tx_skb)
 			goto out_err;
-
-		size = macb_rx_ring_size_per_queue(bp);
-		queue->rx_ring = dma_alloc_coherent(&bp->pdev->dev, size,
-						    &queue->rx_ring_dma,
-						    GFP_KERNEL);
-		upper = upper_32_bits(queue->rx_ring_dma);
-		if (!queue->rx_ring ||
-		    upper != upper_32_bits(bp->queues[0].rx_ring_dma))
-			goto out_err;
-		netdev_dbg(bp->dev,
-			   "Allocated RX ring of %d bytes at %08lx (mapped %p)\n",
-			   size, (unsigned long)queue->rx_ring_dma, queue->rx_ring);
 	}
 	if (bp->macbgem_ops.mog_alloc_rx_buffers(bp))
 		goto out_err;

-- 
2.50.1

Re: [PATCH net v4 4/5] net: macb: single dma_alloc_coherent() for DMA descriptors
Posted by Nicolas Ferre 1 month, 1 week ago
On 20/08/2025 at 16:55, Théo Lebrun wrote:
> Move from 2*NUM_QUEUES dma_alloc_coherent() for DMA descriptor rings to
> 2 calls overall.
> 
> Issue is with how all queues share the same register for configuring the
> upper 32-bits of Tx/Rx descriptor rings. Taking Tx, notice how TBQPH
> does *not* depend on the queue index:
> 
>          #define GEM_TBQP(hw_q)          (0x0440 + ((hw_q) << 2))
>          #define GEM_TBQPH(hw_q)         (0x04C8)
> 
>          queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
>          #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
>          if (bp->hw_dma_cap & HW_DMA_CAP_64B)
>                  queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma));
>          #endif
> 
> To maximise our chances of getting valid DMA addresses, we do a single
> dma_alloc_coherent() across queues. This improves the odds because
> alloc_pages() guarantees natural alignment. Other codepaths (IOMMU or
> dev/arch dma_map_ops) don't give high enough guarantees
> (even page-aligned isn't enough).
> 
> Two consideration:
> 
>   - dma_alloc_coherent() gives us page alignment. Here we remove this
>     constraint meaning each queue's ring won't be page-aligned anymore.

However... We must guarantee alignment depending on the controller's 
bus width (32 or 64 bits)... but being page aligned and having 
descriptors multiple of 64 bits anyway, we're good for each descriptor 
(might be worth explicitly adding).
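
A small sketch of that reasoning (assuming, as elsewhere in the driver,
that the per-queue ring size is the descriptor count times the
descriptor size, itself a multiple of 8 bytes):

	/* Sketch only: why descriptors stay aligned without page alignment. */
	size_t desc_sz = 16;			/* 64-bit DMA descriptor size */
	size_t ring_sz = 512 * desc_sz;		/* per-queue ring size (8K) */

	/* dma_alloc_coherent() returns a page-aligned base; queue q's ring
	 * starts at base + q * ring_sz, and ring_sz is a multiple of desc_sz,
	 * so every descriptor address stays 64-bit aligned even though the
	 * individual rings are no longer page aligned.
	 */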

> 
>   - This can save some tiny amounts of memory. Fewer allocations means
>     (1) less overhead (constant cost per alloc) and (2) less wasted bytes
>     due to alignment constraints.
> 
>     Example for (2): 4 queues, default ring size (512), 64-bit DMA
>     descriptors, 16K pages:
>      - Before: 8 allocs of 8K, each rounded to 16K => 64K wasted.
>      - After:  2 allocs of 32K => 0K wasted.
> 
> Fixes: 02c958dd3446 ("net/macb: add TX multiqueue support for gem")
> Reviewed-by: Sean Anderson <sean.anderson@linux.dev>
> Signed-off-by: Théo Lebrun <theo.lebrun@bootlin.com>

Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Tested-by: Nicolas Ferre <nicolas.ferre@microchip.com> # on sam9x75



Re: [PATCH net v4 4/5] net: macb: single dma_alloc_coherent() for DMA descriptors
Posted by Théo Lebrun 3 weeks, 2 days ago
Hello Nicolas,

On Tue Aug 26, 2025 at 5:23 PM CEST, Nicolas Ferre wrote:
> On 20/08/2025 at 16:55, Théo Lebrun wrote:
>> Move from 2*NUM_QUEUES dma_alloc_coherent() for DMA descriptor rings to
>> 2 calls overall.
>> 
>> Issue is with how all queues share the same register for configuring the
>> upper 32-bits of Tx/Rx descriptor rings. Taking Tx, notice how TBQPH
>> does *not* depend on the queue index:
>> 
>>          #define GEM_TBQP(hw_q)          (0x0440 + ((hw_q) << 2))
>>          #define GEM_TBQPH(hw_q)         (0x04C8)
>> 
>>          queue_writel(queue, TBQP, lower_32_bits(queue->tx_ring_dma));
>>          #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
>>          if (bp->hw_dma_cap & HW_DMA_CAP_64B)
>>                  queue_writel(queue, TBQPH, upper_32_bits(queue->tx_ring_dma));
>>          #endif
>> 
>> To maximise our chances of getting valid DMA addresses, we do a single
>> dma_alloc_coherent() across queues. This improves the odds because
>> alloc_pages() guarantees natural alignment. Other codepaths (IOMMU or
>> dev/arch dma_map_ops) don't give high enough guarantees
>> (even page-aligned isn't enough).
>> 
>> Two consideration:
>> 
>>   - dma_alloc_coherent() gives us page alignment. Here we remove this
>>     constraint meaning each queue's ring won't be page-aligned anymore.
>
> However... We must guarantee alignement depending on the controller's 
> bus width (32 or 64 bits)... but being page aligned and having 
> descriptors multiple of 64 bits anyway, we're good for each descriptor 
> (might be worth explicitly adding).

Sorry, your comment was unclear to me.

 - I don't see how we can guarantee bus alignment using
   dma_alloc_coherent(), which doesn't take a desired alignment. In
   what case can the DMA API return something with less than the
   tolerated bus alignment?

 - What does "having descriptors multiple of 64 bits anyway" mean?

Thanks for your review and acks! V5 got published here:
https://lore.kernel.org/lkml/20250910-macb-fixes-v5-0-f413a3601ce4@bootlin.com/

Regards,

--
Théo Lebrun, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com