Add dmaengine_prep_batch_sg_dma API for batching multiple independent buffers
in a single DMA transaction. Each scatter-gather entry specifies both
source and destination addresses. This allows multiple non-contiguous
memory regions to be transferred in a single DMA transaction instead of
separate operations, significantly reducing submission overhead and
interrupt overhead.
Extends struct scatterlist with optional dma_dst_address field
and implements support in dw-edma driver.
Signed-off-by: Sumit Kumar <sumit.kumar@oss.qualcomm.com>
---
drivers/dma/dw-edma/Kconfig | 1 +
drivers/dma/dw-edma/dw-edma-core.c | 40 ++++++++++++++++++++++++++++++++++----
drivers/dma/dw-edma/dw-edma-core.h | 3 ++-
include/linux/dmaengine.h | 29 ++++++++++++++++++++++++++-
include/linux/scatterlist.h | 7 +++++++
kernel/dma/Kconfig | 3 +++
6 files changed, 77 insertions(+), 6 deletions(-)
diff --git a/drivers/dma/dw-edma/Kconfig b/drivers/dma/dw-edma/Kconfig
index 2b6f2679508d93b94b7efecd4e36d5902f7b4c99..0472a6554ff38d4cf172a90b6bf0bdaa9e7f4b95 100644
--- a/drivers/dma/dw-edma/Kconfig
+++ b/drivers/dma/dw-edma/Kconfig
@@ -5,6 +5,7 @@ config DW_EDMA
depends on PCI && PCI_MSI
select DMA_ENGINE
select DMA_VIRTUAL_CHANNELS
+ select NEED_SG_DMA_DST_ADDR
help
Support the Synopsys DesignWare eDMA controller, normally
implemented on endpoints SoCs.
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 8e5f7defa6b678eefe0f312ebc59f654677c744f..04314cfd82edbed6ed3665eb4c8e6b428339c207 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -411,6 +411,9 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
return NULL;
if (!xfer->xfer.il->src_inc || !xfer->xfer.il->dst_inc)
return NULL;
+ } else if (xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
+ if (xfer->xfer.sg.len < 1)
+ return NULL;
} else {
return NULL;
}
@@ -438,7 +441,7 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
if (xfer->type == EDMA_XFER_CYCLIC) {
cnt = xfer->xfer.cyclic.cnt;
- } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
+ } else if (xfer->type == EDMA_XFER_SCATTER_GATHER || xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
cnt = xfer->xfer.sg.len;
sg = xfer->xfer.sg.sgl;
} else if (xfer->type == EDMA_XFER_INTERLEAVED) {
@@ -447,7 +450,8 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
}
for (i = 0; i < cnt; i++) {
- if (xfer->type == EDMA_XFER_SCATTER_GATHER && !sg)
+ if ((xfer->type == EDMA_XFER_SCATTER_GATHER ||
+ xfer->type == EDMA_XFER_DUAL_ADDR_SG) && !sg)
break;
if (chunk->bursts_alloc == chan->ll_max) {
@@ -462,7 +466,8 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
if (xfer->type == EDMA_XFER_CYCLIC)
burst->sz = xfer->xfer.cyclic.len;
- else if (xfer->type == EDMA_XFER_SCATTER_GATHER)
+ else if (xfer->type == EDMA_XFER_SCATTER_GATHER ||
+ xfer->type == EDMA_XFER_DUAL_ADDR_SG)
burst->sz = sg_dma_len(sg);
else if (xfer->type == EDMA_XFER_INTERLEAVED)
burst->sz = xfer->xfer.il->sgl[i % fsz].size;
@@ -486,6 +491,9 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
*/
} else if (xfer->type == EDMA_XFER_INTERLEAVED) {
burst->dar = dst_addr;
+ } else if (xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
+ burst->sar = dw_edma_get_pci_address(chan, sg_dma_address(sg));
+ burst->dar = sg_dma_dst_address(sg);
}
} else {
burst->dar = dst_addr;
@@ -503,10 +511,14 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
*/
} else if (xfer->type == EDMA_XFER_INTERLEAVED) {
burst->sar = src_addr;
+ } else if (xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
+ burst->sar = sg_dma_address(sg);
+ burst->dar = dw_edma_get_pci_address(chan, sg_dma_dst_address(sg));
}
}
- if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
+ if (xfer->type == EDMA_XFER_SCATTER_GATHER ||
+ xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
sg = sg_next(sg);
} else if (xfer->type == EDMA_XFER_INTERLEAVED) {
struct dma_interleaved_template *il = xfer->xfer.il;
@@ -603,6 +615,25 @@ static void dw_hdma_set_callback_result(struct virt_dma_desc *vd,
res->residue = residue;
}
+static struct dma_async_tx_descriptor *
+dw_edma_device_prep_batch_sg_dma(struct dma_chan *dchan,
+ struct scatterlist *sg,
+ unsigned int nents,
+ enum dma_transfer_direction direction,
+ unsigned long flags)
+{
+ struct dw_edma_transfer xfer;
+
+ xfer.dchan = dchan;
+ xfer.direction = direction;
+ xfer.xfer.sg.sgl = sg;
+ xfer.xfer.sg.len = nents;
+ xfer.flags = flags;
+ xfer.type = EDMA_XFER_DUAL_ADDR_SG;
+
+ return dw_edma_device_transfer(&xfer);
+}
+
static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
{
struct dw_edma_desc *desc;
@@ -818,6 +849,7 @@ static int dw_edma_channel_setup(struct dw_edma *dw, u32 wr_alloc, u32 rd_alloc)
dma->device_prep_slave_sg = dw_edma_device_prep_slave_sg;
dma->device_prep_dma_cyclic = dw_edma_device_prep_dma_cyclic;
dma->device_prep_interleaved_dma = dw_edma_device_prep_interleaved_dma;
+ dma->device_prep_batch_sg_dma = dw_edma_device_prep_batch_sg_dma;
dma_set_max_seg_size(dma->dev, U32_MAX);
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index 71894b9e0b1539c636171738963e80a0a5ef43a4..1a266dc58315edb3d5fd9eddb19fc350f1ed9a1b 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -36,7 +36,8 @@ enum dw_edma_status {
enum dw_edma_xfer_type {
EDMA_XFER_SCATTER_GATHER = 0,
EDMA_XFER_CYCLIC,
- EDMA_XFER_INTERLEAVED
+ EDMA_XFER_INTERLEAVED,
+ EDMA_XFER_DUAL_ADDR_SG,
};
struct dw_edma_chan;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 99efe2b9b4ea9844ca6161208362ef18ef111d96..fdba75b5c40f805904a6697fce3062303fea762a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -939,7 +939,11 @@ struct dma_device {
size_t period_len, enum dma_transfer_direction direction,
unsigned long flags);
struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
- struct dma_chan *chan, struct dma_interleaved_template *xt,
+ struct dma_chan *chan, struct dma_interleaved_template *xt,
+ unsigned long flags);
+ struct dma_async_tx_descriptor *(*device_prep_batch_sg_dma)
+ (struct dma_chan *chan, struct scatterlist *sg, unsigned int nents,
+ enum dma_transfer_direction direction,
unsigned long flags);
void (*device_caps)(struct dma_chan *chan, struct dma_slave_caps *caps);
@@ -1060,6 +1064,29 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
return chan->device->device_prep_interleaved_dma(chan, xt, flags);
}
+/**
+ * dmaengine_prep_batch_sg_dma() - Prepare single DMA transfer for multiple independent buffers.
+ * @chan: DMA channel
+ * @sg: Scatter-gather list with both source (dma_address) and destination (dma_dst_address)
+ * @nents: Number of entries in the list
+ * @direction: Transfer direction (DMA_MEM_TO_MEM, DMA_DEV_TO_MEM, DMA_MEM_TO_DEV)
+ * @flags: DMA engine flags
+ *
+ * Each SG entry contains both source (sg_dma_address) and destination (sg_dma_dst_address).
+ * This allows multiple independent transfers in a single DMA transaction.
+ * Requires CONFIG_NEED_SG_DMA_DST_ADDR to be enabled.
+ */
+static inline struct dma_async_tx_descriptor *dmaengine_prep_batch_sg_dma
+ (struct dma_chan *chan, struct scatterlist *sg, unsigned int nents,
+ enum dma_transfer_direction direction, unsigned long flags)
+{
+ if (!chan || !chan->device || !chan->device->device_prep_batch_sg_dma ||
+ !sg || !nents)
+ return NULL;
+
+ return chan->device->device_prep_batch_sg_dma(chan, sg, nents, direction, flags);
+}
+
/**
* dmaengine_prep_dma_memset() - Prepare a DMA memset descriptor.
* @chan: The channel to be used for this descriptor
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 29f6ceb98d74b118d08b6a3d4eb7f62dcde0495d..20b65ffcd5e2a65ec5026a29344caf6baa09700b 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -19,6 +19,9 @@ struct scatterlist {
#ifdef CONFIG_NEED_SG_DMA_FLAGS
unsigned int dma_flags;
#endif
+#ifdef CONFIG_NEED_SG_DMA_DST_ADDR
+ dma_addr_t dma_dst_address;
+#endif
};
/*
@@ -36,6 +39,10 @@ struct scatterlist {
#define sg_dma_len(sg) ((sg)->length)
#endif
+#ifdef CONFIG_NEED_SG_DMA_DST_ADDR
+#define sg_dma_dst_address(sg) ((sg)->dma_dst_address)
+#endif
+
struct sg_table {
struct scatterlist *sgl; /* the list */
unsigned int nents; /* number of mapped entries */
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 31cfdb6b4bc3e33c239111955d97b3ec160baafa..3539b5b1efe27be7ccbfebb358dbb9cad2868f11 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -32,6 +32,9 @@ config NEED_SG_DMA_LENGTH
config NEED_DMA_MAP_STATE
bool
+config NEED_SG_DMA_DST_ADDR
+ bool
+
config ARCH_DMA_ADDR_T_64BIT
def_bool 64BIT || PHYS_ADDR_T_64BIT
--
2.34.1
On 13-03-26, 12:19, Sumit Kumar wrote: > Add dmaengine_prep_batch_sg API for batching multiple independent buffers > in a single DMA transaction. Each scatter-gather entry specifies both > source and destination addresses. This allows multiple non-contiguous Looks like you want to bring back dmaengine_prep_dma_sg(); see commit c678fa66341c > memory regions to be transferred in a single DMA transaction instead of > separate operations, significantly reducing submission overhead and > interrupt overhead. > > Extends struct scatterlist with optional dma_dst_address field > and implements support in dw-edma driver. If this is memcpy, why are you talking about dma_dst_address, which is a slave field? -- ~Vinod
On 3/17/2026 4:24 PM, Vinod Koul wrote: > On 13-03-26, 12:19, Sumit Kumar wrote: >> Add dmaengine_prep_batch_sg API for batching multiple independent buffers >> in a single DMA transaction. Each scatter-gather entry specifies both >> source and destination addresses. This allows multiple non-contiguous > Looks like you want to bring back dmaengine_prep_dma_sg() see commit c678fa66341c I was not aware of this commit; I will bring back this change (only the core dma part). My changes will be integrated along with the above commit. >> memory regions to be transferred in a single DMA transaction instead of >> separate operations, significantly reducing submission overhead and >> interrupt overhead. >> >> Extends struct scatterlist with optional dma_dst_address field >> and implements support in dw-edma driver. > If this is memcpy why are you talking about dma_dst_address which is a > slave field? As we are bringing back commit c678fa66341c, we can ignore the current patch. - Sumit
On 2026-03-13 6:49 am, Sumit Kumar wrote:
> Add dmaengine_prep_batch_sg API for batching multiple independent buffers
> in a single DMA transaction. Each scatter-gather entry specifies both
> source and destination addresses. This allows multiple non-contiguous
> memory regions to be transferred in a single DMA transaction instead of
> separate operations, significantly reducing submission overhead and
> interrupt overhead.
>
> Extends struct scatterlist with optional dma_dst_address field
> and implements support in dw-edma driver.
[...]
> diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
> index 29f6ceb98d74b118d08b6a3d4eb7f62dcde0495d..20b65ffcd5e2a65ec5026a29344caf6baa09700b 100644
> --- a/include/linux/scatterlist.h
> +++ b/include/linux/scatterlist.h
> @@ -19,6 +19,9 @@ struct scatterlist {
> #ifdef CONFIG_NEED_SG_DMA_FLAGS
> unsigned int dma_flags;
> #endif
> +#ifdef CONFIG_NEED_SG_DMA_DST_ADDR
> + dma_addr_t dma_dst_address;
> +#endif
Eww, no, what does this even mean? Is the regular dma_addr somehow
implicitly a "source" now? How could the single piece of memory
represented by page_link/offset/length have two different DMA addresses?
How are both the DMA mapping code and users supposed to know which one
is relevant in any particular situation?
If you want to bring back DMA_MEMCPY_SG yet again, and you have an
actual user this time, then do that (although by now it most likely
wants to be a dma_vec version). Don't do whatever this is...
If you want to batch multiple
dmaengine_slave_config()/dma_prep_slave_single() operations into some
many-to-many variant of dmaengine_prep_peripheral_dma_vec(), then surely
that requires actual batching of the config part as well - e.g. passing
an explicit vector of distinct dma_slave_configs corresponding to each
individual dma_vec - in order to be able to work correctly in general?
Thanks,
Robin.
> };
>
> /*
> @@ -36,6 +39,10 @@ struct scatterlist {
> #define sg_dma_len(sg) ((sg)->length)
> #endif
>
> +#ifdef CONFIG_NEED_SG_DMA_DST_ADDR
> +#define sg_dma_dst_address(sg) ((sg)->dma_dst_address)
> +#endif
> +
> struct sg_table {
> struct scatterlist *sgl; /* the list */
> unsigned int nents; /* number of mapped entries */
> diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
> index 31cfdb6b4bc3e33c239111955d97b3ec160baafa..3539b5b1efe27be7ccbfebb358dbb9cad2868f11 100644
> --- a/kernel/dma/Kconfig
> +++ b/kernel/dma/Kconfig
> @@ -32,6 +32,9 @@ config NEED_SG_DMA_LENGTH
> config NEED_DMA_MAP_STATE
> bool
>
> +config NEED_SG_DMA_DST_ADDR
> + bool
> +
> config ARCH_DMA_ADDR_T_64BIT
> def_bool 64BIT || PHYS_ADDR_T_64BIT
>
>
On Fri, Mar 13, 2026 at 03:16:50PM +0000, Robin Murphy wrote: > On 2026-03-13 6:49 am, Sumit Kumar wrote: > > If you want to batch multiple > dmaengine_slave_config()/dma_prep_slave_single() operations into some > many-to-many variant of dmaengine_prep_peripheral_dma_vec(), then surely > that requires actual batching of the config part as well - e.g. passing an > explicit vector of distinct dma_slave_configs corresponding to each > individual dma_vec - in order to be able to work correctly in general? This makes me think of Frank's series, which tries to create an API that combines dmaengine_slave_config() with dmaengine_prep_slave_single(): https://lore.kernel.org/dmaengine/20251218-dma_prep_config-v2-0-c07079836128@nxp.com/ Not exactly the same, but might still be of interest. Kind regards, Niklas
© 2016 - 2026 Red Hat, Inc.