Add dmaengine_prep_batch_sg_dma API for batching multiple independent buffers
in a single DMA transaction. Each scatter-gather entry specifies both
source and destination addresses. This allows multiple non-contiguous
memory regions to be transferred in a single DMA transaction instead of
separate operations, significantly reducing submission overhead and
interrupt overhead.
Extends struct scatterlist with optional dma_dst_address field
and implements support in dw-edma driver.
Signed-off-by: Sumit Kumar <sumit.kumar@oss.qualcomm.com>
---
drivers/dma/dw-edma/Kconfig | 1 +
drivers/dma/dw-edma/dw-edma-core.c | 40 ++++++++++++++++++++++++++++++++++----
drivers/dma/dw-edma/dw-edma-core.h | 3 ++-
include/linux/dmaengine.h | 29 ++++++++++++++++++++++++++-
include/linux/scatterlist.h | 7 +++++++
kernel/dma/Kconfig | 3 +++
6 files changed, 77 insertions(+), 6 deletions(-)
diff --git a/drivers/dma/dw-edma/Kconfig b/drivers/dma/dw-edma/Kconfig
index 2b6f2679508d93b94b7efecd4e36d5902f7b4c99..0472a6554ff38d4cf172a90b6bf0bdaa9e7f4b95 100644
--- a/drivers/dma/dw-edma/Kconfig
+++ b/drivers/dma/dw-edma/Kconfig
@@ -5,6 +5,7 @@ config DW_EDMA
depends on PCI && PCI_MSI
select DMA_ENGINE
select DMA_VIRTUAL_CHANNELS
+ select NEED_SG_DMA_DST_ADDR
help
Support the Synopsys DesignWare eDMA controller, normally
implemented on endpoints SoCs.
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 8e5f7defa6b678eefe0f312ebc59f654677c744f..04314cfd82edbed6ed3665eb4c8e6b428339c207 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -411,6 +411,9 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
return NULL;
if (!xfer->xfer.il->src_inc || !xfer->xfer.il->dst_inc)
return NULL;
+ } else if (xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
+ if (xfer->xfer.sg.len < 1)
+ return NULL;
} else {
return NULL;
}
@@ -438,7 +441,7 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
if (xfer->type == EDMA_XFER_CYCLIC) {
cnt = xfer->xfer.cyclic.cnt;
- } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
+ } else if (xfer->type == EDMA_XFER_SCATTER_GATHER || xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
cnt = xfer->xfer.sg.len;
sg = xfer->xfer.sg.sgl;
} else if (xfer->type == EDMA_XFER_INTERLEAVED) {
@@ -447,7 +450,8 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
}
for (i = 0; i < cnt; i++) {
- if (xfer->type == EDMA_XFER_SCATTER_GATHER && !sg)
+ if ((xfer->type == EDMA_XFER_SCATTER_GATHER ||
+ xfer->type == EDMA_XFER_DUAL_ADDR_SG) && !sg)
break;
if (chunk->bursts_alloc == chan->ll_max) {
@@ -462,7 +466,8 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
if (xfer->type == EDMA_XFER_CYCLIC)
burst->sz = xfer->xfer.cyclic.len;
- else if (xfer->type == EDMA_XFER_SCATTER_GATHER)
+ else if (xfer->type == EDMA_XFER_SCATTER_GATHER ||
+ xfer->type == EDMA_XFER_DUAL_ADDR_SG)
burst->sz = sg_dma_len(sg);
else if (xfer->type == EDMA_XFER_INTERLEAVED)
burst->sz = xfer->xfer.il->sgl[i % fsz].size;
@@ -486,6 +491,9 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
*/
} else if (xfer->type == EDMA_XFER_INTERLEAVED) {
burst->dar = dst_addr;
+ } else if (xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
+ burst->sar = dw_edma_get_pci_address(chan, sg_dma_address(sg));
+ burst->dar = sg_dma_dst_address(sg);
}
} else {
burst->dar = dst_addr;
@@ -503,10 +511,14 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer)
*/
} else if (xfer->type == EDMA_XFER_INTERLEAVED) {
burst->sar = src_addr;
+ } else if (xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
+ burst->sar = sg_dma_address(sg);
+ burst->dar = dw_edma_get_pci_address(chan, sg_dma_dst_address(sg));
}
}
- if (xfer->type == EDMA_XFER_SCATTER_GATHER) {
+ if (xfer->type == EDMA_XFER_SCATTER_GATHER ||
+ xfer->type == EDMA_XFER_DUAL_ADDR_SG) {
sg = sg_next(sg);
} else if (xfer->type == EDMA_XFER_INTERLEAVED) {
struct dma_interleaved_template *il = xfer->xfer.il;
@@ -603,6 +615,25 @@ static void dw_hdma_set_callback_result(struct virt_dma_desc *vd,
res->residue = residue;
}
+static struct dma_async_tx_descriptor *
+dw_edma_device_prep_batch_sg_dma(struct dma_chan *dchan,
+ struct scatterlist *sg,
+ unsigned int nents,
+ enum dma_transfer_direction direction,
+ unsigned long flags)
+{
+ struct dw_edma_transfer xfer;
+
+ xfer.dchan = dchan;
+ xfer.direction = direction;
+ xfer.xfer.sg.sgl = sg;
+ xfer.xfer.sg.len = nents;
+ xfer.flags = flags;
+ xfer.type = EDMA_XFER_DUAL_ADDR_SG;
+
+ return dw_edma_device_transfer(&xfer);
+}
+
static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
{
struct dw_edma_desc *desc;
@@ -818,6 +849,7 @@ static int dw_edma_channel_setup(struct dw_edma *dw, u32 wr_alloc, u32 rd_alloc)
dma->device_prep_slave_sg = dw_edma_device_prep_slave_sg;
dma->device_prep_dma_cyclic = dw_edma_device_prep_dma_cyclic;
dma->device_prep_interleaved_dma = dw_edma_device_prep_interleaved_dma;
+ dma->device_prep_batch_sg_dma = dw_edma_device_prep_batch_sg_dma;
dma_set_max_seg_size(dma->dev, U32_MAX);
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index 71894b9e0b1539c636171738963e80a0a5ef43a4..1a266dc58315edb3d5fd9eddb19fc350f1ed9a1b 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -36,7 +36,8 @@ enum dw_edma_status {
enum dw_edma_xfer_type {
EDMA_XFER_SCATTER_GATHER = 0,
EDMA_XFER_CYCLIC,
- EDMA_XFER_INTERLEAVED
+ EDMA_XFER_INTERLEAVED,
+ EDMA_XFER_DUAL_ADDR_SG,
};
struct dw_edma_chan;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 99efe2b9b4ea9844ca6161208362ef18ef111d96..fdba75b5c40f805904a6697fce3062303fea762a 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -939,7 +939,11 @@ struct dma_device {
size_t period_len, enum dma_transfer_direction direction,
unsigned long flags);
struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
- struct dma_chan *chan, struct dma_interleaved_template *xt,
+ struct dma_chan *chan, struct dma_interleaved_template *xt,
+ unsigned long flags);
+ struct dma_async_tx_descriptor *(*device_prep_batch_sg_dma)
+ (struct dma_chan *chan, struct scatterlist *sg, unsigned int nents,
+ enum dma_transfer_direction direction,
unsigned long flags);
void (*device_caps)(struct dma_chan *chan, struct dma_slave_caps *caps);
@@ -1060,6 +1064,29 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
return chan->device->device_prep_interleaved_dma(chan, xt, flags);
}
+/**
+ * dmaengine_prep_batch_sg_dma() - Prepare single DMA transfer for multiple independent buffers.
+ * @chan: DMA channel
+ * @sg: Scatter-gather list with both source (dma_address) and destination (dma_dst_address)
+ * @nents: Number of entries in the list
+ * @direction: Transfer direction (DMA_MEM_TO_MEM, DMA_DEV_TO_MEM, DMA_MEM_TO_DEV)
+ * @flags: DMA engine flags
+ *
+ * Each SG entry contains both source (sg_dma_address) and destination (sg_dma_dst_address).
+ * This allows multiple independent transfers in a single DMA transaction.
+ * Requires CONFIG_NEED_SG_DMA_DST_ADDR to be enabled.
+ */
+static inline struct dma_async_tx_descriptor *dmaengine_prep_batch_sg_dma
+ (struct dma_chan *chan, struct scatterlist *sg, unsigned int nents,
+ enum dma_transfer_direction direction, unsigned long flags)
+{
+ if (!chan || !chan->device || !chan->device->device_prep_batch_sg_dma ||
+ !sg || !nents)
+ return NULL;
+
+ return chan->device->device_prep_batch_sg_dma(chan, sg, nents, direction, flags);
+}
+
/**
* dmaengine_prep_dma_memset() - Prepare a DMA memset descriptor.
* @chan: The channel to be used for this descriptor
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 29f6ceb98d74b118d08b6a3d4eb7f62dcde0495d..20b65ffcd5e2a65ec5026a29344caf6baa09700b 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -19,6 +19,9 @@ struct scatterlist {
#ifdef CONFIG_NEED_SG_DMA_FLAGS
unsigned int dma_flags;
#endif
+#ifdef CONFIG_NEED_SG_DMA_DST_ADDR
+ dma_addr_t dma_dst_address;
+#endif
};
/*
@@ -36,6 +39,10 @@ struct scatterlist {
#define sg_dma_len(sg) ((sg)->length)
#endif
+#ifdef CONFIG_NEED_SG_DMA_DST_ADDR
+#define sg_dma_dst_address(sg) ((sg)->dma_dst_address)
+#endif
+
struct sg_table {
struct scatterlist *sgl; /* the list */
unsigned int nents; /* number of mapped entries */
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 31cfdb6b4bc3e33c239111955d97b3ec160baafa..3539b5b1efe27be7ccbfebb358dbb9cad2868f11 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -32,6 +32,9 @@ config NEED_SG_DMA_LENGTH
config NEED_DMA_MAP_STATE
bool
+config NEED_SG_DMA_DST_ADDR
+ bool
+
config ARCH_DMA_ADDR_T_64BIT
def_bool 64BIT || PHYS_ADDR_T_64BIT
--
2.34.1
On 13-03-26, 12:19, Sumit Kumar wrote: > Add dmaengine_prep_batch_sg API for batching multiple independent buffers > in a single DMA transaction. Each scatter-gather entry specifies both > source and destination addresses. This allows multiple non-contiguous Looks like you want to bring back dmaengine_prep_dma_sg(); see commit c678fa66341c > memory regions to be transferred in a single DMA transaction instead of > separate operations, significantly reducing submission overhead and > interrupt overhead. > > Extends struct scatterlist with optional dma_dst_address field > and implements support in dw-edma driver. If this is memcpy, why are you talking about dma_dst_address, which is a slave field? -- ~Vinod
On 3/17/2026 4:24 PM, Vinod Koul wrote: > On 13-03-26, 12:19, Sumit Kumar wrote: >> Add dmaengine_prep_batch_sg API for batching multiple independent buffers >> in a single DMA transaction. Each scatter-gather entry specifies both >> source and destination addresses. This allows multiple non-contiguous > Looks like you want to bring back dmaengine_prep_dma_sg() see commit c678fa66341c I was not aware of this commit; I will bring back this change (only the core dma part). My changes will be integrated along with the above commit. >> memory regions to be transferred in a single DMA transaction instead of >> separate operations, significantly reducing submission overhead and >> interrupt overhead. >> >> Extends struct scatterlist with optional dma_dst_address field >> and implements support in dw-edma driver. > If this is memcpy why are you talking about dma_dst_address which is a > slave field? As we are bringing back commit c678fa66341c, we can ignore the current patch. - Sumit
On 2026-03-13 6:49 am, Sumit Kumar wrote:
> Add dmaengine_prep_batch_sg API for batching multiple independent buffers
> in a single DMA transaction. Each scatter-gather entry specifies both
> source and destination addresses. This allows multiple non-contiguous
> memory regions to be transferred in a single DMA transaction instead of
> separate operations, significantly reducing submission overhead and
> interrupt overhead.
>
> Extends struct scatterlist with optional dma_dst_address field
> and implements support in dw-edma driver.
[...]
> diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
> index 29f6ceb98d74b118d08b6a3d4eb7f62dcde0495d..20b65ffcd5e2a65ec5026a29344caf6baa09700b 100644
> --- a/include/linux/scatterlist.h
> +++ b/include/linux/scatterlist.h
> @@ -19,6 +19,9 @@ struct scatterlist {
> #ifdef CONFIG_NEED_SG_DMA_FLAGS
> unsigned int dma_flags;
> #endif
> +#ifdef CONFIG_NEED_SG_DMA_DST_ADDR
> + dma_addr_t dma_dst_address;
> +#endif
Eww, no, what does this even mean? Is the regular dma_addr somehow
implicitly a "source" now? How could the single piece of memory
represented by page_link/offset/length have two different DMA addresses?
How are both the DMA mapping code and users supposed to know which one
is relevant in any particular situation?
If you want to bring back DMA_MEMCPY_SG yet again, and you have an
actual user this time, then do that (although by now it most likely
wants to be a dma_vec version). Don't do whatever this is...
If you want to batch multiple
dmaengine_slave_config()/dma_prep_slave_single() operations into some
many-to-many variant of dmaengine_prep_peripheral_dma_vec(), then surely
that requires actual batching of the config part as well - e.g. passing
an explicit vector of distinct dma_slave_configs corresponding to each
individual dma_vec - in order to be able to work correctly in general?
Thanks,
Robin.
> };
>
> /*
> @@ -36,6 +39,10 @@ struct scatterlist {
> #define sg_dma_len(sg) ((sg)->length)
> #endif
>
> +#ifdef CONFIG_NEED_SG_DMA_DST_ADDR
> +#define sg_dma_dst_address(sg) ((sg)->dma_dst_address)
> +#endif
> +
> struct sg_table {
> struct scatterlist *sgl; /* the list */
> unsigned int nents; /* number of mapped entries */
> diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
> index 31cfdb6b4bc3e33c239111955d97b3ec160baafa..3539b5b1efe27be7ccbfebb358dbb9cad2868f11 100644
> --- a/kernel/dma/Kconfig
> +++ b/kernel/dma/Kconfig
> @@ -32,6 +32,9 @@ config NEED_SG_DMA_LENGTH
> config NEED_DMA_MAP_STATE
> bool
>
> +config NEED_SG_DMA_DST_ADDR
> + bool
> +
> config ARCH_DMA_ADDR_T_64BIT
> def_bool 64BIT || PHYS_ADDR_T_64BIT
>
>
On Fri, Mar 13, 2026 at 03:16:50PM +0000, Robin Murphy wrote: > On 2026-03-13 6:49 am, Sumit Kumar wrote: > > If you want to batch multiple > dmaengine_slave_config()/dma_prep_slave_single() operations into some > many-to-many variant of dmaengine_prep_peripheral_dma_vec(), then surely > that requires actual batching of the config part as well - e.g. passing an > explicit vector of distinct dma_slave_configs corresponding to each > individual dma_vec - in order to be able to work correctly in general? This makes me think of Frank's series, which tries to create an API that combines dmaengine_slave_config() with dmaengine_prep_slave_single(): https://lore.kernel.org/dmaengine/20251218-dma_prep_config-v2-0-c07079836128@nxp.com/ Not exactly the same, but might still be of interest. Kind regards, Niklas
© 2016 - 2026 Red Hat, Inc.