[PATCH 10/14] dmaengine: dma350: Alloc command[] from dma pool

Jisheng Zhang posted 14 patches 1 month, 1 week ago
[PATCH 10/14] dmaengine: dma350: Alloc command[] from dma pool
Posted by Jisheng Zhang 1 month, 1 week ago
Currently, command[] is allocated with kzalloc(), but the DMA-350 may be
used on DMA-non-coherent platforms. To prepare for supporting peripheral
and scatter-gather chaining on both DMA-coherent and DMA-non-coherent
platforms, allocate the commands from a DMA pool instead.

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
---
 drivers/dma/arm-dma350.c | 143 +++++++++++++++++++++++++++++++--------
 1 file changed, 113 insertions(+), 30 deletions(-)

diff --git a/drivers/dma/arm-dma350.c b/drivers/dma/arm-dma350.c
index 72067518799e..3d26a1f020df 100644
--- a/drivers/dma/arm-dma350.c
+++ b/drivers/dma/arm-dma350.c
@@ -4,6 +4,7 @@
 
 #include <linux/bitfield.h>
 #include <linux/dmaengine.h>
+#include <linux/dmapool.h>
 #include <linux/dma-mapping.h>
 #include <linux/io.h>
 #include <linux/of.h>
@@ -143,6 +144,7 @@
 #define LINK_LINKADDR		BIT(30)
 #define LINK_LINKADDRHI		BIT(31)
 
+#define D350_MAX_CMDS		16
 
 enum ch_ctrl_donetype {
 	CH_CTRL_DONETYPE_NONE = 0,
@@ -169,18 +171,25 @@ enum ch_cfg_memattr {
 	MEMATTR_WB = 0xff
 };
 
-struct d350_desc {
-	struct virt_dma_desc vd;
-	u32 command[16];
+struct d350_sg {
+	u32 *command;
+	dma_addr_t phys;
 	u16 xsize;
 	u16 xsizehi;
 	u8 tsz;
 };
 
+struct d350_desc {
+	struct virt_dma_desc vd;
+	u32 sglen;
+	struct d350_sg sg[] __counted_by(sglen);
+};
+
 struct d350_chan {
 	struct virt_dma_chan vc;
 	struct d350_desc *desc;
 	void __iomem *base;
+	struct dma_pool *cmd_pool;
 	int irq;
 	enum dma_status status;
 	dma_cookie_t cookie;
@@ -210,7 +219,14 @@ static inline struct d350_desc *to_d350_desc(struct virt_dma_desc *vd)
 
 static void d350_desc_free(struct virt_dma_desc *vd)
 {
-	kfree(to_d350_desc(vd));
+	struct d350_chan *dch = to_d350_chan(vd->tx.chan);
+	struct d350_desc *desc = to_d350_desc(vd);
+	int i;
+
+	for (i = 0; i < desc->sglen; i++)
+		dma_pool_free(dch->cmd_pool, desc->sg[i].command, desc->sg[i].phys);
+
+	kfree(desc);
 }
 
 static struct dma_async_tx_descriptor *d350_prep_memcpy(struct dma_chan *chan,
@@ -218,22 +234,32 @@ static struct dma_async_tx_descriptor *d350_prep_memcpy(struct dma_chan *chan,
 {
 	struct d350_chan *dch = to_d350_chan(chan);
 	struct d350_desc *desc;
+	struct d350_sg *sg;
+	dma_addr_t phys;
 	u32 *cmd;
 
-	desc = kzalloc(sizeof(*desc), GFP_NOWAIT);
+	desc = kzalloc(struct_size(desc, sg, 1), GFP_NOWAIT);
 	if (!desc)
 		return NULL;
 
-	desc->tsz = __ffs(len | dest | src | (1 << dch->tsz));
-	desc->xsize = lower_16_bits(len >> desc->tsz);
-	desc->xsizehi = upper_16_bits(len >> desc->tsz);
+	sg = &desc->sg[0];
+	sg->command = dma_pool_zalloc(dch->cmd_pool, GFP_NOWAIT, &phys);
+	if (unlikely(!sg->command)) {
+		kfree(desc);
+		return NULL;
+	}
+	sg->phys = phys;
+
+	sg->tsz = __ffs(len | dest | src | (1 << dch->tsz));
+	sg->xsize = lower_16_bits(len >> sg->tsz);
+	sg->xsizehi = upper_16_bits(len >> sg->tsz);
 
-	cmd = desc->command;
+	cmd = sg->command;
 	cmd[0] = LINK_CTRL | LINK_SRCADDR | LINK_SRCADDRHI | LINK_DESADDR |
 		 LINK_DESADDRHI | LINK_XSIZE | LINK_XSIZEHI | LINK_SRCTRANSCFG |
 		 LINK_DESTRANSCFG | LINK_XADDRINC | LINK_LINKADDR;
 
-	cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, desc->tsz) |
+	cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, sg->tsz) |
 		 FIELD_PREP(CH_CTRL_XTYPE, CH_CTRL_XTYPE_CONTINUE) |
 		 FIELD_PREP(CH_CTRL_DONETYPE, CH_CTRL_DONETYPE_CMD);
 
@@ -241,13 +267,15 @@ static struct dma_async_tx_descriptor *d350_prep_memcpy(struct dma_chan *chan,
 	cmd[3] = upper_32_bits(src);
 	cmd[4] = lower_32_bits(dest);
 	cmd[5] = upper_32_bits(dest);
-	cmd[6] = FIELD_PREP(CH_XY_SRC, desc->xsize) | FIELD_PREP(CH_XY_DES, desc->xsize);
-	cmd[7] = FIELD_PREP(CH_XY_SRC, desc->xsizehi) | FIELD_PREP(CH_XY_DES, desc->xsizehi);
+	cmd[6] = FIELD_PREP(CH_XY_SRC, sg->xsize) | FIELD_PREP(CH_XY_DES, sg->xsize);
+	cmd[7] = FIELD_PREP(CH_XY_SRC, sg->xsizehi) | FIELD_PREP(CH_XY_DES, sg->xsizehi);
 	cmd[8] = dch->coherent ? TRANSCFG_WB : TRANSCFG_NC;
 	cmd[9] = dch->coherent ? TRANSCFG_WB : TRANSCFG_NC;
 	cmd[10] = FIELD_PREP(CH_XY_SRC, 1) | FIELD_PREP(CH_XY_DES, 1);
 	cmd[11] = 0;
 
+	mb();
+
 	return vchan_tx_prep(&dch->vc, &desc->vd, flags);
 }
 
@@ -256,34 +284,46 @@ static struct dma_async_tx_descriptor *d350_prep_memset(struct dma_chan *chan,
 {
 	struct d350_chan *dch = to_d350_chan(chan);
 	struct d350_desc *desc;
+	struct d350_sg *sg;
+	dma_addr_t phys;
 	u32 *cmd;
 
-	desc = kzalloc(sizeof(*desc), GFP_NOWAIT);
+	desc = kzalloc(struct_size(desc, sg, 1), GFP_NOWAIT);
 	if (!desc)
 		return NULL;
 
-	desc->tsz = __ffs(len | dest | (1 << dch->tsz));
-	desc->xsize = lower_16_bits(len >> desc->tsz);
-	desc->xsizehi = upper_16_bits(len >> desc->tsz);
+	sg = &desc->sg[0];
+	sg->command = dma_pool_zalloc(dch->cmd_pool, GFP_NOWAIT, &phys);
+	if (unlikely(!sg->command)) {
+		kfree(desc);
+		return NULL;
+	}
+	sg->phys = phys;
+
+	sg->tsz = __ffs(len | dest | (1 << dch->tsz));
+	sg->xsize = lower_16_bits(len >> sg->tsz);
+	sg->xsizehi = upper_16_bits(len >> sg->tsz);
 
-	cmd = desc->command;
+	cmd = sg->command;
 	cmd[0] = LINK_CTRL | LINK_DESADDR | LINK_DESADDRHI |
 		 LINK_XSIZE | LINK_XSIZEHI | LINK_DESTRANSCFG |
 		 LINK_XADDRINC | LINK_FILLVAL | LINK_LINKADDR;
 
-	cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, desc->tsz) |
+	cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, sg->tsz) |
 		 FIELD_PREP(CH_CTRL_XTYPE, CH_CTRL_XTYPE_FILL) |
 		 FIELD_PREP(CH_CTRL_DONETYPE, CH_CTRL_DONETYPE_CMD);
 
 	cmd[2] = lower_32_bits(dest);
 	cmd[3] = upper_32_bits(dest);
-	cmd[4] = FIELD_PREP(CH_XY_DES, desc->xsize);
-	cmd[5] = FIELD_PREP(CH_XY_DES, desc->xsizehi);
+	cmd[4] = FIELD_PREP(CH_XY_DES, sg->xsize);
+	cmd[5] = FIELD_PREP(CH_XY_DES, sg->xsizehi);
 	cmd[6] = dch->coherent ? TRANSCFG_WB : TRANSCFG_NC;
 	cmd[7] = FIELD_PREP(CH_XY_DES, 1);
 	cmd[8] = (u8)value * 0x01010101;
 	cmd[9] = 0;
 
+	mb();
+
 	return vchan_tx_prep(&dch->vc, &desc->vd, flags);
 }
 
@@ -319,8 +359,9 @@ static int d350_resume(struct dma_chan *chan)
 
 static u32 d350_get_residue(struct d350_chan *dch)
 {
-	u32 res, xsize, xsizehi, hi_new;
-	int retries = 3; /* 1st time unlucky, 2nd improbable, 3rd just broken */
+	u32 res, xsize, xsizehi, linkaddr, linkaddrhi, hi_new;
+	int i, sgcur, retries = 3; /* 1st time unlucky, 2nd improbable, 3rd just broken */
+	struct d350_desc *desc = dch->desc;
 
 	hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
 	do {
@@ -329,10 +370,26 @@ static u32 d350_get_residue(struct d350_chan *dch)
 		hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
 	} while (xsizehi != hi_new && --retries);
 
+	hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
+	do {
+		linkaddrhi = hi_new;
+		linkaddr = readl_relaxed(dch->base + CH_LINKADDR);
+		hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
+	} while (linkaddrhi != hi_new && --retries);
+
+	for (i = 0; i < desc->sglen; i++) {
+		if (desc->sg[i].phys == (((u64)linkaddrhi << 32) | (linkaddr & ~CH_LINKADDR_EN)))
+			sgcur = i;
+	}
+
 	res = FIELD_GET(CH_XY_DES, xsize);
 	res |= FIELD_GET(CH_XY_DES, xsizehi) << 16;
+	res <<= desc->sg[sgcur].tsz;
+
+	for (i = sgcur + 1; i < desc->sglen; i++)
+		res += (((u32)desc->sg[i].xsizehi << 16 | desc->sg[i].xsize) << desc->sg[i].tsz);
 
-	return res << dch->desc->tsz;
+	return res;
 }
 
 static int d350_terminate_all(struct dma_chan *chan)
@@ -365,7 +422,13 @@ static void d350_synchronize(struct dma_chan *chan)
 
 static u32 d350_desc_bytes(struct d350_desc *desc)
 {
-	return ((u32)desc->xsizehi << 16 | desc->xsize) << desc->tsz;
+	int i;
+	u32 bytes = 0;
+
+	for (i = 0; i < desc->sglen; i++)
+		bytes += (((u32)desc->sg[i].xsizehi << 16 | desc->sg[i].xsize) << desc->sg[i].tsz);
+
+	return bytes;
 }
 
 static enum dma_status d350_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
@@ -415,8 +478,8 @@ static void d350_start_next(struct d350_chan *dch)
 	dch->cookie = dch->desc->vd.tx.cookie;
 	dch->residue = d350_desc_bytes(dch->desc);
 
-	hdr = dch->desc->command[0];
-	reg = &dch->desc->command[1];
+	hdr = dch->desc->sg[0].command[0];
+	reg = &dch->desc->sg[0].command[1];
 
 	if (hdr & LINK_INTREN)
 		writel_relaxed(*reg++, dch->base + CH_INTREN);
@@ -512,11 +575,29 @@ static irqreturn_t d350_irq(int irq, void *data)
 static int d350_alloc_chan_resources(struct dma_chan *chan)
 {
 	struct d350_chan *dch = to_d350_chan(chan);
-	int ret = request_irq(dch->irq, d350_irq, IRQF_SHARED,
-			      dev_name(&dch->vc.chan.dev->device), dch);
-	if (!ret)
-		writel_relaxed(CH_INTREN_DONE | CH_INTREN_ERR, dch->base + CH_INTREN);
+	int ret;
+
+	dch->cmd_pool = dma_pool_create(dma_chan_name(chan),
+					  chan->device->dev,
+					  D350_MAX_CMDS * sizeof(u32),
+					  sizeof(u32), 0);
+	if (!dch->cmd_pool) {
+		dev_err(chan->device->dev, "No memory for cmd pool\n");
+		return -ENOMEM;
+	}
+
+	ret = request_irq(dch->irq, d350_irq, 0,
+			  dev_name(&dch->vc.chan.dev->device), dch);
+	if (ret < 0)
+		goto err_irq;
+
+	writel_relaxed(CH_INTREN_DONE | CH_INTREN_ERR, dch->base + CH_INTREN);
+
+	return 0;
 
+err_irq:
+	dma_pool_destroy(dch->cmd_pool);
+	dch->cmd_pool = NULL;
 	return ret;
 }
 
@@ -527,6 +608,8 @@ static void d350_free_chan_resources(struct dma_chan *chan)
 	writel_relaxed(0, dch->base + CH_INTREN);
 	free_irq(dch->irq, dch);
 	vchan_free_chan_resources(&dch->vc);
+	dma_pool_destroy(dch->cmd_pool);
+	dch->cmd_pool = NULL;
 }
 
 static int d350_probe(struct platform_device *pdev)
-- 
2.50.0
Re: [PATCH 10/14] dmaengine: dma350: Alloc command[] from dma pool
Posted by Dan Carpenter 1 month ago
Hi Jisheng,

kernel test robot noticed the following build warnings:

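[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]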

url:    https://github.com/intel-lab-lkp/linux/commits/Jisheng-Zhang/dmaengine-dma350-Fix-CH_CTRL_USESRCTRIGIN-definition/20250824-000425
base:   https://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git next
patch link:    https://lore.kernel.org/r/20250823154009.25992-11-jszhang%40kernel.org
patch subject: [PATCH 10/14] dmaengine: dma350: Alloc command[] from dma pool
config: arm-randconfig-r073-20250829 (https://download.01.org/0day-ci/archive/20250829/202508291556.kjNumYgR-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 13.4.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202508291556.kjNumYgR-lkp@intel.com/

smatch warnings:
drivers/dma/arm-dma350.c:387 d350_get_residue() error: uninitialized symbol 'sgcur'.

vim +/sgcur +387 drivers/dma/arm-dma350.c

5d099706449d54 Robin Murphy  2025-03-12  360  static u32 d350_get_residue(struct d350_chan *dch)
5d099706449d54 Robin Murphy  2025-03-12  361  {
eae79fde2ff50c Jisheng Zhang 2025-08-23  362  	u32 res, xsize, xsizehi, linkaddr, linkaddrhi, hi_new;
eae79fde2ff50c Jisheng Zhang 2025-08-23  363  	int i, sgcur, retries = 3; /* 1st time unlucky, 2nd improbable, 3rd just broken */
eae79fde2ff50c Jisheng Zhang 2025-08-23  364  	struct d350_desc *desc = dch->desc;
5d099706449d54 Robin Murphy  2025-03-12  365  
5d099706449d54 Robin Murphy  2025-03-12  366  	hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
5d099706449d54 Robin Murphy  2025-03-12  367  	do {
5d099706449d54 Robin Murphy  2025-03-12  368  		xsizehi = hi_new;
5d099706449d54 Robin Murphy  2025-03-12  369  		xsize = readl_relaxed(dch->base + CH_XSIZE);
5d099706449d54 Robin Murphy  2025-03-12  370  		hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
5d099706449d54 Robin Murphy  2025-03-12  371  	} while (xsizehi != hi_new && --retries);
5d099706449d54 Robin Murphy  2025-03-12  372  
eae79fde2ff50c Jisheng Zhang 2025-08-23  373  	hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
eae79fde2ff50c Jisheng Zhang 2025-08-23  374  	do {
eae79fde2ff50c Jisheng Zhang 2025-08-23  375  		linkaddrhi = hi_new;
eae79fde2ff50c Jisheng Zhang 2025-08-23  376  		linkaddr = readl_relaxed(dch->base + CH_LINKADDR);
eae79fde2ff50c Jisheng Zhang 2025-08-23  377  		hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
eae79fde2ff50c Jisheng Zhang 2025-08-23  378  	} while (linkaddrhi != hi_new && --retries);
eae79fde2ff50c Jisheng Zhang 2025-08-23  379  
eae79fde2ff50c Jisheng Zhang 2025-08-23  380  	for (i = 0; i < desc->sglen; i++) {
eae79fde2ff50c Jisheng Zhang 2025-08-23  381  		if (desc->sg[i].phys == (((u64)linkaddrhi << 32) | (linkaddr & ~CH_LINKADDR_EN)))
eae79fde2ff50c Jisheng Zhang 2025-08-23  382  			sgcur = i;

I'm surprised there isn't a break statement after this assignment.
What if we exit the loop with i == desc->sglen?

eae79fde2ff50c Jisheng Zhang 2025-08-23  383  	}
eae79fde2ff50c Jisheng Zhang 2025-08-23  384  
5d099706449d54 Robin Murphy  2025-03-12  385  	res = FIELD_GET(CH_XY_DES, xsize);
5d099706449d54 Robin Murphy  2025-03-12  386  	res |= FIELD_GET(CH_XY_DES, xsizehi) << 16;
eae79fde2ff50c Jisheng Zhang 2025-08-23 @387  	res <<= desc->sg[sgcur].tsz;
                                                                 ^^^^^
Uninitialized.

eae79fde2ff50c Jisheng Zhang 2025-08-23  388  
eae79fde2ff50c Jisheng Zhang 2025-08-23  389  	for (i = sgcur + 1; i < desc->sglen; i++)
eae79fde2ff50c Jisheng Zhang 2025-08-23  390  		res += (((u32)desc->sg[i].xsizehi << 16 | desc->sg[i].xsize) << desc->sg[i].tsz);
5d099706449d54 Robin Murphy  2025-03-12  391  
eae79fde2ff50c Jisheng Zhang 2025-08-23  392  	return res;
5d099706449d54 Robin Murphy  2025-03-12  393  }

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH 10/14] dmaengine: dma350: Alloc command[] from dma pool
Posted by Robin Murphy 1 month ago
On 2025-08-23 4:40 pm, Jisheng Zhang wrote:
> Currently, command[] is allocated with kzalloc(), but the DMA-350 may be
> used on DMA-non-coherent platforms. To prepare for supporting peripheral
> and scatter-gather chaining on both DMA-coherent and DMA-non-coherent
> platforms, allocate the commands from a DMA pool instead.

FWIW my plan was to use dma_map_single() for command linking, since the 
descriptors themselves are short-lived one-way data transfers which 
really don't need any of the (potentially costly) properties of a 
dma-coherent allocation.

> Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
> ---
>   drivers/dma/arm-dma350.c | 143 +++++++++++++++++++++++++++++++--------
>   1 file changed, 113 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/dma/arm-dma350.c b/drivers/dma/arm-dma350.c
> index 72067518799e..3d26a1f020df 100644
> --- a/drivers/dma/arm-dma350.c
> +++ b/drivers/dma/arm-dma350.c
> @@ -4,6 +4,7 @@
>   
>   #include <linux/bitfield.h>
>   #include <linux/dmaengine.h>
> +#include <linux/dmapool.h>
>   #include <linux/dma-mapping.h>
>   #include <linux/io.h>
>   #include <linux/of.h>
> @@ -143,6 +144,7 @@
>   #define LINK_LINKADDR		BIT(30)
>   #define LINK_LINKADDRHI		BIT(31)
>   
> +#define D350_MAX_CMDS		16

What's that based on? We should be able to link arbitrarily-long chains 
of commands, no?

>   enum ch_ctrl_donetype {
>   	CH_CTRL_DONETYPE_NONE = 0,
> @@ -169,18 +171,25 @@ enum ch_cfg_memattr {
>   	MEMATTR_WB = 0xff
>   };
>   
> -struct d350_desc {
> -	struct virt_dma_desc vd;
> -	u32 command[16];
> +struct d350_sg {
> +	u32 *command;
> +	dma_addr_t phys;
>   	u16 xsize;
>   	u16 xsizehi;
>   	u8 tsz;
>   };
>   
> +struct d350_desc {
> +	struct virt_dma_desc vd;
> +	u32 sglen;
> +	struct d350_sg sg[] __counted_by(sglen);
> +};

Perhaps it's mostly the naming, but this seems rather hard to make sense 
of. To clarify, the current driver design was in anticipation of a split 
more like so:

struct d350_cmd {
	u32 command[16];
	u16 xsize;
	u16 xsizehi;
	u8 tsz;
	struct d350_cmd *next;
};

struct d350_desc {
	struct virt_dma_desc vd;
	// any totals etc. to help with residue calculation
	struct d350_cmd cmd;
};

Or perhaps what I'd more likely have ended up with (which is maybe sort 
of what you've tried to do?):

struct d350_cmd {
	u32 command[16];
	u16 xsize;
	u16 xsizehi;
	u8 tsz;
};

struct d350_desc {
	struct virt_dma_desc vd;
	// anything else as above
	int num_cmds;
	struct d350_cmd cmd[1]; //extensible
};

Either way, this conveniently takes advantage of the fact that the DMA
address will inherently be stored in the LINKADDR fields of the first
command (which doesn't need DMA mapping itself), so at worst we still
only need one or two allocations plus a single dma_map per prep
operation (since we can keep all the linked commands as a single block).
I don't see a DMA pool approach being beneficial here, since it seems
like it's always going to be considerably less efficient.

(Not to mention that separate pools per channel is complete overkill 
anyway.)

Thanks,
Robin.

> +
>   struct d350_chan {
>   	struct virt_dma_chan vc;
>   	struct d350_desc *desc;
>   	void __iomem *base;
> +	struct dma_pool *cmd_pool;
>   	int irq;
>   	enum dma_status status;
>   	dma_cookie_t cookie;
> @@ -210,7 +219,14 @@ static inline struct d350_desc *to_d350_desc(struct virt_dma_desc *vd)
>   
>   static void d350_desc_free(struct virt_dma_desc *vd)
>   {
> -	kfree(to_d350_desc(vd));
> +	struct d350_chan *dch = to_d350_chan(vd->tx.chan);
> +	struct d350_desc *desc = to_d350_desc(vd);
> +	int i;
> +
> +	for (i = 0; i < desc->sglen; i++)
> +		dma_pool_free(dch->cmd_pool, desc->sg[i].command, desc->sg[i].phys);
> +
> +	kfree(desc);
>   }
>   
>   static struct dma_async_tx_descriptor *d350_prep_memcpy(struct dma_chan *chan,
> @@ -218,22 +234,32 @@ static struct dma_async_tx_descriptor *d350_prep_memcpy(struct dma_chan *chan,
>   {
>   	struct d350_chan *dch = to_d350_chan(chan);
>   	struct d350_desc *desc;
> +	struct d350_sg *sg;
> +	dma_addr_t phys;
>   	u32 *cmd;
>   
> -	desc = kzalloc(sizeof(*desc), GFP_NOWAIT);
> +	desc = kzalloc(struct_size(desc, sg, 1), GFP_NOWAIT);
>   	if (!desc)
>   		return NULL;
>   
> -	desc->tsz = __ffs(len | dest | src | (1 << dch->tsz));
> -	desc->xsize = lower_16_bits(len >> desc->tsz);
> -	desc->xsizehi = upper_16_bits(len >> desc->tsz);
> +	sg = &desc->sg[0];
> +	sg->command = dma_pool_zalloc(dch->cmd_pool, GFP_NOWAIT, &phys);
> +	if (unlikely(!sg->command)) {
> +		kfree(desc);
> +		return NULL;
> +	}
> +	sg->phys = phys;
> +
> +	sg->tsz = __ffs(len | dest | src | (1 << dch->tsz));
> +	sg->xsize = lower_16_bits(len >> sg->tsz);
> +	sg->xsizehi = upper_16_bits(len >> sg->tsz);
>   
> -	cmd = desc->command;
> +	cmd = sg->command;
>   	cmd[0] = LINK_CTRL | LINK_SRCADDR | LINK_SRCADDRHI | LINK_DESADDR |
>   		 LINK_DESADDRHI | LINK_XSIZE | LINK_XSIZEHI | LINK_SRCTRANSCFG |
>   		 LINK_DESTRANSCFG | LINK_XADDRINC | LINK_LINKADDR;
>   
> -	cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, desc->tsz) |
> +	cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, sg->tsz) |
>   		 FIELD_PREP(CH_CTRL_XTYPE, CH_CTRL_XTYPE_CONTINUE) |
>   		 FIELD_PREP(CH_CTRL_DONETYPE, CH_CTRL_DONETYPE_CMD);
>   
> @@ -241,13 +267,15 @@ static struct dma_async_tx_descriptor *d350_prep_memcpy(struct dma_chan *chan,
>   	cmd[3] = upper_32_bits(src);
>   	cmd[4] = lower_32_bits(dest);
>   	cmd[5] = upper_32_bits(dest);
> -	cmd[6] = FIELD_PREP(CH_XY_SRC, desc->xsize) | FIELD_PREP(CH_XY_DES, desc->xsize);
> -	cmd[7] = FIELD_PREP(CH_XY_SRC, desc->xsizehi) | FIELD_PREP(CH_XY_DES, desc->xsizehi);
> +	cmd[6] = FIELD_PREP(CH_XY_SRC, sg->xsize) | FIELD_PREP(CH_XY_DES, sg->xsize);
> +	cmd[7] = FIELD_PREP(CH_XY_SRC, sg->xsizehi) | FIELD_PREP(CH_XY_DES, sg->xsizehi);
>   	cmd[8] = dch->coherent ? TRANSCFG_WB : TRANSCFG_NC;
>   	cmd[9] = dch->coherent ? TRANSCFG_WB : TRANSCFG_NC;
>   	cmd[10] = FIELD_PREP(CH_XY_SRC, 1) | FIELD_PREP(CH_XY_DES, 1);
>   	cmd[11] = 0;
>   
> +	mb();
> +
>   	return vchan_tx_prep(&dch->vc, &desc->vd, flags);
>   }
>   
> @@ -256,34 +284,46 @@ static struct dma_async_tx_descriptor *d350_prep_memset(struct dma_chan *chan,
>   {
>   	struct d350_chan *dch = to_d350_chan(chan);
>   	struct d350_desc *desc;
> +	struct d350_sg *sg;
> +	dma_addr_t phys;
>   	u32 *cmd;
>   
> -	desc = kzalloc(sizeof(*desc), GFP_NOWAIT);
> +	desc = kzalloc(struct_size(desc, sg, 1), GFP_NOWAIT);
>   	if (!desc)
>   		return NULL;
>   
> -	desc->tsz = __ffs(len | dest | (1 << dch->tsz));
> -	desc->xsize = lower_16_bits(len >> desc->tsz);
> -	desc->xsizehi = upper_16_bits(len >> desc->tsz);
> +	sg = &desc->sg[0];
> +	sg->command = dma_pool_zalloc(dch->cmd_pool, GFP_NOWAIT, &phys);
> +	if (unlikely(!sg->command)) {
> +		kfree(desc);
> +		return NULL;
> +	}
> +	sg->phys = phys;
> +
> +	sg->tsz = __ffs(len | dest | (1 << dch->tsz));
> +	sg->xsize = lower_16_bits(len >> sg->tsz);
> +	sg->xsizehi = upper_16_bits(len >> sg->tsz);
>   
> -	cmd = desc->command;
> +	cmd = sg->command;
>   	cmd[0] = LINK_CTRL | LINK_DESADDR | LINK_DESADDRHI |
>   		 LINK_XSIZE | LINK_XSIZEHI | LINK_DESTRANSCFG |
>   		 LINK_XADDRINC | LINK_FILLVAL | LINK_LINKADDR;
>   
> -	cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, desc->tsz) |
> +	cmd[1] = FIELD_PREP(CH_CTRL_TRANSIZE, sg->tsz) |
>   		 FIELD_PREP(CH_CTRL_XTYPE, CH_CTRL_XTYPE_FILL) |
>   		 FIELD_PREP(CH_CTRL_DONETYPE, CH_CTRL_DONETYPE_CMD);
>   
>   	cmd[2] = lower_32_bits(dest);
>   	cmd[3] = upper_32_bits(dest);
> -	cmd[4] = FIELD_PREP(CH_XY_DES, desc->xsize);
> -	cmd[5] = FIELD_PREP(CH_XY_DES, desc->xsizehi);
> +	cmd[4] = FIELD_PREP(CH_XY_DES, sg->xsize);
> +	cmd[5] = FIELD_PREP(CH_XY_DES, sg->xsizehi);
>   	cmd[6] = dch->coherent ? TRANSCFG_WB : TRANSCFG_NC;
>   	cmd[7] = FIELD_PREP(CH_XY_DES, 1);
>   	cmd[8] = (u8)value * 0x01010101;
>   	cmd[9] = 0;
>   
> +	mb();
> +
>   	return vchan_tx_prep(&dch->vc, &desc->vd, flags);
>   }
>   
> @@ -319,8 +359,9 @@ static int d350_resume(struct dma_chan *chan)
>   
>   static u32 d350_get_residue(struct d350_chan *dch)
>   {
> -	u32 res, xsize, xsizehi, hi_new;
> -	int retries = 3; /* 1st time unlucky, 2nd improbable, 3rd just broken */
> +	u32 res, xsize, xsizehi, linkaddr, linkaddrhi, hi_new;
> +	int i, sgcur, retries = 3; /* 1st time unlucky, 2nd improbable, 3rd just broken */
> +	struct d350_desc *desc = dch->desc;
>   
>   	hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
>   	do {
> @@ -329,10 +370,26 @@ static u32 d350_get_residue(struct d350_chan *dch)
>   		hi_new = readl_relaxed(dch->base + CH_XSIZEHI);
>   	} while (xsizehi != hi_new && --retries);
>   
> +	hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
> +	do {
> +		linkaddrhi = hi_new;
> +		linkaddr = readl_relaxed(dch->base + CH_LINKADDR);
> +		hi_new = readl_relaxed(dch->base + CH_LINKADDRHI);
> +	} while (linkaddrhi != hi_new && --retries);
> +
> +	for (i = 0; i < desc->sglen; i++) {
> +		if (desc->sg[i].phys == (((u64)linkaddrhi << 32) | (linkaddr & ~CH_LINKADDR_EN)))
> +			sgcur = i;
> +	}
> +
>   	res = FIELD_GET(CH_XY_DES, xsize);
>   	res |= FIELD_GET(CH_XY_DES, xsizehi) << 16;
> +	res <<= desc->sg[sgcur].tsz;
> +
> +	for (i = sgcur + 1; i < desc->sglen; i++)
> +		res += (((u32)desc->sg[i].xsizehi << 16 | desc->sg[i].xsize) << desc->sg[i].tsz);
>   
> -	return res << dch->desc->tsz;
> +	return res;
>   }
>   
>   static int d350_terminate_all(struct dma_chan *chan)
> @@ -365,7 +422,13 @@ static void d350_synchronize(struct dma_chan *chan)
>   
>   static u32 d350_desc_bytes(struct d350_desc *desc)
>   {
> -	return ((u32)desc->xsizehi << 16 | desc->xsize) << desc->tsz;
> +	int i;
> +	u32 bytes = 0;
> +
> +	for (i = 0; i < desc->sglen; i++)
> +		bytes += (((u32)desc->sg[i].xsizehi << 16 | desc->sg[i].xsize) << desc->sg[i].tsz);
> +
> +	return bytes;
>   }
>   
>   static enum dma_status d350_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
> @@ -415,8 +478,8 @@ static void d350_start_next(struct d350_chan *dch)
>   	dch->cookie = dch->desc->vd.tx.cookie;
>   	dch->residue = d350_desc_bytes(dch->desc);
>   
> -	hdr = dch->desc->command[0];
> -	reg = &dch->desc->command[1];
> +	hdr = dch->desc->sg[0].command[0];
> +	reg = &dch->desc->sg[0].command[1];
>   
>   	if (hdr & LINK_INTREN)
>   		writel_relaxed(*reg++, dch->base + CH_INTREN);
> @@ -512,11 +575,29 @@ static irqreturn_t d350_irq(int irq, void *data)
>   static int d350_alloc_chan_resources(struct dma_chan *chan)
>   {
>   	struct d350_chan *dch = to_d350_chan(chan);
> -	int ret = request_irq(dch->irq, d350_irq, IRQF_SHARED,
> -			      dev_name(&dch->vc.chan.dev->device), dch);
> -	if (!ret)
> -		writel_relaxed(CH_INTREN_DONE | CH_INTREN_ERR, dch->base + CH_INTREN);
> +	int ret;
> +
> +	dch->cmd_pool = dma_pool_create(dma_chan_name(chan),
> +					  chan->device->dev,
> +					  D350_MAX_CMDS * sizeof(u32),
> +					  sizeof(u32), 0);
> +	if (!dch->cmd_pool) {
> +		dev_err(chan->device->dev, "No memory for cmd pool\n");
> +		return -ENOMEM;
> +	}
> +
> +	ret = request_irq(dch->irq, d350_irq, 0,
> +			  dev_name(&dch->vc.chan.dev->device), dch);
> +	if (ret < 0)
> +		goto err_irq;
> +
> +	writel_relaxed(CH_INTREN_DONE | CH_INTREN_ERR, dch->base + CH_INTREN);
> +
> +	return 0;
>   
> +err_irq:
> +	dma_pool_destroy(dch->cmd_pool);
> +	dch->cmd_pool = NULL;
>   	return ret;
>   }
>   
> @@ -527,6 +608,8 @@ static void d350_free_chan_resources(struct dma_chan *chan)
>   	writel_relaxed(0, dch->base + CH_INTREN);
>   	free_irq(dch->irq, dch);
>   	vchan_free_chan_resources(&dch->vc);
> +	dma_pool_destroy(dch->cmd_pool);
> +	dch->cmd_pool = NULL;
>   }
>   
>   static int d350_probe(struct platform_device *pdev)