[v1] Support ECSPI dynamic burst feature for DMA mode

[PATCH 5/6] spi: imx: support dynamic burst length for ECSPI DMA mode

Posted by Carlos Song 2 months, 2 weeks ago

ECSPI transfers only one word per frame in DMA mode, causing SCLK stalls
between words due to BURST_LENGTH updates, which significantly impacts
performance.

To improve throughput, configure BURST_LENGTH as large as possible (up to
512 bytes per frame) instead of word length. This avoids delays between
words. When transfer length is not 4-byte aligned, use bounce buffers to
align data for DMA. TX uses aligned words for TXFIFO, while RX trims DMA
buffer data after transfer completion.

Introduce a new dma_package structure to store:
  1. BURST_LENGTH values for each DMA request
  2. Variables for DMA submission
  3. DMA transmission length and actual data length

Handle three cases:
  - len <= 512 bytes: one package, BURST_LENGTH = len * 8 - 1
  - len > 512 and aligned: one package, BURST_LENGTH = max (512 bytes)
  - len > 512 and unaligned: two packages, second for tail data

Performance test (spidev_test @10MHz, 4KB):
  Before: tx/rx ~6651.9 kbps
  After:  tx/rx ~9922.2 kbps (~50% improvement)

For compatibility with slow SPI devices, add configurable word delay in
DMA mode. When word delay is set, dynamic burst is disabled and
BURST_LENGTH equals word length.

Signed-off-by: Carlos Song <carlos.song@nxp.com>
---
 drivers/spi/spi-imx.c | 409 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 373 insertions(+), 36 deletions(-)

diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index 42f64d9535c9..f02a47fbba8a 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -60,6 +60,7 @@ MODULE_PARM_DESC(polling_limit_us,
 #define MX51_ECSPI_CTRL_MAX_BURST	512
 /* The maximum bytes that IMX53_ECSPI can transfer in target mode.*/
 #define MX53_MAX_TRANSFER_BYTES		512
+#define BYTES_PER_32BITS_WORD		4
 
 enum spi_imx_devtype {
 	IMX1_CSPI,
@@ -95,6 +96,16 @@ struct spi_imx_devtype_data {
 	enum spi_imx_devtype devtype;
 };
 
+struct dma_data_package {
+	u32 cmd_word;
+	void *dma_rx_buf;
+	void *dma_tx_buf;
+	dma_addr_t	dma_tx_addr;
+	dma_addr_t	dma_rx_addr;
+	int dma_len;
+	int data_len;
+};
+
 struct spi_imx_data {
 	struct spi_controller *controller;
 	struct device *dev;
@@ -130,6 +141,9 @@ struct spi_imx_data {
 	u32 wml;
 	struct completion dma_rx_completion;
 	struct completion dma_tx_completion;
+	struct dma_data_package *dma_data;
+	int dma_package_num;
+	int rx_offset;
 
 	const struct spi_imx_devtype_data *devtype_data;
 };
@@ -189,6 +203,9 @@ MXC_SPI_BUF_TX(u16)
 MXC_SPI_BUF_RX(u32)
 MXC_SPI_BUF_TX(u32)
 
+/* Align to cache line to avoid swiotlo bounce */
+#define DMA_CACHE_ALIGNED_LEN(x) ALIGN((x), dma_get_cache_alignment())
+
 /* First entry is reserved, second entry is valid only if SDHC_SPIEN is set
  * (which is currently not the case in this driver)
  */
@@ -253,6 +270,14 @@ static bool spi_imx_can_dma(struct spi_controller *controller, struct spi_device
 	if (transfer->len < spi_imx->devtype_data->fifo_size)
 		return false;
 
+	/* DMA only can transmit data in bytes */
+	if (spi_imx->bits_per_word != 8 && spi_imx->bits_per_word != 16 &&
+	    spi_imx->bits_per_word != 32)
+		return false;
+
+	if (transfer->len >= MAX_SDMA_BD_BYTES)
+		return false;
+
 	spi_imx->dynamic_burst = 0;
 
 	return true;
@@ -1398,8 +1423,6 @@ static int spi_imx_sdma_init(struct device *dev, struct spi_imx_data *spi_imx,
 
 	init_completion(&spi_imx->dma_rx_completion);
 	init_completion(&spi_imx->dma_tx_completion);
-	controller->can_dma = spi_imx_can_dma;
-	controller->max_dma_len = MAX_SDMA_BD_BYTES;
 	spi_imx->controller->flags = SPI_CONTROLLER_MUST_RX |
 					 SPI_CONTROLLER_MUST_TX;
 
@@ -1437,10 +1460,252 @@ static int spi_imx_calculate_timeout(struct spi_imx_data *spi_imx, int size)
 	return secs_to_jiffies(2 * timeout);
 }
 
+static void spi_imx_dma_unmap(struct spi_imx_data *spi_imx,
+			      struct dma_data_package *dma_data)
+{
+	struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
+	struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;
+
+	dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
+			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+			 DMA_TO_DEVICE);
+	dma_unmap_single(rx_dev, dma_data->dma_rx_addr,
+			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+			 DMA_FROM_DEVICE);
+}
+
+static void spi_imx_dma_rx_data_handle(struct spi_imx_data *spi_imx,
+				       struct dma_data_package *dma_data, void *rx_buf,
+				       bool word_delay)
+{
+#ifdef __LITTLE_ENDIAN
+	unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
+	u32 *temp = dma_data->dma_rx_buf;
+#endif
+	void *copy_ptr;
+	int unaligned;
+
+#ifdef __LITTLE_ENDIAN
+	/*
+	 * On little-endian CPUs, adjust byte order:
+	 * - Swap bytes when bpw = 8
+	 * - Swap half-words when bpw = 16
+	 * This ensures correct data ordering for DMA transfers.
+	 */
+	if (!word_delay) {
+		for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
+			if (bytes_per_word == 1)
+				swab32s(temp + i);
+			else if (bytes_per_word == 2)
+				swahw32s(temp + i);
+		}
+	}
+#endif
+
+	if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
+		unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
+		copy_ptr = (u8 *)dma_data->dma_rx_buf + BYTES_PER_32BITS_WORD - unaligned;
+	} else {
+		copy_ptr = dma_data->dma_rx_buf;
+	}
+
+	memcpy(rx_buf, copy_ptr, dma_data->data_len);
+}
+
+static int spi_imx_dma_map(struct spi_imx_data *spi_imx,
+			   struct dma_data_package *dma_data)
+{
+	struct spi_controller *controller = spi_imx->controller;
+	struct device *tx_dev = controller->dma_tx->device->dev;
+	struct device *rx_dev = controller->dma_rx->device->dev;
+
+	dma_data->dma_tx_addr = dma_map_single(tx_dev, dma_data->dma_tx_buf,
+					       DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+					       DMA_TO_DEVICE);
+	if (dma_mapping_error(tx_dev, dma_data->dma_tx_addr)) {
+		dev_err(spi_imx->dev, "DMA TX map failed\n");
+		return -EINVAL;
+	}
+
+	dma_data->dma_rx_addr = dma_map_single(rx_dev, dma_data->dma_rx_buf,
+					       DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+					       DMA_FROM_DEVICE);
+	if (dma_mapping_error(rx_dev, dma_data->dma_rx_addr)) {
+		dev_err(spi_imx->dev, "DMA RX map failed\n");
+		goto rx_map_err;
+	}
+
+	return 0;
+
+rx_map_err:
+	dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
+			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+			 DMA_TO_DEVICE);
+	return -EINVAL;
+}
+
+static int spi_imx_dma_tx_data_handle(struct spi_imx_data *spi_imx,
+				      struct dma_data_package *dma_data,
+				      const void *tx_buf,
+				      bool word_delay)
+{
+#ifdef __LITTLE_ENDIAN
+	unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
+	u32 *temp;
+#endif
+	void *copy_ptr;
+	int unaligned;
+
+	if (word_delay) {
+		dma_data->dma_len = dma_data->data_len;
+	} else {
+		/*
+		 * As per the reference manual, when burst length = 32*n + m bits, ECSPI
+		 * sends m LSB bits in the first word, followed by n full 32-bit words.
+		 * Since actual data may not be 4-byte aligned, allocate DMA TX/RX buffers
+		 * to ensure alignment. For TX, DMA pushes 4-byte aligned words to TXFIFO,
+		 * while ECSPI uses BURST_LENGTH settings to maintain correct bit count.
+		 * For RX, DMA receives 32-bit words from RXFIFO; after transfer completes,
+		 * trim the DMA RX buffer and copy the actual data to rx_buf.
+		 */
+		dma_data->dma_len = ALIGN(dma_data->data_len, BYTES_PER_32BITS_WORD);
+	}
+
+	dma_data->dma_tx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);
+	if (!dma_data->dma_tx_buf)
+		return -ENOMEM;
+
+	dma_data->dma_rx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);
+	if (!dma_data->dma_rx_buf) {
+		kfree(dma_data->dma_tx_buf);
+		return -ENOMEM;
+	}
+
+	if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
+		unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
+		copy_ptr = (u8 *)dma_data->dma_tx_buf + BYTES_PER_32BITS_WORD - unaligned;
+	} else {
+		copy_ptr = dma_data->dma_tx_buf;
+	}
+
+	memcpy(copy_ptr, tx_buf, dma_data->data_len);
+
+	/*
+	 * When word_delay is enabled, DMA transfers an entire word in one minor loop.
+	 * In this case, no data requires additional handling.
+	 */
+	if (word_delay)
+		return 0;
+
+#ifdef __LITTLE_ENDIAN
+	/*
+	 * On little-endian CPUs, adjust byte order:
+	 * - Swap bytes when bpw = 8
+	 * - Swap half-words when bpw = 16
+	 * This ensures correct data ordering for DMA transfers.
+	 */
+	temp = dma_data->dma_tx_buf;
+	for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
+		if (bytes_per_word == 1)
+			swab32s(temp + i);
+		else if (bytes_per_word == 2)
+			swahw32s(temp + i);
+	}
+#endif
+
+	return 0;
+}
+
+static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
+				    struct spi_transfer *transfer,
+				    bool word_delay)
+{
+	u32 pre_bl, tail_bl;
+	u32 ctrl;
+	int ret;
+
+	/*
+	 * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds 512
+	 * and is not a multiple of 512, a tail transfer is required. In this case,
+	 * an extra DMA request is issued for the remaining data.
+	 */
+	ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
+	if (word_delay) {
+		/*
+		 * When SPI IMX need to support word delay, according to "Sample Period Control
+		 * Register" shows, The Sample Period Control Register (ECSPI_PERIODREG)
+		 * provides software a way to insert delays (wait states) between consecutive
+		 * SPI transfers. As a result, ECSPI can only transfer one word per frame, and
+		 * the delay occurs between frames.
+		 */
+		spi_imx->dma_package_num = 1;
+		pre_bl = spi_imx->bits_per_word - 1;
+	} else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
+		spi_imx->dma_package_num = 1;
+		pre_bl = transfer->len * BITS_PER_BYTE - 1;
+	} else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
+		spi_imx->dma_package_num = 1;
+		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
+	} else {
+		spi_imx->dma_package_num = 2;
+		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
+		tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
+	}
+
+	spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
+					  sizeof(struct dma_data_package),
+					  GFP_KERNEL | __GFP_ZERO);
+	if (!spi_imx->dma_data) {
+		dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
+		return -ENOMEM;
+	}
+
+	if (spi_imx->dma_package_num == 1) {
+		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+		spi_imx->dma_data[0].cmd_word = ctrl;
+		spi_imx->dma_data[0].data_len = transfer->len;
+		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
+						 word_delay);
+		if (ret) {
+			kfree(spi_imx->dma_data);
+			return ret;
+		}
+	} else {
+		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+		spi_imx->dma_data[0].cmd_word = ctrl;
+		spi_imx->dma_data[0].data_len = DIV_ROUND_DOWN_ULL(transfer->len,
+								   MX51_ECSPI_CTRL_MAX_BURST)
+								   * MX51_ECSPI_CTRL_MAX_BURST;
+		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
+						 false);
+		if (ret) {
+			kfree(spi_imx->dma_data);
+			return ret;
+		}
+
+		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+		ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+		spi_imx->dma_data[1].cmd_word = ctrl;
+		spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
+		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
+						 transfer->tx_buf + spi_imx->dma_data[0].data_len,
+						 false);
+		if (ret) {
+			kfree(spi_imx->dma_data[0].dma_tx_buf);
+			kfree(spi_imx->dma_data[0].dma_rx_buf);
+			kfree(spi_imx->dma_data);
+		}
+	}
+
+	return 0;
+}
+
 static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
+			      struct dma_data_package *dma_data,
 			      struct spi_transfer *transfer)
 {
-	struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
 	struct spi_controller *controller = spi_imx->controller;
 	struct dma_async_tx_descriptor *desc_tx, *desc_rx;
 	unsigned long transfer_timeout;
@@ -1451,9 +1716,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
 	 * The TX DMA setup starts the transfer, so make sure RX is configured
 	 * before TX.
 	 */
-	desc_rx = dmaengine_prep_slave_sg(controller->dma_rx,
-					  rx->sgl, rx->nents, DMA_DEV_TO_MEM,
-					  DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	desc_rx = dmaengine_prep_slave_single(controller->dma_rx, dma_data->dma_rx_addr,
+					      dma_data->dma_len, DMA_DEV_TO_MEM,
+					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!desc_rx) {
 		transfer->error |= SPI_TRANS_FAIL_NO_START;
 		return -EINVAL;
@@ -1471,9 +1736,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
 	reinit_completion(&spi_imx->dma_rx_completion);
 	dma_async_issue_pending(controller->dma_rx);
 
-	desc_tx = dmaengine_prep_slave_sg(controller->dma_tx,
-					  tx->sgl, tx->nents, DMA_MEM_TO_DEV,
-					  DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	desc_tx = dmaengine_prep_slave_single(controller->dma_tx, dma_data->dma_tx_addr,
+					      dma_data->dma_len, DMA_MEM_TO_DEV,
+					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!desc_tx)
 		goto dmaengine_terminate_rx;
 
@@ -1521,16 +1786,16 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
 }
 
 static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
-				     struct spi_transfer *transfer)
+				     struct dma_data_package *dma_data,
+				     bool word_delay)
 {
-	struct sg_table *rx = &transfer->rx_sg;
-	struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents);
-	unsigned int bytes_per_word, i;
+	unsigned int bytes_per_word = word_delay ?
+				      spi_imx_bytes_per_word(spi_imx->bits_per_word) :
+				      BYTES_PER_32BITS_WORD;
+	unsigned int i;
 
-	/* Get the right burst length from the last sg to ensure no tail data */
-	bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word);
 	for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) {
-		if (!(sg_dma_len(last_sg) % (i * bytes_per_word)))
+		if (!dma_data->dma_len % (i * bytes_per_word))
 			break;
 	}
 	/* Use 1 as wml in case no available burst length got */
@@ -1540,25 +1805,29 @@ static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
 	spi_imx->wml = i;
 }
 
-static int spi_imx_dma_configure(struct spi_controller *controller)
+static int spi_imx_dma_configure(struct spi_controller *controller, bool word_delay)
 {
 	int ret;
 	enum dma_slave_buswidth buswidth;
 	struct dma_slave_config rx = {}, tx = {};
 	struct spi_imx_data *spi_imx = spi_controller_get_devdata(controller);
 
-	switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
-	case 4:
+	if (word_delay) {
+		switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
+		case 4:
+			buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
+			break;
+		case 2:
+			buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
+			break;
+		case 1:
+			buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
 		buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
-		break;
-	case 2:
-		buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
-		break;
-	case 1:
-		buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
-		break;
-	default:
-		return -EINVAL;
 	}
 
 	tx.direction = DMA_MEM_TO_DEV;
@@ -1584,15 +1853,17 @@ static int spi_imx_dma_configure(struct spi_controller *controller)
 	return 0;
 }
 
-static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
-				struct spi_transfer *transfer)
+static int spi_imx_dma_package_transfer(struct spi_imx_data *spi_imx,
+					struct dma_data_package *dma_data,
+					struct spi_transfer *transfer,
+					bool word_delay)
 {
 	struct spi_controller *controller = spi_imx->controller;
 	int ret;
 
-	spi_imx_dma_max_wml_find(spi_imx, transfer);
+	spi_imx_dma_max_wml_find(spi_imx, dma_data, word_delay);
 
-	ret = spi_imx_dma_configure(controller);
+	ret = spi_imx_dma_configure(controller, word_delay);
 	if (ret)
 		goto dma_failure_no_start;
 
@@ -1603,10 +1874,17 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
 	}
 	spi_imx->devtype_data->setup_wml(spi_imx);
 
-	ret = spi_imx_dma_submit(spi_imx, transfer);
+	ret = spi_imx_dma_submit(spi_imx, dma_data, transfer);
 	if (ret)
 		return ret;
 
+	/* Trim the DMA RX buffer and copy the actual data to rx_buf */
+	dma_sync_single_for_cpu(controller->dma_rx->device->dev, dma_data->dma_rx_addr,
+				dma_data->dma_len, DMA_FROM_DEVICE);
+	spi_imx_dma_rx_data_handle(spi_imx, dma_data, transfer->rx_buf + spi_imx->rx_offset,
+				   word_delay);
+	spi_imx->rx_offset += dma_data->data_len;
+
 	return 0;
 /* fallback to pio */
 dma_failure_no_start:
@@ -1614,6 +1892,60 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
 	return ret;
 }
 
+static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
+				struct spi_transfer *transfer)
+{
+	bool word_delay = transfer->word_delay.value != 0;
+	int ret;
+	int i;
+
+	ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
+	if (ret < 0) {
+		transfer->error |= SPI_TRANS_FAIL_NO_START;
+		dev_err(spi_imx->dev, "DMA data prepare fail\n");
+		goto fallback_pio;
+	}
+
+	spi_imx->rx_offset = 0;
+
+	/* Each dma_package performs a separate DMA transfer once */
+	for (i = 0; i < spi_imx->dma_package_num; i++) {
+		ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
+		if (ret < 0) {
+			transfer->error |= SPI_TRANS_FAIL_NO_START;
+			dev_err(spi_imx->dev, "DMA map fail\n");
+			break;
+		}
+
+		/* Update the CTRL register BL field */
+		writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
+
+		ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
+						   transfer, word_delay);
+
+		/* Whether the dma transmission is successful or not, dma unmap is necessary */
+		spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
+
+		if (ret < 0) {
+			dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
+			break;
+		}
+	}
+
+	for (int j = 0; j < spi_imx->dma_package_num; j++) {
+		kfree(spi_imx->dma_data[j].dma_tx_buf);
+		kfree(spi_imx->dma_data[j].dma_rx_buf);
+	}
+	kfree(spi_imx->dma_data);
+
+fallback_pio:
+	/* If no any dma package data is transferred, fallback to PIO mode transfer */
+	if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
+		transfer->error &= !SPI_TRANS_FAIL_NO_START;
+
+	return ret;
+}
+
 static int spi_imx_pio_transfer(struct spi_device *spi,
 				struct spi_transfer *transfer)
 {
@@ -1780,9 +2112,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
 	 * transfer, the SPI transfer has already been mapped, so we
 	 * have to do the DMA transfer here.
 	 */
-	if (spi_imx->usedma)
-		return spi_imx_dma_transfer(spi_imx, transfer);
-
+	if (spi_imx->usedma) {
+		ret = spi_imx_dma_transfer(spi_imx, transfer);
+		if (transfer->error & SPI_TRANS_FAIL_NO_START) {
+			spi_imx->usedma = false;
+			return spi_imx_pio_transfer(spi, transfer);
+		}
+		return ret;
+	}
 	/* run in polling mode for short transfers */
 	if (transfer->len == 1 || (polling_limit_us &&
 				   spi_imx_transfer_estimate_time_us(transfer) < polling_limit_us))
-- 
2.34.1

Re: [PATCH 5/6] spi: imx: support dynamic burst length for ECSPI DMA mode

Posted by Marc Kleine-Budde 2 months, 2 weeks ago

On 25.11.2025 18:06:17, Carlos Song wrote:
>  static int spi_imx_pio_transfer(struct spi_device *spi,
>  				struct spi_transfer *transfer)
>  {
> @@ -1780,9 +2112,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
>  	 * transfer, the SPI transfer has already been mapped, so we
>  	 * have to do the DMA transfer here.
>  	 */
> -	if (spi_imx->usedma)
> -		return spi_imx_dma_transfer(spi_imx, transfer);
> -
> +	if (spi_imx->usedma) {
> +		ret = spi_imx_dma_transfer(spi_imx, transfer);
> +		if (transfer->error & SPI_TRANS_FAIL_NO_START) {
> +			spi_imx->usedma = false;
> +			return spi_imx_pio_transfer(spi, transfer);
> +		}
> +		return ret;
> +	}

Why do you do this? AFAICS the framework already does this for you see:
spi_transfer_one_message().

Marc

-- 
Pengutronix e.K.                 | Marc Kleine-Budde          |
Embedded Linux                   | https://www.pengutronix.de |
Vertretung Nürnberg              | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax:   +49-5121-206917-9   |

Re: [PATCH 5/6] spi: imx: support dynamic burst length for ECSPI DMA mode

Posted by Marc Kleine-Budde 2 months, 2 weeks ago

On 25.11.2025 18:06:17, Carlos Song wrote:
> +static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
> +				    struct spi_transfer *transfer,
> +				    bool word_delay)
> +{
> +	u32 pre_bl, tail_bl;
> +	u32 ctrl;
> +	int ret;
> +
> +	/*
> +	 * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds 512
> +	 * and is not a multiple of 512, a tail transfer is required. In this case,
> +	 * an extra DMA request is issued for the remaining data.
> +	 */
> +	ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
> +	if (word_delay) {
> +		/*
> +		 * When SPI IMX need to support word delay, according to "Sample Period Control
> +		 * Register" shows, The Sample Period Control Register (ECSPI_PERIODREG)
> +		 * provides software a way to insert delays (wait states) between consecutive
> +		 * SPI transfers. As a result, ECSPI can only transfer one word per frame, and
> +		 * the delay occurs between frames.
> +		 */
> +		spi_imx->dma_package_num = 1;
> +		pre_bl = spi_imx->bits_per_word - 1;
> +	} else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
> +		spi_imx->dma_package_num = 1;
> +		pre_bl = transfer->len * BITS_PER_BYTE - 1;
> +	} else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
> +		spi_imx->dma_package_num = 1;
> +		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> +	} else {
> +		spi_imx->dma_package_num = 2;
> +		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> +		tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
> +	}
> +
> +	spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
> +					  sizeof(struct dma_data_package),
> +					  GFP_KERNEL | __GFP_ZERO);
> +	if (!spi_imx->dma_data) {
> +		dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
> +		return -ENOMEM;
> +	}
> +
> +	if (spi_imx->dma_package_num == 1) {
> +		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> +		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> +		spi_imx->dma_data[0].cmd_word = ctrl;
> +		spi_imx->dma_data[0].data_len = transfer->len;
> +		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
> +						 word_delay);
> +		if (ret) {
> +			kfree(spi_imx->dma_data);
> +			return ret;
> +		}
> +	} else {
> +		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> +		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> +		spi_imx->dma_data[0].cmd_word = ctrl;
> +		spi_imx->dma_data[0].data_len = DIV_ROUND_DOWN_ULL(transfer->len,
> +								   MX51_ECSPI_CTRL_MAX_BURST)
> +								   * MX51_ECSPI_CTRL_MAX_BURST;

What mathematical operation do you want to do? Why do you use a 64 bit
div? What about round_down()?

> +		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
> +						 false);
> +		if (ret) {
> +			kfree(spi_imx->dma_data);
> +			return ret;
> +		}
> +
> +		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> +		ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> +		spi_imx->dma_data[1].cmd_word = ctrl;
> +		spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
> +		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
> +						 transfer->tx_buf + spi_imx->dma_data[0].data_len,
> +						 false);
> +		if (ret) {
> +			kfree(spi_imx->dma_data[0].dma_tx_buf);
> +			kfree(spi_imx->dma_data[0].dma_rx_buf);
> +			kfree(spi_imx->dma_data);
> +		}
> +	}
> +
> +	return 0;
> +}

regards,
Marc

--
Pengutronix e.K.                 | Marc Kleine-Budde          |
Embedded Linux                   | https://www.pengutronix.de |
Vertretung Nürnberg              | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax:   +49-5121-206917-9   |

Re: [PATCH 5/6] spi: imx: support dynamic burst length for ECSPI DMA mode

Posted by Marc Kleine-Budde 2 months, 2 weeks ago

On 25.11.2025 18:06:17, Carlos Song wrote:
> +static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> +				struct spi_transfer *transfer)
> +{
> +	bool word_delay = transfer->word_delay.value != 0;
> +	int ret;
> +	int i;
> +
> +	ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
> +	if (ret < 0) {
> +		transfer->error |= SPI_TRANS_FAIL_NO_START;
> +		dev_err(spi_imx->dev, "DMA data prepare fail\n");
> +		goto fallback_pio;
> +	}
> +
> +	spi_imx->rx_offset = 0;
> +
> +	/* Each dma_package performs a separate DMA transfer once */
> +	for (i = 0; i < spi_imx->dma_package_num; i++) {
> +		ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
> +		if (ret < 0) {
> +			transfer->error |= SPI_TRANS_FAIL_NO_START;

What about:

if (i == 0)
        transfer->error |= SPI_TRANS_FAIL_NO_START;

instead of removing the later?
> +			dev_err(spi_imx->dev, "DMA map fail\n");
> +			break;
> +		}
> +
> +		/* Update the CTRL register BL field */
> +		writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
> +
> +		ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
> +						   transfer, word_delay);
> +
> +		/* Whether the dma transmission is successful or not, dma unmap is necessary */
> +		spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
> +
> +		if (ret < 0) {
> +			dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
> +			break;
> +		}
> +	}
> +
> +	for (int j = 0; j < spi_imx->dma_package_num; j++) {
> +		kfree(spi_imx->dma_data[j].dma_tx_buf);
> +		kfree(spi_imx->dma_data[j].dma_rx_buf);
> +	}
> +	kfree(spi_imx->dma_data);
> +
> +fallback_pio:
> +	/* If no any dma package data is transferred, fallback to PIO mode transfer */
> +	if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
> +		transfer->error &= !SPI_TRANS_FAIL_NO_START;
> +
> +	return ret;
> +}
> +

Marc

-- 
Pengutronix e.K.                 | Marc Kleine-Budde          |
Embedded Linux                   | https://www.pengutronix.de |
Vertretung Nürnberg              | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax:   +49-5121-206917-9   |

Re: [PATCH 5/6] spi: imx: support dynamic burst length for ECSPI DMA mode

Posted by Marc Kleine-Budde 2 months, 2 weeks ago

On 25.11.2025 18:06:17, Carlos Song wrote:
> +static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> +				struct spi_transfer *transfer)
> +{
> +	bool word_delay = transfer->word_delay.value != 0;
> +	int ret;
> +	int i;
> +
> +	ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
> +	if (ret < 0) {
> +		transfer->error |= SPI_TRANS_FAIL_NO_START;
> +		dev_err(spi_imx->dev, "DMA data prepare fail\n");
> +		goto fallback_pio;
> +	}
> +
> +	spi_imx->rx_offset = 0;
> +
> +	/* Each dma_package performs a separate DMA transfer once */
> +	for (i = 0; i < spi_imx->dma_package_num; i++) {
> +		ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
> +		if (ret < 0) {
> +			transfer->error |= SPI_TRANS_FAIL_NO_START;
> +			dev_err(spi_imx->dev, "DMA map fail\n");
> +			break;
> +		}
> +
> +		/* Update the CTRL register BL field */
> +		writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
> +
> +		ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
> +						   transfer, word_delay);
> +
> +		/* Whether the dma transmission is successful or not, dma unmap is necessary */
> +		spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
> +
> +		if (ret < 0) {
> +			dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
> +			break;
> +		}
> +	}
> +
> +	for (int j = 0; j < spi_imx->dma_package_num; j++) {
> +		kfree(spi_imx->dma_data[j].dma_tx_buf);
> +		kfree(spi_imx->dma_data[j].dma_rx_buf);
> +	}
> +	kfree(spi_imx->dma_data);
> +
> +fallback_pio:
> +	/* If no any dma package data is transferred, fallback to PIO mode transfer */
> +	if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
> +		transfer->error &= !SPI_TRANS_FAIL_NO_START;
                                   ^
this doesn't look correct, you probably want to use a "~", right?

Marc


-- 
Pengutronix e.K.                 | Marc Kleine-Budde          |
Embedded Linux                   | https://www.pengutronix.de |
Vertretung Nürnberg              | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax:   +49-5121-206917-9   |

Re: [PATCH 5/6] spi: imx: support dynamic burst length for ECSPI DMA mode

Posted by Marc Kleine-Budde 2 months, 2 weeks ago

On 25.11.2025 18:06:17, Carlos Song wrote:
> ECSPI transfers only one word per frame in DMA mode, causing SCLK stalls
> between words due to BURST_LENGTH updates, which significantly impacts
> performance.
>
> To improve throughput, configure BURST_LENGTH as large as possible (up to
> 512 bytes per frame) instead of word length. This avoids delays between
> words. When transfer length is not 4-byte aligned, use bounce buffers to
> align data for DMA. TX uses aligned words for TXFIFO, while RX trims DMA
> buffer data after transfer completion.
>
> Introduce a new dma_package structure to store:
>   1. BURST_LENGTH values for each DMA request
>   2. Variables for DMA submission
>   3. DMA transmission length and actual data length
>
> Handle three cases:
>   - len <= 512 bytes: one package, BURST_LENGTH = len * 8 - 1
>   - len > 512 and aligned: one package, BURST_LENGTH = max (512 bytes)
>   - len > 512 and unaligned: two packages, second for tail data
>
> Performance test (spidev_test @10MHz, 4KB):
>   Before: tx/rx ~6651.9 kbps
>   After:  tx/rx ~9922.2 kbps (~50% improvement)
>
> For compatibility with slow SPI devices, add configurable word delay in
> DMA mode. When word delay is set, dynamic burst is disabled and
> BURST_LENGTH equals word length.
>
> Signed-off-by: Carlos Song <carlos.song@nxp.com>
> ---
>  drivers/spi/spi-imx.c | 409 ++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 373 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
> index 42f64d9535c9..f02a47fbba8a 100644
> --- a/drivers/spi/spi-imx.c
> +++ b/drivers/spi/spi-imx.c
> @@ -60,6 +60,7 @@ MODULE_PARM_DESC(polling_limit_us,
>  #define MX51_ECSPI_CTRL_MAX_BURST	512
>  /* The maximum bytes that IMX53_ECSPI can transfer in target mode.*/
>  #define MX53_MAX_TRANSFER_BYTES		512
> +#define BYTES_PER_32BITS_WORD		4
>
>  enum spi_imx_devtype {
>  	IMX1_CSPI,
> @@ -95,6 +96,16 @@ struct spi_imx_devtype_data {
>  	enum spi_imx_devtype devtype;
>  };
>
> +struct dma_data_package {
> +	u32 cmd_word;
> +	void *dma_rx_buf;
> +	void *dma_tx_buf;
> +	dma_addr_t	dma_tx_addr;
> +	dma_addr_t	dma_rx_addr;

Better use uniform indention: here one space, not one tab.

> +	int dma_len;
> +	int data_len;
> +};
> +
>  struct spi_imx_data {
>  	struct spi_controller *controller;
>  	struct device *dev;
> @@ -130,6 +141,9 @@ struct spi_imx_data {
>  	u32 wml;
>  	struct completion dma_rx_completion;
>  	struct completion dma_tx_completion;
> +	struct dma_data_package *dma_data;

please add __counted_by(dma_package_num)

> +	int dma_package_num;
> +	int rx_offset;
>
>  	const struct spi_imx_devtype_data *devtype_data;
>  };
> @@ -189,6 +203,9 @@ MXC_SPI_BUF_TX(u16)
>  MXC_SPI_BUF_RX(u32)
>  MXC_SPI_BUF_TX(u32)
>
> +/* Align to cache line to avoid swiotlo bounce */
> +#define DMA_CACHE_ALIGNED_LEN(x) ALIGN((x), dma_get_cache_alignment())
> +
>  /* First entry is reserved, second entry is valid only if SDHC_SPIEN is set
>   * (which is currently not the case in this driver)
>   */
> @@ -253,6 +270,14 @@ static bool spi_imx_can_dma(struct spi_controller *controller, struct spi_device
>  	if (transfer->len < spi_imx->devtype_data->fifo_size)
>  		return false;
>
> +	/* DMA only can transmit data in bytes */
> +	if (spi_imx->bits_per_word != 8 && spi_imx->bits_per_word != 16 &&
> +	    spi_imx->bits_per_word != 32)
> +		return false;
> +
> +	if (transfer->len >= MAX_SDMA_BD_BYTES)
> +		return false;
> +
>  	spi_imx->dynamic_burst = 0;
>
>  	return true;
> @@ -1398,8 +1423,6 @@ static int spi_imx_sdma_init(struct device *dev, struct spi_imx_data *spi_imx,
>
>  	init_completion(&spi_imx->dma_rx_completion);
>  	init_completion(&spi_imx->dma_tx_completion);
> -	controller->can_dma = spi_imx_can_dma;
> -	controller->max_dma_len = MAX_SDMA_BD_BYTES;
>  	spi_imx->controller->flags = SPI_CONTROLLER_MUST_RX |
>  					 SPI_CONTROLLER_MUST_TX;
>
> @@ -1437,10 +1460,252 @@ static int spi_imx_calculate_timeout(struct spi_imx_data *spi_imx, int size)
>  	return secs_to_jiffies(2 * timeout);
>  }
>
> +static void spi_imx_dma_unmap(struct spi_imx_data *spi_imx,
> +			      struct dma_data_package *dma_data)
> +{
> +	struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
> +	struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;
> +
> +	dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
> +			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> +			 DMA_TO_DEVICE);
> +	dma_unmap_single(rx_dev, dma_data->dma_rx_addr,
> +			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> +			 DMA_FROM_DEVICE);
> +}
> +
> +static void spi_imx_dma_rx_data_handle(struct spi_imx_data *spi_imx,
> +				       struct dma_data_package *dma_data, void *rx_buf,
> +				       bool word_delay)
> +{
> +#ifdef __LITTLE_ENDIAN
> +	unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
> +	u32 *temp = dma_data->dma_rx_buf;

can you move this into the scope of the if() below?
> +#endif
> +	void *copy_ptr;
> +	int unaligned;
> +
> +#ifdef __LITTLE_ENDIAN
> +	/*
> +	 * On little-endian CPUs, adjust byte order:
> +	 * - Swap bytes when bpw = 8
> +	 * - Swap half-words when bpw = 16
> +	 * This ensures correct data ordering for DMA transfers.
> +	 */
> +	if (!word_delay) {
> +		for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {

sizeof(*temp) instead of BYTES_PER_32BITS_WORD?

> +			if (bytes_per_word == 1)
> +				swab32s(temp + i);
> +			else if (bytes_per_word == 2)
> +				swahw32s(temp + i);

Why do you do first a swap in place and then a memcpy()

> +		}
> +	}
> +#endif
> +
> +	if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {

I think this deserves a comment, why you do a re-alignment of the data here.

> +		unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
> +		copy_ptr = (u8 *)dma_data->dma_rx_buf + BYTES_PER_32BITS_WORD - unaligned;
> +	} else {
> +		copy_ptr = dma_data->dma_rx_buf;

Why do you use the bounce buffer if the data is aligned correct?

> +	}
> +
> +	memcpy(rx_buf, copy_ptr, dma_data->data_len);
> +}
> +
> +static int spi_imx_dma_map(struct spi_imx_data *spi_imx,
> +			   struct dma_data_package *dma_data)
> +{
> +	struct spi_controller *controller = spi_imx->controller;
> +	struct device *tx_dev = controller->dma_tx->device->dev;
> +	struct device *rx_dev = controller->dma_rx->device->dev;
> +
> +	dma_data->dma_tx_addr = dma_map_single(tx_dev, dma_data->dma_tx_buf,
> +					       DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> +					       DMA_TO_DEVICE);
> +	if (dma_mapping_error(tx_dev, dma_data->dma_tx_addr)) {
> +		dev_err(spi_imx->dev, "DMA TX map failed\n");
> +		return -EINVAL;

please propagate the error value of dma_mapping_error()

> +	}
> +
> +	dma_data->dma_rx_addr = dma_map_single(rx_dev, dma_data->dma_rx_buf,
> +					       DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> +					       DMA_FROM_DEVICE);
> +	if (dma_mapping_error(rx_dev, dma_data->dma_rx_addr)) {
> +		dev_err(spi_imx->dev, "DMA RX map failed\n");
> +		goto rx_map_err;

there's only one user of the dma_unmap_single(), so no need for the
goto.

> +	}
> +
> +	return 0;
> +
> +rx_map_err:
> +	dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
> +			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> +			 DMA_TO_DEVICE);
> +	return -EINVAL;
> +}
> +
> +static int spi_imx_dma_tx_data_handle(struct spi_imx_data *spi_imx,
> +				      struct dma_data_package *dma_data,
> +				      const void *tx_buf,
> +				      bool word_delay)
> +{
> +#ifdef __LITTLE_ENDIAN
> +	unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
> +	u32 *temp;
> +#endif

move into scope of if()

> +	void *copy_ptr;
> +	int unaligned;
> +
> +	if (word_delay) {
> +		dma_data->dma_len = dma_data->data_len;
> +	} else {
> +		/*
> +		 * As per the reference manual, when burst length = 32*n + m bits, ECSPI
> +		 * sends m LSB bits in the first word, followed by n full 32-bit words.
> +		 * Since actual data may not be 4-byte aligned, allocate DMA TX/RX buffers
> +		 * to ensure alignment. For TX, DMA pushes 4-byte aligned words to TXFIFO,
> +		 * while ECSPI uses BURST_LENGTH settings to maintain correct bit count.
> +		 * For RX, DMA receives 32-bit words from RXFIFO; after transfer completes,
> +		 * trim the DMA RX buffer and copy the actual data to rx_buf.
> +		 */

Ahh, please add the corresponding description for rx.

> +		dma_data->dma_len = ALIGN(dma_data->data_len, BYTES_PER_32BITS_WORD);
> +	}
> +
> +	dma_data->dma_tx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);

kzalloc()?

> +	if (!dma_data->dma_tx_buf)
> +		return -ENOMEM;
> +
> +	dma_data->dma_rx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);
> +	if (!dma_data->dma_rx_buf) {
> +		kfree(dma_data->dma_tx_buf);
> +		return -ENOMEM;
> +	}
> +
> +	if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
> +		unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
> +		copy_ptr = (u8 *)dma_data->dma_tx_buf + BYTES_PER_32BITS_WORD - unaligned;
> +	} else {
> +		copy_ptr = dma_data->dma_tx_buf;
> +	}
> +
> +	memcpy(copy_ptr, tx_buf, dma_data->data_len);
> +
> +	/*
> +	 * When word_delay is enabled, DMA transfers an entire word in one minor loop.
> +	 * In this case, no data requires additional handling.
> +	 */
> +	if (word_delay)
> +		return 0;
> +
> +#ifdef __LITTLE_ENDIAN
> +	/*
> +	 * On little-endian CPUs, adjust byte order:
> +	 * - Swap bytes when bpw = 8
> +	 * - Swap half-words when bpw = 16
> +	 * This ensures correct data ordering for DMA transfers.
> +	 */
> +	temp = dma_data->dma_tx_buf;
> +	for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
> +		if (bytes_per_word == 1)
> +			swab32s(temp + i);
> +		else if (bytes_per_word == 2)
> +			swahw32s(temp + i);
> +	}
> +#endif
> +
> +	return 0;
> +}
> +
> +static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
> +				    struct spi_transfer *transfer,
> +				    bool word_delay)
> +{
> +	u32 pre_bl, tail_bl;
> +	u32 ctrl;
> +	int ret;
> +
> +	/*
> +	 * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds 512
> +	 * and is not a multiple of 512, a tail transfer is required. In this case,
> +	 * an extra DMA request is issued for the remaining data.

Why do you need an extra transfer in this case?

> +	 */
> +	ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
> +	if (word_delay) {
> +		/*
> +		 * When SPI IMX need to support word delay, according to "Sample Period Control
> +		 * Register" shows, The Sample Period Control Register (ECSPI_PERIODREG)
> +		 * provides software a way to insert delays (wait states) between consecutive
> +		 * SPI transfers. As a result, ECSPI can only transfer one word per frame, and
> +		 * the delay occurs between frames.
> +		 */
> +		spi_imx->dma_package_num = 1;
> +		pre_bl = spi_imx->bits_per_word - 1;
> +	} else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
> +		spi_imx->dma_package_num = 1;
> +		pre_bl = transfer->len * BITS_PER_BYTE - 1;
> +	} else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
> +		spi_imx->dma_package_num = 1;
> +		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> +	} else {
> +		spi_imx->dma_package_num = 2;
> +		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> +		tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
> +	}
> +
> +	spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
> +					  sizeof(struct dma_data_package),
> +					  GFP_KERNEL | __GFP_ZERO);
> +	if (!spi_imx->dma_data) {
> +		dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
> +		return -ENOMEM;
> +	}
> +
> +	if (spi_imx->dma_package_num == 1) {
> +		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> +		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> +		spi_imx->dma_data[0].cmd_word = ctrl;
> +		spi_imx->dma_data[0].data_len = transfer->len;
> +		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
> +						 word_delay);
> +		if (ret) {
> +			kfree(spi_imx->dma_data);
> +			return ret;
> +		}
> +	} else {
> +		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> +		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> +		spi_imx->dma_data[0].cmd_word = ctrl;
> +		spi_imx->dma_data[0].data_len = DIV_ROUND_DOWN_ULL(transfer->len,
> +								   MX51_ECSPI_CTRL_MAX_BURST)
> +								   * MX51_ECSPI_CTRL_MAX_BURST;
> +		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
> +						 false);
> +		if (ret) {
> +			kfree(spi_imx->dma_data);
> +			return ret;
> +		}
> +
> +		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> +		ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> +		spi_imx->dma_data[1].cmd_word = ctrl;
> +		spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
> +		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
> +						 transfer->tx_buf + spi_imx->dma_data[0].data_len,
> +						 false);
> +		if (ret) {
> +			kfree(spi_imx->dma_data[0].dma_tx_buf);
> +			kfree(spi_imx->dma_data[0].dma_rx_buf);
> +			kfree(spi_imx->dma_data);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
> +			      struct dma_data_package *dma_data,
>  			      struct spi_transfer *transfer)
>  {
> -	struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
>  	struct spi_controller *controller = spi_imx->controller;
>  	struct dma_async_tx_descriptor *desc_tx, *desc_rx;
>  	unsigned long transfer_timeout;
> @@ -1451,9 +1716,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
>  	 * The TX DMA setup starts the transfer, so make sure RX is configured
>  	 * before TX.
>  	 */
> -	desc_rx = dmaengine_prep_slave_sg(controller->dma_rx,
> -					  rx->sgl, rx->nents, DMA_DEV_TO_MEM,
> -					  DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> +	desc_rx = dmaengine_prep_slave_single(controller->dma_rx, dma_data->dma_rx_addr,
> +					      dma_data->dma_len, DMA_DEV_TO_MEM,
> +					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
>  	if (!desc_rx) {
>  		transfer->error |= SPI_TRANS_FAIL_NO_START;
>  		return -EINVAL;
> @@ -1471,9 +1736,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
>  	reinit_completion(&spi_imx->dma_rx_completion);
>  	dma_async_issue_pending(controller->dma_rx);
>
> -	desc_tx = dmaengine_prep_slave_sg(controller->dma_tx,
> -					  tx->sgl, tx->nents, DMA_MEM_TO_DEV,
> -					  DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> +	desc_tx = dmaengine_prep_slave_single(controller->dma_tx, dma_data->dma_tx_addr,
> +					      dma_data->dma_len, DMA_MEM_TO_DEV,
> +					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
>  	if (!desc_tx)
>  		goto dmaengine_terminate_rx;
>
> @@ -1521,16 +1786,16 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
>  }
>
>  static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
> -				     struct spi_transfer *transfer)
> +				     struct dma_data_package *dma_data,
> +				     bool word_delay)
>  {
> -	struct sg_table *rx = &transfer->rx_sg;
> -	struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents);
> -	unsigned int bytes_per_word, i;
> +	unsigned int bytes_per_word = word_delay ?
> +				      spi_imx_bytes_per_word(spi_imx->bits_per_word) :
> +				      BYTES_PER_32BITS_WORD;
> +	unsigned int i;
>
> -	/* Get the right burst length from the last sg to ensure no tail data */
> -	bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word);
>  	for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) {
> -		if (!(sg_dma_len(last_sg) % (i * bytes_per_word)))
> +		if (!dma_data->dma_len % (i * bytes_per_word))
>  			break;
>  	}
>  	/* Use 1 as wml in case no available burst length got */
> @@ -1540,25 +1805,29 @@ static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
>  	spi_imx->wml = i;
>  }
>
> -static int spi_imx_dma_configure(struct spi_controller *controller)
> +static int spi_imx_dma_configure(struct spi_controller *controller, bool word_delay)
>  {
>  	int ret;
>  	enum dma_slave_buswidth buswidth;
>  	struct dma_slave_config rx = {}, tx = {};
>  	struct spi_imx_data *spi_imx = spi_controller_get_devdata(controller);
>
> -	switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
> -	case 4:
> +	if (word_delay) {
> +		switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
> +		case 4:
> +			buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
> +			break;
> +		case 2:
> +			buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
> +			break;
> +		case 1:
> +			buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	} else {
>  		buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
> -		break;
> -	case 2:
> -		buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
> -		break;
> -	case 1:
> -		buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
> -		break;
> -	default:
> -		return -EINVAL;
>  	}
>
>  	tx.direction = DMA_MEM_TO_DEV;
> @@ -1584,15 +1853,17 @@ static int spi_imx_dma_configure(struct spi_controller *controller)
>  	return 0;
>  }
>
> -static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> -				struct spi_transfer *transfer)
> +static int spi_imx_dma_package_transfer(struct spi_imx_data *spi_imx,
> +					struct dma_data_package *dma_data,
> +					struct spi_transfer *transfer,
> +					bool word_delay)
>  {
>  	struct spi_controller *controller = spi_imx->controller;
>  	int ret;
>
> -	spi_imx_dma_max_wml_find(spi_imx, transfer);
> +	spi_imx_dma_max_wml_find(spi_imx, dma_data, word_delay);
>
> -	ret = spi_imx_dma_configure(controller);
> +	ret = spi_imx_dma_configure(controller, word_delay);
>  	if (ret)
>  		goto dma_failure_no_start;
>
> @@ -1603,10 +1874,17 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
>  	}
>  	spi_imx->devtype_data->setup_wml(spi_imx);
>
> -	ret = spi_imx_dma_submit(spi_imx, transfer);
> +	ret = spi_imx_dma_submit(spi_imx, dma_data, transfer);
>  	if (ret)
>  		return ret;
>
> +	/* Trim the DMA RX buffer and copy the actual data to rx_buf */
> +	dma_sync_single_for_cpu(controller->dma_rx->device->dev, dma_data->dma_rx_addr,
> +				dma_data->dma_len, DMA_FROM_DEVICE);
> +	spi_imx_dma_rx_data_handle(spi_imx, dma_data, transfer->rx_buf + spi_imx->rx_offset,
> +				   word_delay);
> +	spi_imx->rx_offset += dma_data->data_len;
> +
>  	return 0;
>  /* fallback to pio */
>  dma_failure_no_start:
> @@ -1614,6 +1892,60 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
>  	return ret;
>  }
>
> +static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> +				struct spi_transfer *transfer)
> +{
> +	bool word_delay = transfer->word_delay.value != 0;
> +	int ret;
> +	int i;
> +
> +	ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
> +	if (ret < 0) {
> +		transfer->error |= SPI_TRANS_FAIL_NO_START;
> +		dev_err(spi_imx->dev, "DMA data prepare fail\n");
> +		goto fallback_pio;
> +	}
> +
> +	spi_imx->rx_offset = 0;
> +
> +	/* Each dma_package performs a separate DMA transfer once */
> +	for (i = 0; i < spi_imx->dma_package_num; i++) {
> +		ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
> +		if (ret < 0) {
> +			transfer->error |= SPI_TRANS_FAIL_NO_START;
> +			dev_err(spi_imx->dev, "DMA map fail\n");
> +			break;
> +		}
> +
> +		/* Update the CTRL register BL field */
> +		writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
> +
> +		ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
> +						   transfer, word_delay);
> +
> +		/* Whether the dma transmission is successful or not, dma unmap is necessary */
> +		spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
> +
> +		if (ret < 0) {
> +			dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
> +			break;
> +		}
> +	}
> +
> +	for (int j = 0; j < spi_imx->dma_package_num; j++) {
> +		kfree(spi_imx->dma_data[j].dma_tx_buf);
> +		kfree(spi_imx->dma_data[j].dma_rx_buf);
> +	}
> +	kfree(spi_imx->dma_data);
> +
> +fallback_pio:
> +	/* If no any dma package data is transferred, fallback to PIO mode transfer */
> +	if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
> +		transfer->error &= !SPI_TRANS_FAIL_NO_START;
> +
> +	return ret;
> +}
> +
>  static int spi_imx_pio_transfer(struct spi_device *spi,
>  				struct spi_transfer *transfer)
>  {
> @@ -1780,9 +2112,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
>  	 * transfer, the SPI transfer has already been mapped, so we
>  	 * have to do the DMA transfer here.
>  	 */
> -	if (spi_imx->usedma)
> -		return spi_imx_dma_transfer(spi_imx, transfer);
> -
> +	if (spi_imx->usedma) {
> +		ret = spi_imx_dma_transfer(spi_imx, transfer);
> +		if (transfer->error & SPI_TRANS_FAIL_NO_START) {
> +			spi_imx->usedma = false;
> +			return spi_imx_pio_transfer(spi, transfer);
> +		}
> +		return ret;
> +	}
>  	/* run in polling mode for short transfers */
>  	if (transfer->len == 1 || (polling_limit_us &&
>  				   spi_imx_transfer_estimate_time_us(transfer) < polling_limit_us))
> --
> 2.34.1
>
>
>

regards,
Marc

--
Pengutronix e.K.                 | Marc Kleine-Budde          |
Embedded Linux                   | https://www.pengutronix.de |
Vertretung Nürnberg              | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax:   +49-5121-206917-9   |

[PATCH 1/6] spi: imx: group spi_imx_dma_configure() with spi_imx_dma_transfer()
[PATCH 2/6] spi: imx: introduce helper to clear DMA mode logic
[PATCH 3/6] spi: imx: avoid dmaengine_terminate_all() on TX prep failure
[PATCH 4/6] spi: imx: handle DMA submission errors with dma_submit_error()
[PATCH 5/6] spi: imx: support dynamic burst length for ECSPI DMA mode
[PATCH 6/6] spi: imx: enable DMA mode for target operation