ECSPI transfers only one word per frame in DMA mode, causing SCLK stalls
between words due to BURST_LENGTH updates, which significantly impacts
performance.
To improve throughput, configure BURST_LENGTH as large as possible (up to
512 bytes per frame) instead of word length. This avoids delays between
words. When transfer length is not 4-byte aligned, use bounce buffers to
align data for DMA. TX uses aligned words for TXFIFO, while RX trims DMA
buffer data after transfer completion.
Introduce a new dma_package structure to store:
1. BURST_LENGTH values for each DMA request
2. Variables for DMA submission
3. DMA transmission length and actual data length
Handle three cases:
- len <= 512 bytes: one package, BURST_LENGTH = len * 8 - 1
- len > 512 and aligned: one package, BURST_LENGTH = max (512 bytes)
- len > 512 and unaligned: two packages, second for tail data
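For example, len = 1000 bytes falls into the third case: package 0
carries 512 bytes with BURST_LENGTH = 512 * 8 - 1 = 4095, and package 1
carries the 488-byte tail with BURST_LENGTH = 488 * 8 - 1 = 3903.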
Performance test (spidev_test @10MHz, 4KB):
Before: tx/rx ~6651.9 kbps
After: tx/rx ~9922.2 kbps (~50% improvement)
For compatibility with slow SPI devices, add configurable word delay in
DMA mode. When word delay is set, dynamic burst is disabled and
BURST_LENGTH equals word length.
Signed-off-by: Carlos Song <carlos.song@nxp.com>
---
drivers/spi/spi-imx.c | 409 ++++++++++++++++++++++++++++++++++++++----
1 file changed, 373 insertions(+), 36 deletions(-)
diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index 42f64d9535c9..f02a47fbba8a 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -60,6 +60,7 @@ MODULE_PARM_DESC(polling_limit_us,
#define MX51_ECSPI_CTRL_MAX_BURST 512
/* The maximum bytes that IMX53_ECSPI can transfer in target mode.*/
#define MX53_MAX_TRANSFER_BYTES 512
+#define BYTES_PER_32BITS_WORD 4
enum spi_imx_devtype {
IMX1_CSPI,
@@ -95,6 +96,16 @@ struct spi_imx_devtype_data {
enum spi_imx_devtype devtype;
};
+struct dma_data_package {
+ u32 cmd_word;
+ void *dma_rx_buf;
+ void *dma_tx_buf;
+ dma_addr_t dma_tx_addr;
+ dma_addr_t dma_rx_addr;
+ int dma_len;
+ int data_len;
+};
+
struct spi_imx_data {
struct spi_controller *controller;
struct device *dev;
@@ -130,6 +141,9 @@ struct spi_imx_data {
u32 wml;
struct completion dma_rx_completion;
struct completion dma_tx_completion;
+ struct dma_data_package *dma_data;
+ int dma_package_num;
+ int rx_offset;
const struct spi_imx_devtype_data *devtype_data;
};
@@ -189,6 +203,9 @@ MXC_SPI_BUF_TX(u16)
MXC_SPI_BUF_RX(u32)
MXC_SPI_BUF_TX(u32)
+/* Align to cache line to avoid swiotlb bounce */
+#define DMA_CACHE_ALIGNED_LEN(x) ALIGN((x), dma_get_cache_alignment())
+
/* First entry is reserved, second entry is valid only if SDHC_SPIEN is set
* (which is currently not the case in this driver)
*/
@@ -253,6 +270,14 @@ static bool spi_imx_can_dma(struct spi_controller *controller, struct spi_device
if (transfer->len < spi_imx->devtype_data->fifo_size)
return false;
+ /* DMA can only transmit data in bytes */
+ if (spi_imx->bits_per_word != 8 && spi_imx->bits_per_word != 16 &&
+ spi_imx->bits_per_word != 32)
+ return false;
+
+ if (transfer->len >= MAX_SDMA_BD_BYTES)
+ return false;
+
spi_imx->dynamic_burst = 0;
return true;
@@ -1398,8 +1423,6 @@ static int spi_imx_sdma_init(struct device *dev, struct spi_imx_data *spi_imx,
init_completion(&spi_imx->dma_rx_completion);
init_completion(&spi_imx->dma_tx_completion);
- controller->can_dma = spi_imx_can_dma;
- controller->max_dma_len = MAX_SDMA_BD_BYTES;
spi_imx->controller->flags = SPI_CONTROLLER_MUST_RX |
SPI_CONTROLLER_MUST_TX;
@@ -1437,10 +1460,252 @@ static int spi_imx_calculate_timeout(struct spi_imx_data *spi_imx, int size)
return secs_to_jiffies(2 * timeout);
}
+static void spi_imx_dma_unmap(struct spi_imx_data *spi_imx,
+ struct dma_data_package *dma_data)
+{
+ struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
+ struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;
+
+ dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
+ DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+ DMA_TO_DEVICE);
+ dma_unmap_single(rx_dev, dma_data->dma_rx_addr,
+ DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+ DMA_FROM_DEVICE);
+}
+
+static void spi_imx_dma_rx_data_handle(struct spi_imx_data *spi_imx,
+ struct dma_data_package *dma_data, void *rx_buf,
+ bool word_delay)
+{
+#ifdef __LITTLE_ENDIAN
+ unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
+ u32 *temp = dma_data->dma_rx_buf;
+#endif
+ void *copy_ptr;
+ int unaligned;
+
+#ifdef __LITTLE_ENDIAN
+ /*
+ * On little-endian CPUs, adjust byte order:
+ * - Swap bytes when bpw = 8
+ * - Swap half-words when bpw = 16
+ * This ensures correct data ordering for DMA transfers.
+ */
+ if (!word_delay) {
+ for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
+ if (bytes_per_word == 1)
+ swab32s(temp + i);
+ else if (bytes_per_word == 2)
+ swahw32s(temp + i);
+ }
+ }
+#endif
+
+ if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
+ unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
+ copy_ptr = (u8 *)dma_data->dma_rx_buf + BYTES_PER_32BITS_WORD - unaligned;
+ } else {
+ copy_ptr = dma_data->dma_rx_buf;
+ }
+
+ memcpy(rx_buf, copy_ptr, dma_data->data_len);
+}
+
+static int spi_imx_dma_map(struct spi_imx_data *spi_imx,
+ struct dma_data_package *dma_data)
+{
+ struct spi_controller *controller = spi_imx->controller;
+ struct device *tx_dev = controller->dma_tx->device->dev;
+ struct device *rx_dev = controller->dma_rx->device->dev;
+
+ dma_data->dma_tx_addr = dma_map_single(tx_dev, dma_data->dma_tx_buf,
+ DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(tx_dev, dma_data->dma_tx_addr)) {
+ dev_err(spi_imx->dev, "DMA TX map failed\n");
+ return -EINVAL;
+ }
+
+ dma_data->dma_rx_addr = dma_map_single(rx_dev, dma_data->dma_rx_buf,
+ DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+ DMA_FROM_DEVICE);
+ if (dma_mapping_error(rx_dev, dma_data->dma_rx_addr)) {
+ dev_err(spi_imx->dev, "DMA RX map failed\n");
+ goto rx_map_err;
+ }
+
+ return 0;
+
+rx_map_err:
+ dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
+ DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+ DMA_TO_DEVICE);
+ return -EINVAL;
+}
+
+static int spi_imx_dma_tx_data_handle(struct spi_imx_data *spi_imx,
+ struct dma_data_package *dma_data,
+ const void *tx_buf,
+ bool word_delay)
+{
+#ifdef __LITTLE_ENDIAN
+ unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
+ u32 *temp;
+#endif
+ void *copy_ptr;
+ int unaligned;
+
+ if (word_delay) {
+ dma_data->dma_len = dma_data->data_len;
+ } else {
+ /*
+ * As per the reference manual, when burst length = 32*n + m bits, ECSPI
+ * sends m LSB bits in the first word, followed by n full 32-bit words.
+ * Since actual data may not be 4-byte aligned, allocate DMA TX/RX buffers
+ * to ensure alignment. For TX, DMA pushes 4-byte aligned words to TXFIFO,
+ * while ECSPI uses BURST_LENGTH settings to maintain correct bit count.
+ * For RX, DMA receives 32-bit words from RXFIFO; after transfer completes,
+ * trim the DMA RX buffer and copy the actual data to rx_buf.
+ */
+ dma_data->dma_len = ALIGN(dma_data->data_len, BYTES_PER_32BITS_WORD);
+ }
+
+ dma_data->dma_tx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);
+ if (!dma_data->dma_tx_buf)
+ return -ENOMEM;
+
+ dma_data->dma_rx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);
+ if (!dma_data->dma_rx_buf) {
+ kfree(dma_data->dma_tx_buf);
+ return -ENOMEM;
+ }
+
+ if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
+ unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
+ copy_ptr = (u8 *)dma_data->dma_tx_buf + BYTES_PER_32BITS_WORD - unaligned;
+ } else {
+ copy_ptr = dma_data->dma_tx_buf;
+ }
+
+ memcpy(copy_ptr, tx_buf, dma_data->data_len);
+
+ /*
+ * When word_delay is enabled, DMA transfers an entire word in one minor loop.
+ * In this case, no data requires additional handling.
+ */
+ if (word_delay)
+ return 0;
+
+#ifdef __LITTLE_ENDIAN
+ /*
+ * On little-endian CPUs, adjust byte order:
+ * - Swap bytes when bpw = 8
+ * - Swap half-words when bpw = 16
+ * This ensures correct data ordering for DMA transfers.
+ */
+ temp = dma_data->dma_tx_buf;
+ for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
+ if (bytes_per_word == 1)
+ swab32s(temp + i);
+ else if (bytes_per_word == 2)
+ swahw32s(temp + i);
+ }
+#endif
+
+ return 0;
+}
+
+static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
+ struct spi_transfer *transfer,
+ bool word_delay)
+{
+ u32 pre_bl, tail_bl;
+ u32 ctrl;
+ int ret;
+
+ /*
+ * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds 512
+ * and is not a multiple of 512, a tail transfer is required. In this case,
+ * an extra DMA request is issued for the remaining data.
+ */
+ ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
+ if (word_delay) {
+ /*
+ * When word delay must be supported, the "Sample Period Control
+ * Register" (ECSPI_PERIODREG) description in the reference manual applies:
+ * the register provides software a way to insert delays (wait states)
+ * between consecutive SPI transfers. As a result, ECSPI can only transfer
+ * one word per frame, and the delay occurs between frames.
+ */
+ spi_imx->dma_package_num = 1;
+ pre_bl = spi_imx->bits_per_word - 1;
+ } else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
+ spi_imx->dma_package_num = 1;
+ pre_bl = transfer->len * BITS_PER_BYTE - 1;
+ } else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
+ spi_imx->dma_package_num = 1;
+ pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
+ } else {
+ spi_imx->dma_package_num = 2;
+ pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
+ tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
+ }
+
+ spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
+ sizeof(struct dma_data_package),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!spi_imx->dma_data) {
+ dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
+ return -ENOMEM;
+ }
+
+ if (spi_imx->dma_package_num == 1) {
+ ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+ ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+ spi_imx->dma_data[0].cmd_word = ctrl;
+ spi_imx->dma_data[0].data_len = transfer->len;
+ ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
+ word_delay);
+ if (ret) {
+ kfree(spi_imx->dma_data);
+ return ret;
+ }
+ } else {
+ ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+ ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+ spi_imx->dma_data[0].cmd_word = ctrl;
+ spi_imx->dma_data[0].data_len = DIV_ROUND_DOWN_ULL(transfer->len,
+ MX51_ECSPI_CTRL_MAX_BURST)
+ * MX51_ECSPI_CTRL_MAX_BURST;
+ ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
+ false);
+ if (ret) {
+ kfree(spi_imx->dma_data);
+ return ret;
+ }
+
+ ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+ ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+ spi_imx->dma_data[1].cmd_word = ctrl;
+ spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
+ ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
+ transfer->tx_buf + spi_imx->dma_data[0].data_len,
+ false);
+ if (ret) {
+ kfree(spi_imx->dma_data[0].dma_tx_buf);
+ kfree(spi_imx->dma_data[0].dma_rx_buf);
+ kfree(spi_imx->dma_data);
+ }
+ }
+
+ return 0;
+}
+
static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
+ struct dma_data_package *dma_data,
struct spi_transfer *transfer)
{
- struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
struct spi_controller *controller = spi_imx->controller;
struct dma_async_tx_descriptor *desc_tx, *desc_rx;
unsigned long transfer_timeout;
@@ -1451,9 +1716,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
* The TX DMA setup starts the transfer, so make sure RX is configured
* before TX.
*/
- desc_rx = dmaengine_prep_slave_sg(controller->dma_rx,
- rx->sgl, rx->nents, DMA_DEV_TO_MEM,
- DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+ desc_rx = dmaengine_prep_slave_single(controller->dma_rx, dma_data->dma_rx_addr,
+ dma_data->dma_len, DMA_DEV_TO_MEM,
+ DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
if (!desc_rx) {
transfer->error |= SPI_TRANS_FAIL_NO_START;
return -EINVAL;
@@ -1471,9 +1736,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
reinit_completion(&spi_imx->dma_rx_completion);
dma_async_issue_pending(controller->dma_rx);
- desc_tx = dmaengine_prep_slave_sg(controller->dma_tx,
- tx->sgl, tx->nents, DMA_MEM_TO_DEV,
- DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+ desc_tx = dmaengine_prep_slave_single(controller->dma_tx, dma_data->dma_tx_addr,
+ dma_data->dma_len, DMA_MEM_TO_DEV,
+ DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
if (!desc_tx)
goto dmaengine_terminate_rx;
@@ -1521,16 +1786,16 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
}
static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
- struct spi_transfer *transfer)
+ struct dma_data_package *dma_data,
+ bool word_delay)
{
- struct sg_table *rx = &transfer->rx_sg;
- struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents);
- unsigned int bytes_per_word, i;
+ unsigned int bytes_per_word = word_delay ?
+ spi_imx_bytes_per_word(spi_imx->bits_per_word) :
+ BYTES_PER_32BITS_WORD;
+ unsigned int i;
- /* Get the right burst length from the last sg to ensure no tail data */
- bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word);
for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) {
- if (!(sg_dma_len(last_sg) % (i * bytes_per_word)))
+ if (!(dma_data->dma_len % (i * bytes_per_word)))
break;
}
/* Use 1 as wml in case no available burst length got */
@@ -1540,25 +1805,29 @@ static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
spi_imx->wml = i;
}
-static int spi_imx_dma_configure(struct spi_controller *controller)
+static int spi_imx_dma_configure(struct spi_controller *controller, bool word_delay)
{
int ret;
enum dma_slave_buswidth buswidth;
struct dma_slave_config rx = {}, tx = {};
struct spi_imx_data *spi_imx = spi_controller_get_devdata(controller);
- switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
- case 4:
+ if (word_delay) {
+ switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
+ case 4:
+ buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
+ break;
+ case 2:
+ buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
+ break;
+ case 1:
+ buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
+ break;
+ default:
+ return -EINVAL;
+ }
+ } else {
buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
- break;
- case 2:
- buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
- break;
- case 1:
- buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
- break;
- default:
- return -EINVAL;
}
tx.direction = DMA_MEM_TO_DEV;
@@ -1584,15 +1853,17 @@ static int spi_imx_dma_configure(struct spi_controller *controller)
return 0;
}
-static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
- struct spi_transfer *transfer)
+static int spi_imx_dma_package_transfer(struct spi_imx_data *spi_imx,
+ struct dma_data_package *dma_data,
+ struct spi_transfer *transfer,
+ bool word_delay)
{
struct spi_controller *controller = spi_imx->controller;
int ret;
- spi_imx_dma_max_wml_find(spi_imx, transfer);
+ spi_imx_dma_max_wml_find(spi_imx, dma_data, word_delay);
- ret = spi_imx_dma_configure(controller);
+ ret = spi_imx_dma_configure(controller, word_delay);
if (ret)
goto dma_failure_no_start;
@@ -1603,10 +1874,17 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
}
spi_imx->devtype_data->setup_wml(spi_imx);
- ret = spi_imx_dma_submit(spi_imx, transfer);
+ ret = spi_imx_dma_submit(spi_imx, dma_data, transfer);
if (ret)
return ret;
+ /* Trim the DMA RX buffer and copy the actual data to rx_buf */
+ dma_sync_single_for_cpu(controller->dma_rx->device->dev, dma_data->dma_rx_addr,
+ dma_data->dma_len, DMA_FROM_DEVICE);
+ spi_imx_dma_rx_data_handle(spi_imx, dma_data, transfer->rx_buf + spi_imx->rx_offset,
+ word_delay);
+ spi_imx->rx_offset += dma_data->data_len;
+
return 0;
/* fallback to pio */
dma_failure_no_start:
@@ -1614,6 +1892,60 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
return ret;
}
+static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
+ struct spi_transfer *transfer)
+{
+ bool word_delay = transfer->word_delay.value != 0;
+ int ret;
+ int i;
+
+ ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
+ if (ret < 0) {
+ transfer->error |= SPI_TRANS_FAIL_NO_START;
+ dev_err(spi_imx->dev, "DMA data prepare fail\n");
+ goto fallback_pio;
+ }
+
+ spi_imx->rx_offset = 0;
+
+ /* Each dma_package performs a separate DMA transfer once */
+ for (i = 0; i < spi_imx->dma_package_num; i++) {
+ ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
+ if (ret < 0) {
+ transfer->error |= SPI_TRANS_FAIL_NO_START;
+ dev_err(spi_imx->dev, "DMA map fail\n");
+ break;
+ }
+
+ /* Update the CTRL register BL field */
+ writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
+
+ ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
+ transfer, word_delay);
+
+ /* Whether the dma transmission is successful or not, dma unmap is necessary */
+ spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
+
+ if (ret < 0) {
+ dev_dbg(spi_imx->dev, "DMA %d transfer did not really finish\n", i);
+ break;
+ }
+ }
+
+ for (int j = 0; j < spi_imx->dma_package_num; j++) {
+ kfree(spi_imx->dma_data[j].dma_tx_buf);
+ kfree(spi_imx->dma_data[j].dma_rx_buf);
+ }
+ kfree(spi_imx->dma_data);
+
+fallback_pio:
+ /* If no dma package data was transferred at all, fall back to PIO mode */
+ if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
+ transfer->error &= !SPI_TRANS_FAIL_NO_START;
+
+ return ret;
+}
+
static int spi_imx_pio_transfer(struct spi_device *spi,
struct spi_transfer *transfer)
{
@@ -1780,9 +2112,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
* transfer, the SPI transfer has already been mapped, so we
* have to do the DMA transfer here.
*/
- if (spi_imx->usedma)
- return spi_imx_dma_transfer(spi_imx, transfer);
-
+ if (spi_imx->usedma) {
+ ret = spi_imx_dma_transfer(spi_imx, transfer);
+ if (transfer->error & SPI_TRANS_FAIL_NO_START) {
+ spi_imx->usedma = false;
+ return spi_imx_pio_transfer(spi, transfer);
+ }
+ return ret;
+ }
/* run in polling mode for short transfers */
if (transfer->len == 1 || (polling_limit_us &&
spi_imx_transfer_estimate_time_us(transfer) < polling_limit_us))
--
2.34.1
On 25.11.2025 18:06:17, Carlos Song wrote:
> static int spi_imx_pio_transfer(struct spi_device *spi,
> struct spi_transfer *transfer)
> {
> @@ -1780,9 +2112,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
> * transfer, the SPI transfer has already been mapped, so we
> * have to do the DMA transfer here.
> */
> - if (spi_imx->usedma)
> - return spi_imx_dma_transfer(spi_imx, transfer);
> -
> + if (spi_imx->usedma) {
> + ret = spi_imx_dma_transfer(spi_imx, transfer);
> + if (transfer->error & SPI_TRANS_FAIL_NO_START) {
> + spi_imx->usedma = false;
> + return spi_imx_pio_transfer(spi, transfer);
> + }
> + return ret;
> + }
Why do you do this? AFAICS the framework already does this for you, see
spi_transfer_one_message().
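For reference, the retry logic there looks roughly like this
(paraphrased from drivers/spi/spi.c, details vary by kernel version):

	fallback_pio:
		ret = ctlr->transfer_one(ctlr, msg->spi, xfer);
		if (ret < 0) {
			if (ctlr->cur_msg_mapped &&
			    (xfer->error & SPI_TRANS_FAIL_NO_START)) {
				__spi_unmap_msg(ctlr, msg);
				ctlr->fallback = true;
				xfer->error &= ~SPI_TRANS_FAIL_NO_START;
				goto fallback_pio;
			}
		}

so setting SPI_TRANS_FAIL_NO_START and returning an error should be
enough.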
Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Embedded Linux | https://www.pengutronix.de |
Vertretung Nürnberg | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-9 |
On 25.11.2025 18:06:17, Carlos Song wrote:
> +static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
> + struct spi_transfer *transfer,
> + bool word_delay)
> +{
> + u32 pre_bl, tail_bl;
> + u32 ctrl;
> + int ret;
> +
> + /*
> + * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds 512
> + * and is not a multiple of 512, a tail transfer is required. In this case,
> + * an extra DMA request is issued for the remaining data.
> + */
> + ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
> + if (word_delay) {
> + /*
> + * When SPI IMX need to support word delay, according to "Sample Period Control
> + * Register" shows, The Sample Period Control Register (ECSPI_PERIODREG)
> + * provides software a way to insert delays (wait states) between consecutive
> + * SPI transfers. As a result, ECSPI can only transfer one word per frame, and
> + * the delay occurs between frames.
> + */
> + spi_imx->dma_package_num = 1;
> + pre_bl = spi_imx->bits_per_word - 1;
> + } else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
> + spi_imx->dma_package_num = 1;
> + pre_bl = transfer->len * BITS_PER_BYTE - 1;
> + } else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
> + spi_imx->dma_package_num = 1;
> + pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> + } else {
> + spi_imx->dma_package_num = 2;
> + pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> + tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
> + }
> +
> + spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
> + sizeof(struct dma_data_package),
> + GFP_KERNEL | __GFP_ZERO);
> + if (!spi_imx->dma_data) {
> + dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
> + return -ENOMEM;
> + }
> +
> + if (spi_imx->dma_package_num == 1) {
> + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> + ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> + spi_imx->dma_data[0].cmd_word = ctrl;
> + spi_imx->dma_data[0].data_len = transfer->len;
> + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
> + word_delay);
> + if (ret) {
> + kfree(spi_imx->dma_data);
> + return ret;
> + }
> + } else {
> + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> + ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> + spi_imx->dma_data[0].cmd_word = ctrl;
> + spi_imx->dma_data[0].data_len = DIV_ROUND_DOWN_ULL(transfer->len,
> + MX51_ECSPI_CTRL_MAX_BURST)
> + * MX51_ECSPI_CTRL_MAX_BURST;
What mathematical operation do you want to do? Why do you use a 64 bit
div? What about round_down()?
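Since MX51_ECSPI_CTRL_MAX_BURST (512) is a power of two, something like

	spi_imx->dma_data[0].data_len = round_down(transfer->len,
						   MX51_ECSPI_CTRL_MAX_BURST);

(round_down() from linux/math.h) would avoid the 64 bit division
entirely.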
> + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
> + false);
> + if (ret) {
> + kfree(spi_imx->dma_data);
> + return ret;
> + }
> +
> + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> + ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> + spi_imx->dma_data[1].cmd_word = ctrl;
> + spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
> + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
> + transfer->tx_buf + spi_imx->dma_data[0].data_len,
> + false);
> + if (ret) {
> + kfree(spi_imx->dma_data[0].dma_tx_buf);
> + kfree(spi_imx->dma_data[0].dma_rx_buf);
> + kfree(spi_imx->dma_data);
> + }
> + }
> +
> + return 0;
> +}
regards,
Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Embedded Linux | https://www.pengutronix.de |
Vertretung Nürnberg | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-9 |
On 25.11.2025 18:06:17, Carlos Song wrote:
> +static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> + struct spi_transfer *transfer)
> +{
> + bool word_delay = transfer->word_delay.value != 0;
> + int ret;
> + int i;
> +
> + ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
> + if (ret < 0) {
> + transfer->error |= SPI_TRANS_FAIL_NO_START;
> + dev_err(spi_imx->dev, "DMA data prepare fail\n");
> + goto fallback_pio;
> + }
> +
> + spi_imx->rx_offset = 0;
> +
> + /* Each dma_package performs a separate DMA transfer once */
> + for (i = 0; i < spi_imx->dma_package_num; i++) {
> + ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
> + if (ret < 0) {
> + transfer->error |= SPI_TRANS_FAIL_NO_START;
What about:
if (i == 0)
transfer->error |= SPI_TRANS_FAIL_NO_START;
instead of removing it later?
> + dev_err(spi_imx->dev, "DMA map fail\n");
> + break;
> + }
> +
> + /* Update the CTRL register BL field */
> + writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
> +
> + ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
> + transfer, word_delay);
> +
> + /* Whether the dma transmission is successful or not, dma unmap is necessary */
> + spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
> +
> + if (ret < 0) {
> + dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
> + break;
> + }
> + }
> +
> + for (int j = 0; j < spi_imx->dma_package_num; j++) {
> + kfree(spi_imx->dma_data[j].dma_tx_buf);
> + kfree(spi_imx->dma_data[j].dma_rx_buf);
> + }
> + kfree(spi_imx->dma_data);
> +
> +fallback_pio:
> + /* If no any dma package data is transferred, fallback to PIO mode transfer */
> + if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
> + transfer->error &= !SPI_TRANS_FAIL_NO_START;
> +
> + return ret;
> +}
> +
Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Embedded Linux | https://www.pengutronix.de |
Vertretung Nürnberg | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-9 |
On 25.11.2025 18:06:17, Carlos Song wrote:
> +static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> + struct spi_transfer *transfer)
> +{
> + bool word_delay = transfer->word_delay.value != 0;
> + int ret;
> + int i;
> +
> + ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
> + if (ret < 0) {
> + transfer->error |= SPI_TRANS_FAIL_NO_START;
> + dev_err(spi_imx->dev, "DMA data prepare fail\n");
> + goto fallback_pio;
> + }
> +
> + spi_imx->rx_offset = 0;
> +
> + /* Each dma_package performs a separate DMA transfer once */
> + for (i = 0; i < spi_imx->dma_package_num; i++) {
> + ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
> + if (ret < 0) {
> + transfer->error |= SPI_TRANS_FAIL_NO_START;
> + dev_err(spi_imx->dev, "DMA map fail\n");
> + break;
> + }
> +
> + /* Update the CTRL register BL field */
> + writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
> +
> + ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
> + transfer, word_delay);
> +
> + /* Whether the dma transmission is successful or not, dma unmap is necessary */
> + spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
> +
> + if (ret < 0) {
> + dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
> + break;
> + }
> + }
> +
> + for (int j = 0; j < spi_imx->dma_package_num; j++) {
> + kfree(spi_imx->dma_data[j].dma_tx_buf);
> + kfree(spi_imx->dma_data[j].dma_rx_buf);
> + }
> + kfree(spi_imx->dma_data);
> +
> +fallback_pio:
> + /* If no any dma package data is transferred, fallback to PIO mode transfer */
> + if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
> + transfer->error &= !SPI_TRANS_FAIL_NO_START;
^
this doesn't look correct, you probably want to use a "~", right?
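SPI_TRANS_FAIL_NO_START is a bit flag, so !SPI_TRANS_FAIL_NO_START
evaluates to 0 and the &= wipes all error bits, not just this one:

	transfer->error &= ~SPI_TRANS_FAIL_NO_START;

clears only the intended flag.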
Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Embedded Linux | https://www.pengutronix.de |
Vertretung Nürnberg | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-9 |
On 25.11.2025 18:06:17, Carlos Song wrote:
> ECSPI transfers only one word per frame in DMA mode, causing SCLK stalls
> between words due to BURST_LENGTH updates, which significantly impacts
> performance.
>
> To improve throughput, configure BURST_LENGTH as large as possible (up to
> 512 bytes per frame) instead of word length. This avoids delays between
> words. When transfer length is not 4-byte aligned, use bounce buffers to
> align data for DMA. TX uses aligned words for TXFIFO, while RX trims DMA
> buffer data after transfer completion.
>
> Introduce a new dma_package structure to store:
> 1. BURST_LENGTH values for each DMA request
> 2. Variables for DMA submission
> 3. DMA transmission length and actual data length
>
> Handle three cases:
> - len <= 512 bytes: one package, BURST_LENGTH = len * 8 - 1
> - len > 512 and aligned: one package, BURST_LENGTH = max (512 bytes)
> - len > 512 and unaligned: two packages, second for tail data
>
> Performance test (spidev_test @10MHz, 4KB):
> Before: tx/rx ~6651.9 kbps
> After: tx/rx ~9922.2 kbps (~50% improvement)
>
> For compatibility with slow SPI devices, add configurable word delay in
> DMA mode. When word delay is set, dynamic burst is disabled and
> BURST_LENGTH equals word length.
>
> Signed-off-by: Carlos Song <carlos.song@nxp.com>
> ---
> drivers/spi/spi-imx.c | 409 ++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 373 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
> index 42f64d9535c9..f02a47fbba8a 100644
> --- a/drivers/spi/spi-imx.c
> +++ b/drivers/spi/spi-imx.c
> @@ -60,6 +60,7 @@ MODULE_PARM_DESC(polling_limit_us,
> #define MX51_ECSPI_CTRL_MAX_BURST 512
> /* The maximum bytes that IMX53_ECSPI can transfer in target mode.*/
> #define MX53_MAX_TRANSFER_BYTES 512
> +#define BYTES_PER_32BITS_WORD 4
>
> enum spi_imx_devtype {
> IMX1_CSPI,
> @@ -95,6 +96,16 @@ struct spi_imx_devtype_data {
> enum spi_imx_devtype devtype;
> };
>
> +struct dma_data_package {
> + u32 cmd_word;
> + void *dma_rx_buf;
> + void *dma_tx_buf;
> + dma_addr_t dma_tx_addr;
> + dma_addr_t dma_rx_addr;
Better use uniform indentation: here one space, not one tab.
> + int dma_len;
> + int data_len;
> +};
> +
> struct spi_imx_data {
> struct spi_controller *controller;
> struct device *dev;
> @@ -130,6 +141,9 @@ struct spi_imx_data {
> u32 wml;
> struct completion dma_rx_completion;
> struct completion dma_tx_completion;
> + struct dma_data_package *dma_data;
please add __counted_by(dma_package_num)
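i.e. presumably something like

	struct dma_data_package *dma_data __counted_by(dma_package_num);

(assuming a toolchain where counted_by works on pointer members, not
only on flexible array members).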
> + int dma_package_num;
> + int rx_offset;
>
> const struct spi_imx_devtype_data *devtype_data;
> };
> @@ -189,6 +203,9 @@ MXC_SPI_BUF_TX(u16)
> MXC_SPI_BUF_RX(u32)
> MXC_SPI_BUF_TX(u32)
>
> +/* Align to cache line to avoid swiotlo bounce */
> +#define DMA_CACHE_ALIGNED_LEN(x) ALIGN((x), dma_get_cache_alignment())
> +
> /* First entry is reserved, second entry is valid only if SDHC_SPIEN is set
> * (which is currently not the case in this driver)
> */
> @@ -253,6 +270,14 @@ static bool spi_imx_can_dma(struct spi_controller *controller, struct spi_device
> if (transfer->len < spi_imx->devtype_data->fifo_size)
> return false;
>
> + /* DMA only can transmit data in bytes */
> + if (spi_imx->bits_per_word != 8 && spi_imx->bits_per_word != 16 &&
> + spi_imx->bits_per_word != 32)
> + return false;
> +
> + if (transfer->len >= MAX_SDMA_BD_BYTES)
> + return false;
> +
> spi_imx->dynamic_burst = 0;
>
> return true;
> @@ -1398,8 +1423,6 @@ static int spi_imx_sdma_init(struct device *dev, struct spi_imx_data *spi_imx,
>
> init_completion(&spi_imx->dma_rx_completion);
> init_completion(&spi_imx->dma_tx_completion);
> - controller->can_dma = spi_imx_can_dma;
> - controller->max_dma_len = MAX_SDMA_BD_BYTES;
> spi_imx->controller->flags = SPI_CONTROLLER_MUST_RX |
> SPI_CONTROLLER_MUST_TX;
>
> @@ -1437,10 +1460,252 @@ static int spi_imx_calculate_timeout(struct spi_imx_data *spi_imx, int size)
> return secs_to_jiffies(2 * timeout);
> }
>
> +static void spi_imx_dma_unmap(struct spi_imx_data *spi_imx,
> + struct dma_data_package *dma_data)
> +{
> + struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
> + struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;
> +
> + dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
> + DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> + DMA_TO_DEVICE);
> + dma_unmap_single(rx_dev, dma_data->dma_rx_addr,
> + DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> + DMA_FROM_DEVICE);
> +}
> +
> +static void spi_imx_dma_rx_data_handle(struct spi_imx_data *spi_imx,
> + struct dma_data_package *dma_data, void *rx_buf,
> + bool word_delay)
> +{
> +#ifdef __LITTLE_ENDIAN
> + unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
> + u32 *temp = dma_data->dma_rx_buf;
can you move this into the scope of the if() below?
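i.e. something like

	if (!word_delay) {
		u32 *temp = dma_data->dma_rx_buf;

		for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
			...
		}
	}

which also shrinks the #ifdef'ed declaration block at the top.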
> +#endif
> + void *copy_ptr;
> + int unaligned;
> +
> +#ifdef __LITTLE_ENDIAN
> + /*
> + * On little-endian CPUs, adjust byte order:
> + * - Swap bytes when bpw = 8
> + * - Swap half-words when bpw = 16
> + * This ensures correct data ordering for DMA transfers.
> + */
> + if (!word_delay) {
> + for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
sizeof(*temp) instead of BYTES_PER_32BITS_WORD?
> + if (bytes_per_word == 1)
> + swab32s(temp + i);
> + else if (bytes_per_word == 2)
> + swahw32s(temp + i);
Why do you first do an in-place swap and then a memcpy()?
> + }
> + }
> +#endif
> +
> + if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
I think this deserves a comment explaining why you re-align the data here.
> + unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
> + copy_ptr = (u8 *)dma_data->dma_rx_buf + BYTES_PER_32BITS_WORD - unaligned;
> + } else {
> + copy_ptr = dma_data->dma_rx_buf;
Why do you use the bounce buffer if the data is correctly aligned?
> + }
> +
> + memcpy(rx_buf, copy_ptr, dma_data->data_len);
> +}
> +
> +static int spi_imx_dma_map(struct spi_imx_data *spi_imx,
> + struct dma_data_package *dma_data)
> +{
> + struct spi_controller *controller = spi_imx->controller;
> + struct device *tx_dev = controller->dma_tx->device->dev;
> + struct device *rx_dev = controller->dma_rx->device->dev;
> +
> + dma_data->dma_tx_addr = dma_map_single(tx_dev, dma_data->dma_tx_buf,
> + DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> + DMA_TO_DEVICE);
> + if (dma_mapping_error(tx_dev, dma_data->dma_tx_addr)) {
> + dev_err(spi_imx->dev, "DMA TX map failed\n");
> + return -EINVAL;
please propagate the error value of dma_mapping_error()
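e.g.

	ret = dma_mapping_error(tx_dev, dma_data->dma_tx_addr);
	if (ret) {
		dev_err(spi_imx->dev, "DMA TX map failed\n");
		return ret;
	}

(dma_mapping_error() returns -ENOMEM on failure and 0 otherwise).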
> + }
> +
> + dma_data->dma_rx_addr = dma_map_single(rx_dev, dma_data->dma_rx_buf,
> + DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> + DMA_FROM_DEVICE);
> + if (dma_mapping_error(rx_dev, dma_data->dma_rx_addr)) {
> + dev_err(spi_imx->dev, "DMA RX map failed\n");
> + goto rx_map_err;
there's only one user of the dma_unmap_single(), so no need for the
goto.
> + }
> +
> + return 0;
> +
> +rx_map_err:
> + dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
> + DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
> + DMA_TO_DEVICE);
> + return -EINVAL;
> +}
> +
> +static int spi_imx_dma_tx_data_handle(struct spi_imx_data *spi_imx,
> + struct dma_data_package *dma_data,
> + const void *tx_buf,
> + bool word_delay)
> +{
> +#ifdef __LITTLE_ENDIAN
> + unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
> + u32 *temp;
> +#endif
move into scope of if()
> + void *copy_ptr;
> + int unaligned;
> +
> + if (word_delay) {
> + dma_data->dma_len = dma_data->data_len;
> + } else {
> + /*
> + * As per the reference manual, when burst length = 32*n + m bits, ECSPI
> + * sends m LSB bits in the first word, followed by n full 32-bit words.
> + * Since actual data may not be 4-byte aligned, allocate DMA TX/RX buffers
> + * to ensure alignment. For TX, DMA pushes 4-byte aligned words to TXFIFO,
> + * while ECSPI uses BURST_LENGTH settings to maintain correct bit count.
> + * For RX, DMA receives 32-bit words from RXFIFO; after transfer completes,
> + * trim the DMA RX buffer and copy the actual data to rx_buf.
> + */
Ahh, please add the corresponding description for rx.
> + dma_data->dma_len = ALIGN(dma_data->data_len, BYTES_PER_32BITS_WORD);
> + }
> +
> + dma_data->dma_tx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);
kzalloc()?
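i.e.

	dma_data->dma_tx_buf = kzalloc(dma_data->dma_len, GFP_KERNEL);

which is the canonical spelling of kmalloc() with __GFP_ZERO.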
> + if (!dma_data->dma_tx_buf)
> + return -ENOMEM;
> +
> + dma_data->dma_rx_buf = kmalloc(dma_data->dma_len, GFP_KERNEL | __GFP_ZERO);
> + if (!dma_data->dma_rx_buf) {
> + kfree(dma_data->dma_tx_buf);
> + return -ENOMEM;
> + }
> +
> + if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
> + unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
> + copy_ptr = (u8 *)dma_data->dma_tx_buf + BYTES_PER_32BITS_WORD - unaligned;
> + } else {
> + copy_ptr = dma_data->dma_tx_buf;
> + }
> +
> + memcpy(copy_ptr, tx_buf, dma_data->data_len);
> +
> + /*
> + * When word_delay is enabled, DMA transfers an entire word in one minor loop.
> + * In this case, no data requires additional handling.
> + */
> + if (word_delay)
> + return 0;
> +
> +#ifdef __LITTLE_ENDIAN
> + /*
> + * On little-endian CPUs, adjust byte order:
> + * - Swap bytes when bpw = 8
> + * - Swap half-words when bpw = 16
> + * This ensures correct data ordering for DMA transfers.
> + */
> + temp = dma_data->dma_tx_buf;
> + for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, BYTES_PER_32BITS_WORD); i++) {
> + if (bytes_per_word == 1)
> + swab32s(temp + i);
> + else if (bytes_per_word == 2)
> + swahw32s(temp + i);
> + }
> +#endif
> +
> + return 0;
> +}
> +
> +static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
> + struct spi_transfer *transfer,
> + bool word_delay)
> +{
> + u32 pre_bl, tail_bl;
> + u32 ctrl;
> + int ret;
> +
> + /*
> + * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds 512
> + * and is not a multiple of 512, a tail transfer is required. In this case,
> + * an extra DMA request is issued for the remaining data.
Why do you need an extra transfer in this case?
> + */
> + ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
> + if (word_delay) {
> + /*
> + * When SPI IMX need to support word delay, according to "Sample Period Control
> + * Register" shows, The Sample Period Control Register (ECSPI_PERIODREG)
> + * provides software a way to insert delays (wait states) between consecutive
> + * SPI transfers. As a result, ECSPI can only transfer one word per frame, and
> + * the delay occurs between frames.
> + */
> + spi_imx->dma_package_num = 1;
> + pre_bl = spi_imx->bits_per_word - 1;
> + } else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
> + spi_imx->dma_package_num = 1;
> + pre_bl = transfer->len * BITS_PER_BYTE - 1;
> + } else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
> + spi_imx->dma_package_num = 1;
> + pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> + } else {
> + spi_imx->dma_package_num = 2;
> + pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
> + tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
> + }
> +
> + spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
> + sizeof(struct dma_data_package),
> + GFP_KERNEL | __GFP_ZERO);
> + if (!spi_imx->dma_data) {
> + dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
> + return -ENOMEM;
> + }
> +
> + if (spi_imx->dma_package_num == 1) {
> + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> + ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> + spi_imx->dma_data[0].cmd_word = ctrl;
> + spi_imx->dma_data[0].data_len = transfer->len;
> + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
> + word_delay);
> + if (ret) {
> + kfree(spi_imx->dma_data);
> + return ret;
> + }
> + } else {
> + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> + ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> + spi_imx->dma_data[0].cmd_word = ctrl;
> + spi_imx->dma_data[0].data_len = DIV_ROUND_DOWN_ULL(transfer->len,
> + MX51_ECSPI_CTRL_MAX_BURST)
> + * MX51_ECSPI_CTRL_MAX_BURST;
> + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
> + false);
> + if (ret) {
> + kfree(spi_imx->dma_data);
> + return ret;
> + }
> +
> + ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
> + ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
> + spi_imx->dma_data[1].cmd_word = ctrl;
> + spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
> + ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
> + transfer->tx_buf + spi_imx->dma_data[0].data_len,
> + false);
> + if (ret) {
> + kfree(spi_imx->dma_data[0].dma_tx_buf);
> + kfree(spi_imx->dma_data[0].dma_rx_buf);
> + kfree(spi_imx->dma_data);
> + }
> + }
> +
> + return 0;
> +}
> +
> static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
> + struct dma_data_package *dma_data,
> struct spi_transfer *transfer)
> {
> - struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
> struct spi_controller *controller = spi_imx->controller;
> struct dma_async_tx_descriptor *desc_tx, *desc_rx;
> unsigned long transfer_timeout;
> @@ -1451,9 +1716,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
> * The TX DMA setup starts the transfer, so make sure RX is configured
> * before TX.
> */
> - desc_rx = dmaengine_prep_slave_sg(controller->dma_rx,
> - rx->sgl, rx->nents, DMA_DEV_TO_MEM,
> - DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> + desc_rx = dmaengine_prep_slave_single(controller->dma_rx, dma_data->dma_rx_addr,
> + dma_data->dma_len, DMA_DEV_TO_MEM,
> + DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> if (!desc_rx) {
> transfer->error |= SPI_TRANS_FAIL_NO_START;
> return -EINVAL;
> @@ -1471,9 +1736,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
> reinit_completion(&spi_imx->dma_rx_completion);
> dma_async_issue_pending(controller->dma_rx);
>
> - desc_tx = dmaengine_prep_slave_sg(controller->dma_tx,
> - tx->sgl, tx->nents, DMA_MEM_TO_DEV,
> - DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> + desc_tx = dmaengine_prep_slave_single(controller->dma_tx, dma_data->dma_tx_addr,
> + dma_data->dma_len, DMA_MEM_TO_DEV,
> + DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> if (!desc_tx)
> goto dmaengine_terminate_rx;
>
> @@ -1521,16 +1786,16 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
> }
>
> static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
> - struct spi_transfer *transfer)
> + struct dma_data_package *dma_data,
> + bool word_delay)
> {
> - struct sg_table *rx = &transfer->rx_sg;
> - struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents);
> - unsigned int bytes_per_word, i;
> + unsigned int bytes_per_word = word_delay ?
> + spi_imx_bytes_per_word(spi_imx->bits_per_word) :
> + BYTES_PER_32BITS_WORD;
> + unsigned int i;
>
> - /* Get the right burst length from the last sg to ensure no tail data */
> - bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word);
> for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) {
> - if (!(sg_dma_len(last_sg) % (i * bytes_per_word)))
> + if (!dma_data->dma_len % (i * bytes_per_word))
> break;
> }
> /* Use 1 as wml in case no available burst length got */
> @@ -1540,25 +1805,29 @@ static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
> spi_imx->wml = i;
> }
>
> -static int spi_imx_dma_configure(struct spi_controller *controller)
> +static int spi_imx_dma_configure(struct spi_controller *controller, bool word_delay)
> {
> int ret;
> enum dma_slave_buswidth buswidth;
> struct dma_slave_config rx = {}, tx = {};
> struct spi_imx_data *spi_imx = spi_controller_get_devdata(controller);
>
> - switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
> - case 4:
> + if (word_delay) {
> + switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
> + case 4:
> + buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
> + break;
> + case 2:
> + buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
> + break;
> + case 1:
> + buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
> + break;
> + default:
> + return -EINVAL;
> + }
> + } else {
> buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
> - break;
> - case 2:
> - buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
> - break;
> - case 1:
> - buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
> - break;
> - default:
> - return -EINVAL;
> }
>
> tx.direction = DMA_MEM_TO_DEV;
> @@ -1584,15 +1853,17 @@ static int spi_imx_dma_configure(struct spi_controller *controller)
> return 0;
> }
>
> -static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> - struct spi_transfer *transfer)
> +static int spi_imx_dma_package_transfer(struct spi_imx_data *spi_imx,
> + struct dma_data_package *dma_data,
> + struct spi_transfer *transfer,
> + bool word_delay)
> {
> struct spi_controller *controller = spi_imx->controller;
> int ret;
>
> - spi_imx_dma_max_wml_find(spi_imx, transfer);
> + spi_imx_dma_max_wml_find(spi_imx, dma_data, word_delay);
>
> - ret = spi_imx_dma_configure(controller);
> + ret = spi_imx_dma_configure(controller, word_delay);
> if (ret)
> goto dma_failure_no_start;
>
> @@ -1603,10 +1874,17 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> }
> spi_imx->devtype_data->setup_wml(spi_imx);
>
> - ret = spi_imx_dma_submit(spi_imx, transfer);
> + ret = spi_imx_dma_submit(spi_imx, dma_data, transfer);
> if (ret)
> return ret;
>
> + /* Trim the DMA RX buffer and copy the actual data to rx_buf */
> + dma_sync_single_for_cpu(controller->dma_rx->device->dev, dma_data->dma_rx_addr,
> + dma_data->dma_len, DMA_FROM_DEVICE);
> + spi_imx_dma_rx_data_handle(spi_imx, dma_data, transfer->rx_buf + spi_imx->rx_offset,
> + word_delay);
> + spi_imx->rx_offset += dma_data->data_len;
> +
> return 0;
> /* fallback to pio */
> dma_failure_no_start:
> @@ -1614,6 +1892,60 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> return ret;
> }
>
> +static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
> + struct spi_transfer *transfer)
> +{
> + bool word_delay = transfer->word_delay.value != 0;
> + int ret;
> + int i;
> +
> + ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
> + if (ret < 0) {
> + transfer->error |= SPI_TRANS_FAIL_NO_START;
> + dev_err(spi_imx->dev, "DMA data prepare fail\n");
> + goto fallback_pio;
> + }
> +
> + spi_imx->rx_offset = 0;
> +
> + /* Each dma_package performs a separate DMA transfer once */
> + for (i = 0; i < spi_imx->dma_package_num; i++) {
> + ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
> + if (ret < 0) {
> + transfer->error |= SPI_TRANS_FAIL_NO_START;
> + dev_err(spi_imx->dev, "DMA map fail\n");
> + break;
> + }
> +
> + /* Update the CTRL register BL field */
> + writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
> +
> + ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
> + transfer, word_delay);
> +
> + /* Whether the dma transmission is successful or not, dma unmap is necessary */
> + spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
> +
> + if (ret < 0) {
> + dev_dbg(spi_imx->dev, "DMA %d transfer not really finish\n", i);
> + break;
> + }
> + }
> +
> + for (int j = 0; j < spi_imx->dma_package_num; j++) {
> + kfree(spi_imx->dma_data[j].dma_tx_buf);
> + kfree(spi_imx->dma_data[j].dma_rx_buf);
> + }
> + kfree(spi_imx->dma_data);
> +
> +fallback_pio:
> + /* If no any dma package data is transferred, fallback to PIO mode transfer */
> + if ((transfer->error & SPI_TRANS_FAIL_NO_START) && i != 0)
> + transfer->error &= !SPI_TRANS_FAIL_NO_START;
> +
> + return ret;
> +}
> +
> static int spi_imx_pio_transfer(struct spi_device *spi,
> struct spi_transfer *transfer)
> {
> @@ -1780,9 +2112,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
> * transfer, the SPI transfer has already been mapped, so we
> * have to do the DMA transfer here.
> */
> - if (spi_imx->usedma)
> - return spi_imx_dma_transfer(spi_imx, transfer);
> -
> + if (spi_imx->usedma) {
> + ret = spi_imx_dma_transfer(spi_imx, transfer);
> + if (transfer->error & SPI_TRANS_FAIL_NO_START) {
> + spi_imx->usedma = false;
> + return spi_imx_pio_transfer(spi, transfer);
> + }
> + return ret;
> + }
> /* run in polling mode for short transfers */
> if (transfer->len == 1 || (polling_limit_us &&
> spi_imx_transfer_estimate_time_us(transfer) < polling_limit_us))
> --
> 2.34.1
>
>
>
regards,
Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Embedded Linux | https://www.pengutronix.de |
Vertretung Nürnberg | Phone: +49-5121-206917-129 |
Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917-9 |