Use the PCS-CCS-CB-TCB Producer-Consumer Synchronization mechanism so that
new DMA requests can be appended while the dmaengine is running.

The hardware appears to have a bug where a doorbell write is missed while
the engine is running. Add a workaround that rings the doorbell again when
the engine is found stopped.

This gives more than a 10% performance gain.

Before:
Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)

After:
Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
Signed-off-by: Frank Li <Frank.Li@nxp.com>
---
drivers/dma/dw-edma/dw-edma-core.c | 104 ++++++++++++++++++++++++++--------
drivers/dma/dw-edma/dw-edma-core.h | 2 +
drivers/dma/dw-edma/dw-edma-v0-core.c | 22 ++++++-
3 files changed, 102 insertions(+), 26 deletions(-)
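Note for reviewers (not part of the commit message): a minimal stand-alone
sketch, not driver code, of the ring arithmetic that
dw_edma_core_get_done_num() in this patch relies on. The linked list is a
ring of ll_max entries whose last entry holds only the cyclic link, so
ll_max - 1 slots carry data and the index math wraps modulo ll_max - 1.
LL_MAX and the sample indices below are made-up illustration values.

#include <stdio.h>

#define LL_MAX 8U /* hypothetical ring size, last entry reserved for the link */

/* Same formula as dw_edma_core_get_done_num(): slots completed since ll_end */
static unsigned int done_num(unsigned int ll_end, unsigned int index)
{
	return (index - ll_end + LL_MAX - 1) % (LL_MAX - 1);
}

int main(void)
{
	/* Hardware advanced from slot 5 to slot 1, wrapping past the link entry */
	printf("done slots: %u\n", done_num(5, 1)); /* prints 3 */
	return 0;
}

Compiling and running this prints "done slots: 3", i.e. slots 5, 6 and 0
completed between ll_end and the current index.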
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 678bbc4e65f0e2fced6efec88a3af6935d833bc6..5aacd04bd2da4a65aabec48f6631f6f8882eecfd 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -65,6 +65,7 @@ static void dw_edma_core_reset_ll(struct dw_edma_chan *chan)
chan->ll_head = 0;
chan->ll_end = 0;
chan->cb = true;
+ chan->cur_idx = -1;
dw_edma_core_ll_link(chan, chan->ll_max - 1, chan->cb,
chan->ll_region.paddr);
@@ -82,6 +83,12 @@ static u32 dw_edma_core_get_free_num(struct dw_edma_chan *chan)
(chan->ll_max - 1);
}
+static u32 dw_edma_core_get_done_num(struct dw_edma_chan *chan, u32 index)
+{
+ return (index - chan->ll_end + chan->ll_max - 1) % (chan->ll_max - 1);
+}
+
+/* Must be called with chan->vc.lock held */
static void dw_edma_core_start(struct dw_edma_desc *desc, bool first)
{
struct dw_edma_chan *chan = desc->chan;
@@ -94,6 +101,11 @@ static void dw_edma_core_start(struct dw_edma_desc *desc, bool first)
if (!free)
break;
+	/* Update the link entry's CB before writing the last data item */
+ if (chan->ll_head == chan->ll_max - 2)
+ dw_edma_core_ll_link(chan, chan->ll_max - 1, chan->cb,
+ chan->ll_region.paddr);
+
/* Enable irq for last free entry or last burst */
dw_edma_core_ll_data(chan, &desc->burst[i],
chan->ll_head, chan->cb,
@@ -108,32 +120,36 @@ static void dw_edma_core_start(struct dw_edma_desc *desc, bool first)
}
desc->done_burst = desc->start_burst;
- desc->start_burst += i;
+ desc->start_burst = i;
desc->ll_end = chan->ll_head;
-
- dw_edma_core_ch_doorbell(chan);
}
+/* Must be called with chan->vc.lock held */
static int dw_edma_start_transfer(struct dw_edma_chan *chan)
{
struct dw_edma_desc *desc;
struct virt_dma_desc *vd;
int index = dw_edma_core_ll_cur_idx(chan);
+ int ret = 0;
if (index < 0)
dw_edma_core_reset_ll(chan);
- vd = vchan_next_desc(&chan->vc);
- if (!vd)
- return 0;
+ list_for_each_entry(vd, &chan->vc.desc_issued, node) {
+ if (!dw_edma_core_get_free_num(chan))
+ return ret;
- desc = vd2dw_edma_desc(vd);
- if (!desc)
- return 0;
+ desc = vd2dw_edma_desc(vd);
- dw_edma_core_start(desc, !desc->start_burst);
+ if (desc->start_burst != desc->nburst) {
+ dw_edma_core_start(desc, !desc->start_burst);
+ ret = 1;
+ } else {
+ break;
+ }
+ }
- return 1;
+ return ret;
}
static void dw_hdma_set_callback_result(struct virt_dma_desc *vd,
@@ -161,6 +177,31 @@ static void dw_hdma_set_callback_result(struct virt_dma_desc *vd,
res->residue = residue;
}
+/* Must be called with chan->vc.lock held */
+static void dw_edma_ll_clean_pending(struct dw_edma_chan *chan, int idx)
+{
+ struct virt_dma_desc *vd, *_vd;
+
+ list_for_each_entry_safe(vd, _vd, &chan->vc.desc_issued, node) {
+ struct dw_edma_desc *desc = vd2dw_edma_desc(vd);
+
+ if (desc->start_burst == desc->nburst) {
+ if (dw_edma_core_get_done_num(chan, idx) >=
+ dw_edma_core_get_done_num(chan, desc->ll_end)) {
+
+ dw_hdma_set_callback_result(vd,
+ DMA_TRANS_NOERROR);
+ list_del(&vd->node);
+ vchan_cookie_complete(vd);
+ chan->ll_end = desc->ll_end;
+ }
+ } else {
+ break;
+ }
+ }
+
+}
+
static void dw_edma_device_caps(struct dma_chan *dchan,
struct dma_slave_caps *caps)
{
@@ -272,12 +313,13 @@ static void dw_edma_device_issue_pending(struct dma_chan *dchan)
return;
spin_lock_irqsave(&chan->vc.lock, flags);
- if (vchan_issue_pending(&chan->vc) && chan->request == EDMA_REQ_NONE &&
- chan->status == EDMA_ST_IDLE) {
+ if (vchan_issue_pending(&chan->vc)) {
chan->status = EDMA_ST_BUSY;
dw_edma_start_transfer(chan);
}
spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ dw_edma_core_ch_doorbell(chan);
}
static enum dma_status
@@ -290,7 +332,23 @@ dw_edma_device_tx_status(struct dma_chan *dchan, dma_cookie_t cookie,
unsigned long flags;
enum dma_status ret;
u32 residue = 0;
+ int idx;
+ ret = dma_cookie_status(dchan, cookie, txstate);
+ if (ret == DMA_COMPLETE)
+ return ret;
+
+ spin_lock_irqsave(&chan->vc.lock, flags);
+ idx = dw_edma_core_ll_cur_idx(chan);
+ if (idx != chan->cur_idx) {
+ chan->cur_idx = idx;
+
+ dw_edma_ll_clean_pending(chan, idx);
+ dw_edma_start_transfer(chan);
+ }
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+	/* Check again because dw_edma_ll_clean_pending() may have updated the cookie */
ret = dma_cookie_status(dchan, cookie, txstate);
if (ret == DMA_COMPLETE)
return ret;
@@ -545,26 +603,20 @@ dw_edma_device_prep_interleaved_dma(struct dma_chan *dchan,
static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
{
- struct dw_edma_desc *desc;
struct virt_dma_desc *vd;
unsigned long flags;
+ int idx;
spin_lock_irqsave(&chan->vc.lock, flags);
+ idx = dw_edma_core_ll_cur_idx(chan);
+ if (idx != chan->cur_idx) {
+ chan->cur_idx = idx;
+ dw_edma_ll_clean_pending(chan, idx);
+ }
vd = vchan_next_desc(&chan->vc);
if (vd) {
switch (chan->request) {
case EDMA_REQ_NONE:
- desc = vd2dw_edma_desc(vd);
- if (desc->start_burst >= desc->nburst) {
- dw_hdma_set_callback_result(vd,
- DMA_TRANS_NOERROR);
- list_del(&vd->node);
- vchan_cookie_complete(vd);
- chan->ll_end = desc->ll_end;
- }
-
- /* Continue transferring if there are remaining chunks or issued requests.
- */
chan->status = dw_edma_start_transfer(chan) ? EDMA_ST_BUSY : EDMA_ST_IDLE;
break;
@@ -585,6 +637,8 @@ static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
}
}
spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ dw_edma_core_ch_doorbell(chan);
}
static void dw_edma_abort_interrupt(struct dw_edma_chan *chan)
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index fd4b086a36441cc3209131e4274d6c47de4d616c..94d49f8359b99a9b0f8ca708edf81ca854dff4c2 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -108,6 +108,8 @@ struct dw_edma_chan {
enum dw_edma_status status;
u8 configured;
+ int cur_idx;
+
struct dma_slave_config config;
};
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index edc71a4dbc798386508e15f44e85c23e7e50f2ee..bb9a1682f943dafef28bcf52ab83f3485068f8ed 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -499,7 +499,6 @@ static void dw_edma_v0_core_ch_doorbell(struct dw_edma_chan *chan)
dw_edma_v0_sync_ll_data(chan);
- /* Doorbell */
SET_RW_32(dw, chan->dir, doorbell,
FIELD_PREP(EDMA_V0_DOORBELL_CH_MASK, chan->id));
}
@@ -517,6 +516,27 @@ static int dw_edma_v0_core_ll_cur_idx(struct dw_edma_chan *chan)
if (!val)
return -EINVAL;
+	/*
+	 * The DMA engine appears to have a hardware bug: a doorbell write
+	 * can be missed while the engine is running, so the last updated
+	 * descriptor is never fetched and the engine stops.
+	 *
+	 * The issue most likely happens as follows:
+	 *
+	 *   DMA Engine              | SW
+	 *   ==================================
+	 *   1 send read req for LL  |
+	 *   2                       | update LL
+	 *   3                       | doorbell
+	 *   4 *missed doorbell*     |
+	 *   5 get stale LL data     |
+	 *   6 DMA stops             |
+	 *
+	 * Workaround: ring the doorbell again when the engine is found stopped.
+	 */
+ if (dw_edma_v0_core_ch_status(chan) != DMA_IN_PROGRESS)
+ dw_edma_v0_core_ch_doorbell(chan);
+
return (val - (paddr & 0xFFFFFFFF)) / EDMA_LL_SZ;
}
--
2.34.1
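Note for reviewers: a stand-alone sketch, not driver code, of the doorbell
workaround pattern added to dw_edma_v0_core_ll_cur_idx() above. The names
engine_busy, doorbell_rings and read_cur_idx() are hypothetical stand-ins
for the driver's channel-status and doorbell register accessors.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the channel status and doorbell registers */
static bool engine_busy;
static int doorbell_rings;

static int read_cur_idx(void)
{
	return 3; /* pretend the engine last fetched list entry 3 */
}

static int cur_idx_with_workaround(void)
{
	int idx = read_cur_idx();

	/*
	 * If the engine went idle, the doorbell written while it was
	 * fetching the list may have been lost, so ring it once more to
	 * make it re-fetch any descriptors appended in the meantime.
	 */
	if (!engine_busy)
		doorbell_rings++;

	return idx;
}

int main(void)
{
	engine_busy = false; /* simulate the "missed doorbell, engine stopped" case */
	printf("idx=%d, extra doorbell rings=%d\n",
	       cur_idx_with_workaround(), doorbell_rings);
	return 0;
}

Running it reports one extra doorbell ring for the simulated stopped-engine
case, mirroring the workaround's intent.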
On Fri, Jan 09, 2026 at 03:13:28PM -0500, Frank Li wrote:
> This use PCS-CCS-CB-TCB Producer-Consumer Synchronization module, which
> support append new DMA request during dmaengine runnings.
>
> Append new request during dmaengine runnings.
>
> But look like hardware have bug, which missed doorbell when engine is
> running. So add workaround to push doorbelll again when found engine stop.
>
> Get more than 10% performance gain.
>
> The before
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
>
> After
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
>
> Signed-off-by: Frank Li <Frank.Li@nxp.com>
> ---
Hello Frank,
First of all, I hope that your:
[PATCH v3 0/9] dmaengine: Add new API to combine configuration and descriptor preparation
series will make it to the upcoming 6.20/7.0 merge window.
This RFT series however breaks pci-epf-test:
Before:
# RUN pci_ep_data_transfer.dma.READ_TEST ...
# OK pci_ep_data_transfer.dma.READ_TEST
ok 14 pci_ep_data_transfer.dma.READ_TEST
# RUN pci_ep_data_transfer.dma.WRITE_TEST ...
# OK pci_ep_data_transfer.dma.WRITE_TEST
ok 15 pci_ep_data_transfer.dma.WRITE_TEST
After:
# RUN pci_ep_data_transfer.dma.READ_TEST ...
# READ_TEST: Test terminated by timeout
# FAIL pci_ep_data_transfer.dma.READ_TEST
not ok 14 pci_ep_data_transfer.dma.READ_TEST
# RUN pci_ep_data_transfer.dma.WRITE_TEST ...
# WRITE_TEST: Test terminated by timeout
# FAIL pci_ep_data_transfer.dma.WRITE_TEST
not ok 15 pci_ep_data_transfer.dma.WRITE_TEST
After a bisect, first bad commit:
commit 352fd8d5ed468ea616eb4974b5ac19203528b207
Author: Frank Li <Frank.Li@nxp.com>
Date: Fri Jan 9 15:13:28 2026 -0500
dmaengine: dw-edma: Dynamitc append new request during dmaengine running
Kind regards,
Niklas
On Fri, Jan 23, 2026 at 11:41:54AM +0100, Niklas Cassel wrote:
> This RFT series however breaks pci-epf-test:
>
> After a bisect, first bad commit:
> commit 352fd8d5ed468ea616eb4974b5ac19203528b207
> Author: Frank Li <Frank.Li@nxp.com>
> Date:   Fri Jan 9 15:13:28 2026 -0500
>
>     dmaengine: dw-edma: Dynamitc append new request during dmaengine running

Thanks, let me try to fix it.

Frank