dmaengine: xilinx_dma: Fix CPU stall in xilinx_dma_poll_timeout

[PATCH] dmaengine: xilinx_dma: Fix CPU stall in xilinx_dma_poll_timeout

Posted by Alex Bereza 14 hours ago

Currently when calling xilinx_dma_poll_timeout with delay_us=0 and a
condition that is never fulfilled, the CPU busy-waits for prolonged time
and the timeout triggers only with a massive delay causing a CPU stall.

This happens due to a huge underestimation of wall clock time in
poll_timeout_us_atomic. Commit 7349a69cf312 ("iopoll: Do not use
timekeeping in read_poll_timeout_atomic()") changed the behavior to no
longer use ktime_get at the expense of underestimation of wall clock
time which appears to be very large for delay_us=0. Instead of timing
out after approximately XILINX_DMA_LOOP_COUNT microseconds, the timeout
takes XILINX_DMA_LOOP_COUNT * 1000 * (time that the overhead of the for
loop in poll_timeout_us_atomic takes) which is in the range of several
minutes for XILINX_DMA_LOOP_COUNT=1000000. Fix this by using a non-zero
value for delay_us. Use delay_us=10 to keep the delay in the hot path of
starting DMA transfers minimal but still avoid CPU stalls in case of
unexpected hardware failures.

One-off measurement with delay_us=0 causes the cpu to busy wait around 7
minutes in the timeout case. After applying this patch with delay_us=10
the measured timeout was 1053428 microseconds which is roughly
equivalent to the expected 1000000 microseconds specified in
XILINX_DMA_POLL_TIMEOUT_US.

Rename XILINX_DMA_LOOP_COUNT to XILINX_DMA_POLL_TIMEOUT_US because the
former is incorrect. It is a timeout value for polling various register
bits in microseconds. It is not a loop count. Add a constant
XILINX_DMA_POLL_DELAY_US for delay_us value.

Fixes: 7349a69cf312 ("iopoll: Do not use timekeeping in read_poll_timeout_atomic()")
Signed-off-by: Alex Bereza <alex@bereza.email>
---
Hi, in addition to this patch I also have a question: what is the point
of atomically polling for the HALTED or IDLE bit in the stop_transfer
functions? Does device_terminate_all really need to be callable from
atomic context? If not, one could switch to polling non-atomically and
avoid burning CPU cycles.

As this is my first patch, please feel free to point me in the right
direction if I am missing anything.
---
 drivers/dma/xilinx/xilinx_dma.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index 02a05f215614..8556c357b665 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -165,8 +165,10 @@
 #define XILINX_DMA_FLUSH_MM2S		2
 #define XILINX_DMA_FLUSH_BOTH		1
 
-/* Delay loop counter to prevent hardware failure */
-#define XILINX_DMA_LOOP_COUNT		1000000
+/* Timeout for polling various registers */
+#define XILINX_DMA_POLL_TIMEOUT_US		1000000
+/* Delay between polls (avoid a delay of 0 to prevent CPU stalls) */
+#define XILINX_DMA_POLL_DELAY_US		10
 
 /* AXI DMA Specific Registers/Offsets */
 #define XILINX_DMA_REG_SRCDSTADDR	0x18
@@ -1332,8 +1334,9 @@ static int xilinx_dma_stop_transfer(struct xilinx_dma_chan *chan)
 
 	/* Wait for the hardware to halt */
 	return xilinx_dma_poll_timeout(chan, XILINX_DMA_REG_DMASR, val,
-				       val & XILINX_DMA_DMASR_HALTED, 0,
-				       XILINX_DMA_LOOP_COUNT);
+				       val & XILINX_DMA_DMASR_HALTED,
+				       XILINX_DMA_POLL_DELAY_US,
+				       XILINX_DMA_POLL_TIMEOUT_US);
 }
 
 /**
@@ -1347,8 +1350,9 @@ static int xilinx_cdma_stop_transfer(struct xilinx_dma_chan *chan)
 	u32 val;
 
 	return xilinx_dma_poll_timeout(chan, XILINX_DMA_REG_DMASR, val,
-				       val & XILINX_DMA_DMASR_IDLE, 0,
-				       XILINX_DMA_LOOP_COUNT);
+				       val & XILINX_DMA_DMASR_IDLE,
+				       XILINX_DMA_POLL_DELAY_US,
+				       XILINX_DMA_POLL_TIMEOUT_US);
 }
 
 /**
@@ -1364,8 +1368,9 @@ static void xilinx_dma_start(struct xilinx_dma_chan *chan)
 
 	/* Wait for the hardware to start */
 	err = xilinx_dma_poll_timeout(chan, XILINX_DMA_REG_DMASR, val,
-				      !(val & XILINX_DMA_DMASR_HALTED), 0,
-				      XILINX_DMA_LOOP_COUNT);
+				      !(val & XILINX_DMA_DMASR_HALTED),
+				      XILINX_DMA_POLL_DELAY_US,
+				      XILINX_DMA_POLL_TIMEOUT_US);
 
 	if (err) {
 		dev_err(chan->dev, "Cannot start channel %p: %x\n",
@@ -1780,8 +1785,9 @@ static int xilinx_dma_reset(struct xilinx_dma_chan *chan)
 
 	/* Wait for the hardware to finish reset */
 	err = xilinx_dma_poll_timeout(chan, XILINX_DMA_REG_DMACR, tmp,
-				      !(tmp & XILINX_DMA_DMACR_RESET), 0,
-				      XILINX_DMA_LOOP_COUNT);
+				      !(tmp & XILINX_DMA_DMACR_RESET),
+				      XILINX_DMA_POLL_DELAY_US,
+				      XILINX_DMA_POLL_TIMEOUT_US);
 
 	if (err) {
 		dev_err(chan->dev, "reset timeout, cr %x, sr %x\n",

---
base-commit: b7560798466a07d9c3fb011698e92c335ab28baf
change-id: 20260330-fix-atomic-poll-timeout-regression-4f4e3baf3fd7

Best regards,
--  
Alex Bereza <alex@bereza.email>

Re: [PATCH] dmaengine: xilinx_dma: Fix CPU stall in xilinx_dma_poll_timeout

Posted by Gupta, Suraj an hour ago


On 3/31/2026 10:14 PM, Alex Bereza wrote:
> [You don't often get email from alex@bereza.email. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> 
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
> 
> 
> Currently when calling xilinx_dma_poll_timeout with delay_us=0 and a
> condition that is never fulfilled, the CPU busy-waits for prolonged time
> and the timeout triggers only with a massive delay causing a CPU stall.
> 
> This happens due to a huge underestimation of wall clock time in
> poll_timeout_us_atomic. Commit 7349a69cf312 ("iopoll: Do not use
> timekeeping in read_poll_timeout_atomic()") changed the behavior to no
> longer use ktime_get at the expense of underestimation of wall clock
> time which appears to be very large for delay_us=0. Instead of timing
> out after approximately XILINX_DMA_LOOP_COUNT microseconds, the timeout
> takes XILINX_DMA_LOOP_COUNT * 1000 * (time that the overhead of the for
> loop in poll_timeout_us_atomic takes) which is in the range of several
> minutes for XILINX_DMA_LOOP_COUNT=1000000. Fix this by using a non-zero
> value for delay_us. Use delay_us=10 to keep the delay in the hot path of
> starting DMA transfers minimal but still avoid CPU stalls in case of
> unexpected hardware failures.
> 
> One-off measurement with delay_us=0 causes the cpu to busy wait around 7
> minutes in the timeout case. After applying this patch with delay_us=10
> the measured timeout was 1053428 microseconds which is roughly
> equivalent to the expected 1000000 microseconds specified in
> XILINX_DMA_POLL_TIMEOUT_US.
> 
> Rename XILINX_DMA_LOOP_COUNT to XILINX_DMA_POLL_TIMEOUT_US because the
> former is incorrect. It is a timeout value for polling various register
> bits in microseconds. It is not a loop count. Add a constant
> XILINX_DMA_POLL_DELAY_US for delay_us value.

Please split this change in a new patch.

> 
> Fixes: 7349a69cf312 ("iopoll: Do not use timekeeping in read_poll_timeout_atomic()")

This patch doesn't fixes anything in iopoll, please use correct fixes tag.

> Signed-off-by: Alex Bereza <alex@bereza.email>
> ---
> Hi, in addition to this patch I also have a question: what is the point
> of atomically polling for the HALTED or IDLE bit in the stop_transfer
> functions? Does device_terminate_all really need to be callable from
> atomic context? If not, one could switch to polling non-atomically and
> avoid burning CPU cycles.
> 

dmaengine_terminate_async(), which directly calls device_terminate_all 
can be called from atomic context.

Regards,
Suraj

> As this is my first patch, please feel free to point me in the right
> direction if I am missing anything.
> ---
>   drivers/dma/xilinx/xilinx_dma.c | 26 ++++++++++++++++----------
>   1 file changed, 16 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
> index 02a05f215614..8556c357b665 100644
> --- a/drivers/dma/xilinx/xilinx_dma.c
> +++ b/drivers/dma/xilinx/xilinx_dma.c
> @@ -165,8 +165,10 @@
>   #define XILINX_DMA_FLUSH_MM2S          2
>   #define XILINX_DMA_FLUSH_BOTH          1
> 
> -/* Delay loop counter to prevent hardware failure */
> -#define XILINX_DMA_LOOP_COUNT          1000000
> +/* Timeout for polling various registers */
> +#define XILINX_DMA_POLL_TIMEOUT_US             1000000
> +/* Delay between polls (avoid a delay of 0 to prevent CPU stalls) */
> +#define XILINX_DMA_POLL_DELAY_US               10
> 
>   /* AXI DMA Specific Registers/Offsets */
>   #define XILINX_DMA_REG_SRCDSTADDR      0x18
> @@ -1332,8 +1334,9 @@ static int xilinx_dma_stop_transfer(struct xilinx_dma_chan *chan)
> 
>          /* Wait for the hardware to halt */
>          return xilinx_dma_poll_timeout(chan, XILINX_DMA_REG_DMASR, val,
> -                                      val & XILINX_DMA_DMASR_HALTED, 0,
> -                                      XILINX_DMA_LOOP_COUNT);
> +                                      val & XILINX_DMA_DMASR_HALTED,
> +                                      XILINX_DMA_POLL_DELAY_US,
> +                                      XILINX_DMA_POLL_TIMEOUT_US);
>   }
> 
>   /**
> @@ -1347,8 +1350,9 @@ static int xilinx_cdma_stop_transfer(struct xilinx_dma_chan *chan)
>          u32 val;
> 
>          return xilinx_dma_poll_timeout(chan, XILINX_DMA_REG_DMASR, val,
> -                                      val & XILINX_DMA_DMASR_IDLE, 0,
> -                                      XILINX_DMA_LOOP_COUNT);
> +                                      val & XILINX_DMA_DMASR_IDLE,
> +                                      XILINX_DMA_POLL_DELAY_US,
> +                                      XILINX_DMA_POLL_TIMEOUT_US);
>   }
> 
>   /**
> @@ -1364,8 +1368,9 @@ static void xilinx_dma_start(struct xilinx_dma_chan *chan)
> 
>          /* Wait for the hardware to start */
>          err = xilinx_dma_poll_timeout(chan, XILINX_DMA_REG_DMASR, val,
> -                                     !(val & XILINX_DMA_DMASR_HALTED), 0,
> -                                     XILINX_DMA_LOOP_COUNT);
> +                                     !(val & XILINX_DMA_DMASR_HALTED),
> +                                     XILINX_DMA_POLL_DELAY_US,
> +                                     XILINX_DMA_POLL_TIMEOUT_US);
> 
>          if (err) {
>                  dev_err(chan->dev, "Cannot start channel %p: %x\n",
> @@ -1780,8 +1785,9 @@ static int xilinx_dma_reset(struct xilinx_dma_chan *chan)
> 
>          /* Wait for the hardware to finish reset */
>          err = xilinx_dma_poll_timeout(chan, XILINX_DMA_REG_DMACR, tmp,
> -                                     !(tmp & XILINX_DMA_DMACR_RESET), 0,
> -                                     XILINX_DMA_LOOP_COUNT);
> +                                     !(tmp & XILINX_DMA_DMACR_RESET),
> +                                     XILINX_DMA_POLL_DELAY_US,
> +                                     XILINX_DMA_POLL_TIMEOUT_US);
> 
>          if (err) {
>                  dev_err(chan->dev, "reset timeout, cr %x, sr %x\n",
> 
> ---
> base-commit: b7560798466a07d9c3fb011698e92c335ab28baf
> change-id: 20260330-fix-atomic-poll-timeout-regression-4f4e3baf3fd7
> 
> Best regards,
> --
> Alex Bereza <alex@bereza.email>
> 
>