[v4] spi: spi-fsl-dspi: Target mode improvements

[PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by James Clark 3 months, 1 week ago

This will allow us to return a status from the interrupt handler in a
later commit and avoids copying it at the end of
dspi_transfer_one_message(). For consistency make polling and DMA modes
use the same mechanism.

Refactor dspi_rxtx() and dspi_poll() to not return -EINPROGRESS because
this isn't actually a status that was ever returned to the core layer
but some internal state. Wherever that was used we can look at dspi->len
instead.

No functional changes intended.

Signed-off-by: James Clark <james.clark@linaro.org>
---
 drivers/spi/spi-fsl-dspi.c | 70 ++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c
index 4bd4377551b5..65567745fe9a 100644
--- a/drivers/spi/spi-fsl-dspi.c
+++ b/drivers/spi/spi-fsl-dspi.c
@@ -591,11 +591,10 @@ static int dspi_next_xfer_dma_submit(struct fsl_dspi *dspi)
 
 static void dspi_setup_accel(struct fsl_dspi *dspi);
 
-static int dspi_dma_xfer(struct fsl_dspi *dspi)
+static void dspi_dma_xfer(struct fsl_dspi *dspi)
 {
 	struct spi_message *message = dspi->cur_msg;
 	struct device *dev = &dspi->pdev->dev;
-	int ret = 0;
 
 	/*
 	 * dspi->len gets decremented by dspi_pop_tx_pushr in
@@ -612,14 +611,12 @@ static int dspi_dma_xfer(struct fsl_dspi *dspi)
 		message->actual_length += dspi->words_in_flight *
 					  dspi->oper_word_size;
 
-		ret = dspi_next_xfer_dma_submit(dspi);
-		if (ret) {
+		message->status = dspi_next_xfer_dma_submit(dspi);
+		if (message->status) {
 			dev_err(dev, "DMA transfer failed\n");
 			break;
 		}
 	}
-
-	return ret;
 }
 
 static int dspi_request_dma(struct fsl_dspi *dspi, phys_addr_t phy_addr)
@@ -986,36 +983,40 @@ static void dspi_fifo_write(struct fsl_dspi *dspi)
 				dspi->progress, !dspi->irq);
 }
 
-static int dspi_rxtx(struct fsl_dspi *dspi)
+static void dspi_rxtx(struct fsl_dspi *dspi)
 {
 	dspi_fifo_read(dspi);
 
 	if (!dspi->len)
 		/* Success! */
-		return 0;
+		return;
 
 	dspi_fifo_write(dspi);
-
-	return -EINPROGRESS;
 }
 
-static int dspi_poll(struct fsl_dspi *dspi)
+static void dspi_poll(struct fsl_dspi *dspi)
 {
-	int tries = 1000;
+	int tries;
 	u32 spi_sr;
 
-	do {
-		regmap_read(dspi->regmap, SPI_SR, &spi_sr);
-		regmap_write(dspi->regmap, SPI_SR, spi_sr);
+	while (dspi->len) {
+		for (tries = 1000; tries > 0; --tries) {
+			regmap_read(dspi->regmap, SPI_SR, &spi_sr);
+			regmap_write(dspi->regmap, SPI_SR, spi_sr);
 
-		if (spi_sr & SPI_SR_CMDTCF)
-			break;
-	} while (--tries);
+			if (spi_sr & SPI_SR_CMDTCF)
+				break;
+		}
 
-	if (!tries)
-		return -ETIMEDOUT;
+		if (!tries) {
+			dspi->cur_msg->status = -ETIMEDOUT;
+			return;
+		}
 
-	return dspi_rxtx(dspi);
+		dspi_rxtx(dspi);
+	}
+
+	dspi->cur_msg->status = 0;
 }
 
 static irqreturn_t dspi_interrupt(int irq, void *dev_id)
@@ -1029,8 +1030,13 @@ static irqreturn_t dspi_interrupt(int irq, void *dev_id)
 	if (!(spi_sr & SPI_SR_CMDTCF))
 		return IRQ_NONE;
 
-	if (dspi_rxtx(dspi) == 0)
+	dspi_rxtx(dspi);
+
+	if (!dspi->len) {
+		if (dspi->cur_msg)
+			WRITE_ONCE(dspi->cur_msg->status, 0);
 		complete(&dspi->xfer_done);
+	}
 
 	return IRQ_HANDLED;
 }
@@ -1060,7 +1066,6 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
 	struct spi_device *spi = message->spi;
 	struct spi_transfer *transfer;
 	bool cs = false;
-	int status = 0;
 	u32 val = 0;
 	bool cs_change = false;
 
@@ -1120,7 +1125,7 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
 				       dspi->progress, !dspi->irq);
 
 		if (dspi->devtype_data->trans_mode == DSPI_DMA_MODE) {
-			status = dspi_dma_xfer(dspi);
+			dspi_dma_xfer(dspi);
 		} else {
 			/*
 			 * Reinitialize the completion before transferring data
@@ -1134,15 +1139,12 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
 
 			dspi_fifo_write(dspi);
 
-			if (dspi->irq) {
+			if (dspi->irq)
 				wait_for_completion(&dspi->xfer_done);
-			} else {
-				do {
-					status = dspi_poll(dspi);
-				} while (status == -EINPROGRESS);
-			}
+			else
+				dspi_poll(dspi);
 		}
-		if (status)
+		if (READ_ONCE(message->status))
 			break;
 
 		spi_transfer_delay_exec(transfer);
@@ -1151,7 +1153,8 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
 			dspi_deassert_cs(spi, &cs);
 	}
 
-	if (status || !cs_change) {
+	dspi->cur_msg = NULL;
+	if (message->status || !cs_change) {
 		/* Put DSPI in stop mode */
 		regmap_update_bits(dspi->regmap, SPI_MCR,
 				   SPI_MCR_HALT, SPI_MCR_HALT);
@@ -1160,10 +1163,9 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
 			;
 	}
 
-	message->status = status;
 	spi_finalize_current_message(ctlr);
 
-	return status;
+	return message->status;
 }
 
 static int dspi_set_mtf(struct fsl_dspi *dspi)

-- 
2.34.1

Re: [PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by Vladimir Oltean 3 months, 1 week ago

On Fri, Jun 27, 2025 at 11:21:38AM +0100, James Clark wrote:
> This will allow us to return a status from the interrupt handler in a
> later commit and avoids copying it at the end of
> dspi_transfer_one_message(). For consistency make polling and DMA modes
> use the same mechanism.
> 
> Refactor dspi_rxtx() and dspi_poll() to not return -EINPROGRESS because
> this isn't actually a status that was ever returned to the core layer
> but some internal state. Wherever that was used we can look at dspi->len
> instead.
> 
> No functional changes intended.
> 
> Signed-off-by: James Clark <james.clark@linaro.org>
> ---

This commit doesn't work, please do not merge this patch.

You are changing the logic in DMA mode, interrupt-based FIFO and PIO all
in one go, in a commit whose title and primary purpose is unrelated to
that. Just a mention of the type "while at it, also do that". And in
that process, that bundled refactoring introduces a subtle, but severe bug.

No, that is discouraged. Make one patch per logical change, where only
one thing is happening and which is obviously correct. It helps you and
it helps the reviewer.

Please find attached a set of 3 patches that represent a broken down and
corrected variant of this one. First 2 should be squashed together in
your next submission, they are just to illustrate the bug that you've
introduced (which can be reproduced on any SoC in XSPI mode).

The panic message is slightly confusing and does not directly point to
the issue, I'm attaching it just for the sake of having a future reference.

[    4.154185] DSA: tree 0 setup
[    4.157380] sja1105 spi2.0: Probed switch chip: SJA1105S
[    4.173894] sja1105 spi2.0: configuring for fixed/sgmii link mode
[    4.232527] sja1105 spi2.0: Link is Up - 1Gbps/Full - flow control off
[    4.312798] sja1105 spi2.0 sw0p0 (uninitialized): PHY [0000:00:00.3:07] driver [RTL8211F Gigabit Ethernet] (irq=POLL)
[    4.443689] sja1105 spi2.0 sw0p1 (uninitialized): PHY [0000:00:00.3:00] driver [Microsemi GE VSC8502 SyncE] (irq=POLL)
[    4.575718] sja1105 spi2.0 sw0p2 (uninitialized): PHY [0000:00:00.3:01] driver [Microsemi GE VSC8502 SyncE] (irq=POLL)
[    4.588012] Unable to handle kernel paging request at virtual address ffff8000801ac000
[    4.595960] Mem abort info:
[    4.598757]   ESR = 0x0000000096000007
[    4.602515]   EC = 0x25: DABT (current EL), IL = 32 bits
[    4.607843]   SET = 0, FnV = 0
[    4.610902]   EA = 0, S1PTW = 0
[    4.614048]   FSC = 0x07: level 3 translation fault
[    4.618939] Data abort info:
[    4.621822]   ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000
[    4.627323]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[    4.632388]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[    4.637714] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000082b7a000
[    4.644437] [ffff8000801ac000] pgd=0000000000000000, p4d=1000002080020403, pud=1000002080021403, pmd=1000002080022403, pte=0000000000000000
[    4.657016] Internal error: Oops: 0000000096000007 [#1]  SMP
[    4.662693] Modules linked in:
[    4.665756] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.16.0-rc3+ #30 PREEMPT
[    4.673615] Hardware name: random LS1028A board
[    4.679116] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    4.686103] pc : dspi_8on32_host_to_dev+0x8/0x24
[    4.690742] lr : dspi_fifo_write+0x178/0x1cc
[    4.695025] sp : ffff800080003eb0
[    4.698346] x29: ffff800080003ec0 x28: ffffc25414698b00 x27: ffffc2541464c170
[    4.705512] x26: 0000000000000001 x25: ffffc25414b06000 x24: 0000000111705fd3
[    4.712677] x23: ffffc25414257bae x22: ffff8000801ab5e8 x21: 00000000fffffd98
[    4.719842] x20: 0000000000000000 x19: ffff00200039a480 x18: 0000000000000006
[    4.727007] x17: ffff3dcc6b076000 x16: ffff800080000000 x15: 0000000078b30c40
[    4.734171] x14: 0000000000000000 x13: 0000000000000048 x12: 0000000000000128
[    4.741335] x11: 0000000000000001 x10: 0000000000000000 x9 : 0000000100010001
[    4.748500] x8 : ffff8000801ac000 x7 : 0000000000000000 x6 : 0000000000000000
[    4.755664] x5 : 0000000000000000 x4 : ffffc25411a308d0 x3 : 0000000000000000
[    4.762828] x2 : 0000000000000000 x1 : ffff800080003eb4 x0 : ffff00200039a480
[    4.769992] Call trace:
[    4.772441]  dspi_8on32_host_to_dev+0x8/0x24 (P)
[    4.777074]  dspi_interrupt+0x6c/0xf0
[    4.780747]  __handle_irq_event_percpu+0x8c/0x160
[    4.785470]  handle_irq_event+0x48/0xa0
[    4.789319]  handle_fasteoi_irq+0xf4/0x208
[    4.793428]  generic_handle_domain_irq+0x40/0x64
[    4.798060]  gic_handle_irq+0x4c/0x110
[    4.801820]  call_on_irq_stack+0x24/0x30
[    4.805757]  el1_interrupt+0x74/0xc0
[    4.809346]  el1h_64_irq_handler+0x18/0x24
[    4.813457]  el1h_64_irq+0x6c/0x70
[    4.816867]  arch_local_irq_enable+0x8/0xc (P)
[    4.821330]  cpuidle_enter+0x38/0x50
[    4.824914]  do_idle+0x1c4/0x250
[    4.828152]  cpu_startup_entry+0x34/0x38
[    4.832087]  kernel_init+0x0/0x1a0
[    4.835500]  start_kernel+0x2ec/0x398
[    4.839175]  __primary_switched+0x88/0x90
[    4.843200] Code: f9003008 d65f03c0 d503245f f9402c08 (b9400108)
[    4.849313] ---[ end trace 0000000000000000 ]---
[    4.853943] Kernel panic - not syncing: Oops: Fatal exception in interrupt
[    4.860840] SMP: stopping secondary CPUs
[    4.864788] Kernel Offset: 0x425391a00000 from 0xffff800080000000
[    4.870900] PHYS_OFFSET: 0xfff1000080000000
[    4.875093] CPU features: 0x1000,000804b0,02000801,0400421b
[    4.880683] Memory Limit: none
[    4.883750] ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]---

I still intend to do more testing, so please don't send the next version
just yet. Tracking down this issue took a bit more than I was planning.

Re: [PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by James Clark 2 months, 2 weeks ago


On 27/06/2025 10:30 pm, Vladimir Oltean wrote:
> On Fri, Jun 27, 2025 at 11:21:38AM +0100, James Clark wrote:
>> This will allow us to return a status from the interrupt handler in a
>> later commit and avoids copying it at the end of
>> dspi_transfer_one_message(). For consistency make polling and DMA modes
>> use the same mechanism.
>>
>> Refactor dspi_rxtx() and dspi_poll() to not return -EINPROGRESS because
>> this isn't actually a status that was ever returned to the core layer
>> but some internal state. Wherever that was used we can look at dspi->len
>> instead.
>>
>> No functional changes intended.
>>
>> Signed-off-by: James Clark <james.clark@linaro.org>
>> ---
> 
> This commit doesn't work, please do not merge this patch.
> 
> You are changing the logic in DMA mode, interrupt-based FIFO and PIO all
> in one go, in a commit whose title and primary purpose is unrelated to
> that. Just a mention of the type "while at it, also do that". And in
> that process, that bundled refactoring introduces a subtle, but severe bug.
> 
> No, that is discouraged. Make one patch per logical change, where only
> one thing is happening and which is obviously correct. It helps you and
> it helps the reviewer.
> 
> Please find attached a set of 3 patches that represent a broken down and
> corrected variant of this one. First 2 should be squashed together in
> your next submission, they are just to illustrate the bug that you've
> introduced (which can be reproduced on any SoC in XSPI mode).
> 
> The panic message is slightly confusing and does not directly point to
> the issue, I'm attaching it just for the sake of having a future reference.
> 
> [    4.154185] DSA: tree 0 setup
> [    4.157380] sja1105 spi2.0: Probed switch chip: SJA1105S
> [    4.173894] sja1105 spi2.0: configuring for fixed/sgmii link mode
> [    4.232527] sja1105 spi2.0: Link is Up - 1Gbps/Full - flow control off
> [    4.312798] sja1105 spi2.0 sw0p0 (uninitialized): PHY [0000:00:00.3:07] driver [RTL8211F Gigabit Ethernet] (irq=POLL)
> [    4.443689] sja1105 spi2.0 sw0p1 (uninitialized): PHY [0000:00:00.3:00] driver [Microsemi GE VSC8502 SyncE] (irq=POLL)
> [    4.575718] sja1105 spi2.0 sw0p2 (uninitialized): PHY [0000:00:00.3:01] driver [Microsemi GE VSC8502 SyncE] (irq=POLL)
> [    4.588012] Unable to handle kernel paging request at virtual address ffff8000801ac000
> [    4.595960] Mem abort info:
> [    4.598757]   ESR = 0x0000000096000007
> [    4.602515]   EC = 0x25: DABT (current EL), IL = 32 bits
> [    4.607843]   SET = 0, FnV = 0
> [    4.610902]   EA = 0, S1PTW = 0
> [    4.614048]   FSC = 0x07: level 3 translation fault
> [    4.618939] Data abort info:
> [    4.621822]   ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000
> [    4.627323]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> [    4.632388]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> [    4.637714] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000082b7a000
> [    4.644437] [ffff8000801ac000] pgd=0000000000000000, p4d=1000002080020403, pud=1000002080021403, pmd=1000002080022403, pte=0000000000000000
> [    4.657016] Internal error: Oops: 0000000096000007 [#1]  SMP
> [    4.662693] Modules linked in:
> [    4.665756] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.16.0-rc3+ #30 PREEMPT
> [    4.673615] Hardware name: random LS1028A board
> [    4.679116] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [    4.686103] pc : dspi_8on32_host_to_dev+0x8/0x24
> [    4.690742] lr : dspi_fifo_write+0x178/0x1cc
> [    4.695025] sp : ffff800080003eb0
> [    4.698346] x29: ffff800080003ec0 x28: ffffc25414698b00 x27: ffffc2541464c170
> [    4.705512] x26: 0000000000000001 x25: ffffc25414b06000 x24: 0000000111705fd3
> [    4.712677] x23: ffffc25414257bae x22: ffff8000801ab5e8 x21: 00000000fffffd98
> [    4.719842] x20: 0000000000000000 x19: ffff00200039a480 x18: 0000000000000006
> [    4.727007] x17: ffff3dcc6b076000 x16: ffff800080000000 x15: 0000000078b30c40
> [    4.734171] x14: 0000000000000000 x13: 0000000000000048 x12: 0000000000000128
> [    4.741335] x11: 0000000000000001 x10: 0000000000000000 x9 : 0000000100010001
> [    4.748500] x8 : ffff8000801ac000 x7 : 0000000000000000 x6 : 0000000000000000
> [    4.755664] x5 : 0000000000000000 x4 : ffffc25411a308d0 x3 : 0000000000000000
> [    4.762828] x2 : 0000000000000000 x1 : ffff800080003eb4 x0 : ffff00200039a480
> [    4.769992] Call trace:
> [    4.772441]  dspi_8on32_host_to_dev+0x8/0x24 (P)
> [    4.777074]  dspi_interrupt+0x6c/0xf0
> [    4.780747]  __handle_irq_event_percpu+0x8c/0x160
> [    4.785470]  handle_irq_event+0x48/0xa0
> [    4.789319]  handle_fasteoi_irq+0xf4/0x208
> [    4.793428]  generic_handle_domain_irq+0x40/0x64
> [    4.798060]  gic_handle_irq+0x4c/0x110
> [    4.801820]  call_on_irq_stack+0x24/0x30
> [    4.805757]  el1_interrupt+0x74/0xc0
> [    4.809346]  el1h_64_irq_handler+0x18/0x24
> [    4.813457]  el1h_64_irq+0x6c/0x70
> [    4.816867]  arch_local_irq_enable+0x8/0xc (P)
> [    4.821330]  cpuidle_enter+0x38/0x50
> [    4.824914]  do_idle+0x1c4/0x250
> [    4.828152]  cpu_startup_entry+0x34/0x38
> [    4.832087]  kernel_init+0x0/0x1a0
> [    4.835500]  start_kernel+0x2ec/0x398
> [    4.839175]  __primary_switched+0x88/0x90
> [    4.843200] Code: f9003008 d65f03c0 d503245f f9402c08 (b9400108)
> [    4.849313] ---[ end trace 0000000000000000 ]---
> [    4.853943] Kernel panic - not syncing: Oops: Fatal exception in interrupt
> [    4.860840] SMP: stopping secondary CPUs
> [    4.864788] Kernel Offset: 0x425391a00000 from 0xffff800080000000
> [    4.870900] PHYS_OFFSET: 0xfff1000080000000
> [    4.875093] CPU features: 0x1000,000804b0,02000801,0400421b
> [    4.880683] Memory Limit: none
> [    4.883750] ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]---
> 
> I still intend to do more testing, so please don't send the next version
> just yet. Tracking down this issue took a bit more than I was planning.

Hi Vladimir,

Just wanted to check if you are ok for me to send a new version with 
your fixes included now?

I assume from the other discussion that we don't want to always enable 
DMA mode either, and we'll just leave it for s32g target mode only?

Thanks
James

Re: [PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by Vladimir Oltean 2 months, 2 weeks ago

On Mon, Jul 21, 2025 at 02:25:55PM +0100, James Clark wrote:
> Hi Vladimir,
> 
> Just wanted to check if you are ok for me to send a new version with your
> fixes included now?

Yes, sorry for not being clear, the tests which I wanted to run were
these ones:
https://lore.kernel.org/linux-spi/20250630152612.npdobwbcezl5nlym@skbuf/

> I assume from the other discussion that we don't want to always enable DMA
> mode either, and we'll just leave it for s32g target mode only?

Yeah, for sure don't enable DMA mode unconditionally. I don't have time
right now to look into Mark's can_dma() suggestion - if you don't have
either, I suppose it is what it is, and the performance improvements
brought by your enhanced DMA patches can be brought over to other SoCs
at some unspecified time in the future.

Re: [PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by James Clark 2 months, 2 weeks ago

On 21/07/2025 2:39 pm, Vladimir Oltean wrote:
> On Mon, Jul 21, 2025 at 02:25:55PM +0100, James Clark wrote:
>> Hi Vladimir,
>>
>> Just wanted to check if you are ok for me to send a new version with your
>> fixes included now?
> 
> Yes, sorry for not being clear, the tests which I wanted to run were
> these ones:
> https://lore.kernel.org/linux-spi/20250630152612.npdobwbcezl5nlym@skbuf/
> 

Got it, thanks.

>> I assume from the other discussion that we don't want to always enable DMA
>> mode either, and we'll just leave it for s32g target mode only?
> 
> Yeah, for sure don't enable DMA mode unconditionally. I don't have time
> right now to look into Mark's can_dma() suggestion - if you don't have
> either, I suppose it is what it is, and the performance improvements
> brought by your enhanced DMA patches can be brought over to other SoCs
> at some unspecified time in the future.

I think incremental is better yes, there wouldn't be much or any of this 
thrown away anyway.

I'm also not sure if it would fit with can_dma as that starts mapping 
stuff in the core layer. We want to avoid that because we need to write 
that control word for each SPI word. We could still change mode 
conditionally in the DSPI driver though, or make can_dma more flexible. 
But on top of this for sure.

James

Re: [PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by Mark Brown 2 months, 2 weeks ago

On Mon, Jul 21, 2025 at 03:02:09PM +0100, James Clark wrote:

> I'm also not sure if it would fit with can_dma as that starts mapping stuff
> in the core layer. We want to avoid that because we need to write that
> control word for each SPI word. We could still change mode conditionally in
> the DSPI driver though, or make can_dma more flexible. But on top of this
> for sure.

Even if you need to open code the infrastructure the idea of a copybreak
should still be useful.

Re: [PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by James Clark 3 months, 1 week ago


On 27/06/2025 10:30 pm, Vladimir Oltean wrote:
> On Fri, Jun 27, 2025 at 11:21:38AM +0100, James Clark wrote:
>> This will allow us to return a status from the interrupt handler in a
>> later commit and avoids copying it at the end of
>> dspi_transfer_one_message(). For consistency make polling and DMA modes
>> use the same mechanism.
>>
>> Refactor dspi_rxtx() and dspi_poll() to not return -EINPROGRESS because
>> this isn't actually a status that was ever returned to the core layer
>> but some internal state. Wherever that was used we can look at dspi->len
>> instead.
>>
>> No functional changes intended.
>>
>> Signed-off-by: James Clark <james.clark@linaro.org>
>> ---
> 
> This commit doesn't work, please do not merge this patch.
> 
> You are changing the logic in DMA mode, interrupt-based FIFO and PIO all
> in one go, in a commit whose title and primary purpose is unrelated to
> that. Just a mention of the type "while at it, also do that". And in
> that process, that bundled refactoring introduces a subtle, but severe bug.
> 
> No, that is discouraged. Make one patch per logical change, where only
> one thing is happening and which is obviously correct. It helps you and
> it helps the reviewer.
> 
> Please find attached a set of 3 patches that represent a broken down and
> corrected variant of this one. First 2 should be squashed together in
> your next submission, they are just to illustrate the bug that you've
> introduced (which can be reproduced on any SoC in XSPI mode).
> 

Thanks for the debugging, yes it looks like the patches could be broken 
down a bit.

Just for clarity, is this bug affecting host+polling mode? I can see the 
logic bug in dspi_poll() which I must have tested less thoroughly, but I 
can't actually see any difference in dspi_interrupt().

WRT the patches attached, we could remove dspi_rxtx(). It's only 
encapsulating two function calls, and the return half way through is 
something that is for the outer loop in the caller anyway. Open open 
coding the dspi_fifo_read() and dspi_fifo_write() ends up being clearer IMO.

> The panic message is slightly confusing and does not directly point to
> the issue, I'm attaching it just for the sake of having a future reference.
> 

The panic is a bit confusing, I didn't expect to see dspi_interrupt() if 
it only affects polling mode. Do you have a reproducer? I wasn't able to 
reproduce it in interrupt mode.

> [    4.154185] DSA: tree 0 setup
> [    4.157380] sja1105 spi2.0: Probed switch chip: SJA1105S
> [    4.173894] sja1105 spi2.0: configuring for fixed/sgmii link mode
> [    4.232527] sja1105 spi2.0: Link is Up - 1Gbps/Full - flow control off
> [    4.312798] sja1105 spi2.0 sw0p0 (uninitialized): PHY [0000:00:00.3:07] driver [RTL8211F Gigabit Ethernet] (irq=POLL)
> [    4.443689] sja1105 spi2.0 sw0p1 (uninitialized): PHY [0000:00:00.3:00] driver [Microsemi GE VSC8502 SyncE] (irq=POLL)
> [    4.575718] sja1105 spi2.0 sw0p2 (uninitialized): PHY [0000:00:00.3:01] driver [Microsemi GE VSC8502 SyncE] (irq=POLL)
> [    4.588012] Unable to handle kernel paging request at virtual address ffff8000801ac000
> [    4.595960] Mem abort info:
> [    4.598757]   ESR = 0x0000000096000007
> [    4.602515]   EC = 0x25: DABT (current EL), IL = 32 bits
> [    4.607843]   SET = 0, FnV = 0
> [    4.610902]   EA = 0, S1PTW = 0
> [    4.614048]   FSC = 0x07: level 3 translation fault
> [    4.618939] Data abort info:
> [    4.621822]   ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000
> [    4.627323]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> [    4.632388]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> [    4.637714] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000082b7a000
> [    4.644437] [ffff8000801ac000] pgd=0000000000000000, p4d=1000002080020403, pud=1000002080021403, pmd=1000002080022403, pte=0000000000000000
> [    4.657016] Internal error: Oops: 0000000096000007 [#1]  SMP
> [    4.662693] Modules linked in:
> [    4.665756] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.16.0-rc3+ #30 PREEMPT
> [    4.673615] Hardware name: random LS1028A board
> [    4.679116] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [    4.686103] pc : dspi_8on32_host_to_dev+0x8/0x24
> [    4.690742] lr : dspi_fifo_write+0x178/0x1cc
> [    4.695025] sp : ffff800080003eb0
> [    4.698346] x29: ffff800080003ec0 x28: ffffc25414698b00 x27: ffffc2541464c170
> [    4.705512] x26: 0000000000000001 x25: ffffc25414b06000 x24: 0000000111705fd3
> [    4.712677] x23: ffffc25414257bae x22: ffff8000801ab5e8 x21: 00000000fffffd98
> [    4.719842] x20: 0000000000000000 x19: ffff00200039a480 x18: 0000000000000006
> [    4.727007] x17: ffff3dcc6b076000 x16: ffff800080000000 x15: 0000000078b30c40
> [    4.734171] x14: 0000000000000000 x13: 0000000000000048 x12: 0000000000000128
> [    4.741335] x11: 0000000000000001 x10: 0000000000000000 x9 : 0000000100010001
> [    4.748500] x8 : ffff8000801ac000 x7 : 0000000000000000 x6 : 0000000000000000
> [    4.755664] x5 : 0000000000000000 x4 : ffffc25411a308d0 x3 : 0000000000000000
> [    4.762828] x2 : 0000000000000000 x1 : ffff800080003eb4 x0 : ffff00200039a480
> [    4.769992] Call trace:
> [    4.772441]  dspi_8on32_host_to_dev+0x8/0x24 (P)
> [    4.777074]  dspi_interrupt+0x6c/0xf0
> [    4.780747]  __handle_irq_event_percpu+0x8c/0x160
> [    4.785470]  handle_irq_event+0x48/0xa0
> [    4.789319]  handle_fasteoi_irq+0xf4/0x208
> [    4.793428]  generic_handle_domain_irq+0x40/0x64
> [    4.798060]  gic_handle_irq+0x4c/0x110
> [    4.801820]  call_on_irq_stack+0x24/0x30
> [    4.805757]  el1_interrupt+0x74/0xc0
> [    4.809346]  el1h_64_irq_handler+0x18/0x24
> [    4.813457]  el1h_64_irq+0x6c/0x70
> [    4.816867]  arch_local_irq_enable+0x8/0xc (P)
> [    4.821330]  cpuidle_enter+0x38/0x50
> [    4.824914]  do_idle+0x1c4/0x250
> [    4.828152]  cpu_startup_entry+0x34/0x38
> [    4.832087]  kernel_init+0x0/0x1a0
> [    4.835500]  start_kernel+0x2ec/0x398
> [    4.839175]  __primary_switched+0x88/0x90
> [    4.843200] Code: f9003008 d65f03c0 d503245f f9402c08 (b9400108)
> [    4.849313] ---[ end trace 0000000000000000 ]---
> [    4.853943] Kernel panic - not syncing: Oops: Fatal exception in interrupt
> [    4.860840] SMP: stopping secondary CPUs
> [    4.864788] Kernel Offset: 0x425391a00000 from 0xffff800080000000
> [    4.870900] PHYS_OFFSET: 0xfff1000080000000
> [    4.875093] CPU features: 0x1000,000804b0,02000801,0400421b
> [    4.880683] Memory Limit: none
> [    4.883750] ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]---
> 
> I still intend to do more testing, so please don't send the next version
> just yet. Tracking down this issue took a bit more than I was planning.

We could also revert to the original idea of adding the status int to 
struct fsl_dspi. Much simpler and nothing needs to be changed except 
reading it out in interrupt mode.

Re: [PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by Vladimir Oltean 3 months, 1 week ago

On Mon, Jun 30, 2025 at 01:54:11PM +0100, James Clark wrote:
> On 27/06/2025 10:30 pm, Vladimir Oltean wrote:
> > On Fri, Jun 27, 2025 at 11:21:38AM +0100, James Clark wrote:
> > > This will allow us to return a status from the interrupt handler in a
> > > later commit and avoids copying it at the end of
> > > dspi_transfer_one_message(). For consistency make polling and DMA modes
> > > use the same mechanism.
> > > 
> > > Refactor dspi_rxtx() and dspi_poll() to not return -EINPROGRESS because
> > > this isn't actually a status that was ever returned to the core layer
> > > but some internal state. Wherever that was used we can look at dspi->len
> > > instead.
> > > 
> > > No functional changes intended.
> > > 
> > > Signed-off-by: James Clark <james.clark@linaro.org>
> > > ---
> > 
> > This commit doesn't work, please do not merge this patch.
> > 
> > You are changing the logic in DMA mode, interrupt-based FIFO and PIO all
> > in one go, in a commit whose title and primary purpose is unrelated to
> > that. Just a mention of the type "while at it, also do that". And in
> > that process, that bundled refactoring introduces a subtle, but severe bug.
> > 
> > No, that is discouraged. Make one patch per logical change, where only
> > one thing is happening and which is obviously correct. It helps you and
> > it helps the reviewer.
> > 
> > Please find attached a set of 3 patches that represent a broken down and
> > corrected variant of this one. First 2 should be squashed together in
> > your next submission, they are just to illustrate the bug that you've
> > introduced (which can be reproduced on any SoC in XSPI mode).
> > 
> 
> Thanks for the debugging, yes it looks like the patches could be broken down
> a bit.
> 
> Just for clarity, is this bug affecting host+polling mode? I can see the
> logic bug in dspi_poll() which I must have tested less thoroughly, but I
> can't actually see any difference in dspi_interrupt().

It should affect both, I tested your patches unmodified, i.e. interrupt
based XSPI FIFO mode (in master mode).

Assume (not real numbers, just for explanation's sake) dspi->len is 2
(2 FIFO sizes worth of 32-bit words, but let's assume for simplicity
that each dspi_pop_tx() call simply decrements the len by 1).

The correct behavior would be this:

dspi_transfer_one_message()
-> dspi->len = 2
-> dspi_fifo_write()
   -> dspi_xspi_fifo_write()
      -> dspi_pop_tx()
         -> dspi->len = 1
-> wait_for_completion(&dspi->xfer_done)
                                           <IRQ>
                                           dspi_interrupt()
                                           -> dspi_rxtx()
                                              -> dspi_fifo_read()
                                              -> dspi_fifo_write()
                                                 -> dspi_xspi_fifo_write()
                                                    -> dspi_pop_tx()
                                                       -> dspi->len = 0
                                           <IRQ>
                                           dspi_interrupt()
                                           -> dspi_rxtx()
                                              -> dspi_fifo_read()
                                           -> complete(&dspi->xfer_done)
-> reinit_completion(&dspi->xfer_done)

but the behavior with your proposed logic is this:

dspi_transfer_one_message()
-> dspi->len = 2
-> dspi_fifo_write()
   -> dspi_xspi_fifo_write()
      -> dspi_pop_tx()
         -> dspi->len = 1
-> wait_for_completion(&dspi->xfer_done)
                                           <IRQ>
                                           dspi_interrupt()
                                           -> dspi_rxtx()
                                              -> dspi_fifo_read()
                                              -> dspi_fifo_write()
                                                 -> dspi_xspi_fifo_write()
                                                    -> dspi_pop_tx()
                                                       -> dspi->len = 0
                                           -> complete(&dspi->xfer_done)
-> reinit_completion(&dspi->xfer_done)
                                           <IRQ>
                                           dspi_interrupt()
                                           -> Second interrupt is spurious at
                                              this point, since the process
                                              context may have proceeded
                                              to change pointers in
                                              dspi->cur_transfer, etc.

Clearer now? Essentially the complete() call is premature, it needs to
be not after the dspi_fifo_write() call, but after its subsequent
dspi_fifo_read(), which comes after yet another IRQ, in the IRQ-triggered
path.

Not sure why you are not able to reproduce this, maybe luck had it that
the complete() call never woke up the process context earlier than the
second IRQ in the above case triggered?

I'm not doing anything special in particular, just booted a board with a
SPI device driver (sja1105). This transfers some sequences of relatively
large buffers (256 bytes) at probe time, maybe that exercises the
controller driver more than the average peripheral driver.

Re: [PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status

Posted by James Clark 3 months, 1 week ago


On 30/06/2025 9:41 pm, Vladimir Oltean wrote:
> On Mon, Jun 30, 2025 at 01:54:11PM +0100, James Clark wrote:
>> On 27/06/2025 10:30 pm, Vladimir Oltean wrote:
>>> On Fri, Jun 27, 2025 at 11:21:38AM +0100, James Clark wrote:
>>>> This will allow us to return a status from the interrupt handler in a
>>>> later commit and avoids copying it at the end of
>>>> dspi_transfer_one_message(). For consistency make polling and DMA modes
>>>> use the same mechanism.
>>>>
>>>> Refactor dspi_rxtx() and dspi_poll() to not return -EINPROGRESS because
>>>> this isn't actually a status that was ever returned to the core layer
>>>> but some internal state. Wherever that was used we can look at dspi->len
>>>> instead.
>>>>
>>>> No functional changes intended.
>>>>
>>>> Signed-off-by: James Clark <james.clark@linaro.org>
>>>> ---
>>>
>>> This commit doesn't work, please do not merge this patch.
>>>
>>> You are changing the logic in DMA mode, interrupt-based FIFO and PIO all
>>> in one go, in a commit whose title and primary purpose is unrelated to
>>> that. Just a mention of the type "while at it, also do that". And in
>>> that process, that bundled refactoring introduces a subtle, but severe bug.
>>>
>>> No, that is discouraged. Make one patch per logical change, where only
>>> one thing is happening and which is obviously correct. It helps you and
>>> it helps the reviewer.
>>>
>>> Please find attached a set of 3 patches that represent a broken down and
>>> corrected variant of this one. First 2 should be squashed together in
>>> your next submission, they are just to illustrate the bug that you've
>>> introduced (which can be reproduced on any SoC in XSPI mode).
>>>
>>
>> Thanks for the debugging, yes it looks like the patches could be broken down
>> a bit.
>>
>> Just for clarity, is this bug affecting host+polling mode? I can see the
>> logic bug in dspi_poll() which I must have tested less thoroughly, but I
>> can't actually see any difference in dspi_interrupt().
> 
> It should affect both, I tested your patches unmodified, i.e. interrupt
> based XSPI FIFO mode (in master mode).
> 
> Assume (not real numbers, just for explanation's sake) dspi->len is 2
> (2 FIFO sizes worth of 32-bit words, but let's assume for simplicity
> that each dspi_pop_tx() call simply decrements the len by 1).
> 
> The correct behavior would be this:
> 
> dspi_transfer_one_message()
> -> dspi->len = 2
> -> dspi_fifo_write()
>     -> dspi_xspi_fifo_write()
>        -> dspi_pop_tx()
>           -> dspi->len = 1
> -> wait_for_completion(&dspi->xfer_done)
>                                             <IRQ>
>                                             dspi_interrupt()
>                                             -> dspi_rxtx()
>                                                -> dspi_fifo_read()
>                                                -> dspi_fifo_write()
>                                                   -> dspi_xspi_fifo_write()
>                                                      -> dspi_pop_tx()
>                                                         -> dspi->len = 0
>                                             <IRQ>
>                                             dspi_interrupt()
>                                             -> dspi_rxtx()
>                                                -> dspi_fifo_read()
>                                             -> complete(&dspi->xfer_done)
> -> reinit_completion(&dspi->xfer_done)
> 
> but the behavior with your proposed logic is this:
> 
> dspi_transfer_one_message()
> -> dspi->len = 2
> -> dspi_fifo_write()
>     -> dspi_xspi_fifo_write()
>        -> dspi_pop_tx()
>           -> dspi->len = 1
> -> wait_for_completion(&dspi->xfer_done)
>                                             <IRQ>
>                                             dspi_interrupt()
>                                             -> dspi_rxtx()
>                                                -> dspi_fifo_read()
>                                                -> dspi_fifo_write()
>                                                   -> dspi_xspi_fifo_write()
>                                                      -> dspi_pop_tx()
>                                                         -> dspi->len = 0
>                                             -> complete(&dspi->xfer_done)
> -> reinit_completion(&dspi->xfer_done)
>                                             <IRQ>
>                                             dspi_interrupt()
>                                             -> Second interrupt is spurious at
>                                                this point, since the process
>                                                context may have proceeded
>                                                to change pointers in
>                                                dspi->cur_transfer, etc.
> 
> Clearer now? Essentially the complete() call is premature, it needs to
> be not after the dspi_fifo_write() call, but after its subsequent
> dspi_fifo_read(), which comes after yet another IRQ, in the IRQ-triggered
> path.
> 

Much clearer, thanks. Not sure how I missed that, maybe a confusion 
about whether it was dspi_fifo_read() or dspi_fifo_write() that modifies 
  dspi->len.

> Not sure why you are not able to reproduce this, maybe luck had it that
> the complete() call never woke up the process context earlier than the
> second IRQ in the above case triggered?
> 
> I'm not doing anything special in particular, just booted a board with a
> SPI device driver (sja1105). This transfers some sequences of relatively
> large buffers (256 bytes) at probe time, maybe that exercises the
> controller driver more than the average peripheral driver.

It's strange because I was stressing it quite a lot, especially with the 
performance testing.

[PATCH v4 1/6] spi: spi-fsl-dspi: Clear completion counter before initiating transfer
[PATCH v4 2/6] spi: spi-fsl-dspi: Store status directly in cur_msg->status
[PATCH v4 3/6] spi: spi-fsl-dspi: Stub out DMA functions
[PATCH v4 4/6] spi: spi-fsl-dspi: Use non-coherent memory for DMA
[PATCH v4 5/6] spi: spi-fsl-dspi: Increase DMA buffer size
[PATCH v4 6/6] spi: spi-fsl-dspi: Report FIFO overflows as errors