[v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

[PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Richard Fitzgerald 2 months, 2 weeks ago

Issue DP prepare to all ports that use full CP_SM. Then wait for the
prepare to complete. This allow all the DP to prepare in parallel to
reduce the latency of starting an audio stream.

On a system with six CS35L56 amps, this reduces the startup latency,
from runtime_resume to all amps ready to play, from ~160 ms to ~60 ms.

(Test hardware: UpXtreme i14, BIOS v1.2, Core Ultra 7 155H, 3x CS35L56
on link 0, 3x CS35L56 on link 1).

An initial read of DPn_PREPARESTATUS is done before dropping into the wait,
so that a quick exit can be made if the port is already prepared. Currently
this is essential because the wait deadlocks - the stream setup takes
bus_lock, which blocks the interrupt handler - so the wait for completion
will always timeout.

However, an experiment of removing the bus_lock from stream setup, so that
the interrupt will work, shows that wait for completion takes ~700..800 us
but the quick-exit read takes 50..200 us. So the quick exit is still
valuable even if the stream.c code was rewritten to allow the completion
interrupt to work. Rewriting the code so it doesn't take bus_lock is risky.
The deadlock only lasts until the wait times out so it's not a serious
problem now that the DP prepare happens in parallel.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
---
Changes in V3:
- Removed duplicate deferencing of s_rt->slave->prop.dp0_prop.
  V2 saved it into dp0_prop, so use that.

Changes in V2:
- Fixed missing initialization of dp0_prop.

 drivers/soundwire/stream.c | 101 +++++++++++++++++++++++++++++--------
 1 file changed, 81 insertions(+), 20 deletions(-)

diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c
index 38c9dbd35606..15d66262ad37 100644
--- a/drivers/soundwire/stream.c
+++ b/drivers/soundwire/stream.c
@@ -443,14 +443,12 @@ static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
 				       struct sdw_port_runtime *p_rt,
 				       bool prep)
 {
-	struct completion *port_ready;
 	struct sdw_dpn_prop *dpn_prop;
 	struct sdw_prepare_ch prep_ch;
 	u32 imp_def_interrupts;
 	bool simple_ch_prep_sm;
-	u32 ch_prep_timeout;
 	bool intr = false;
-	int ret = 0, val;
+	int ret = 0;
 	u32 addr;
 
 	prep_ch.num = p_rt->num;
@@ -466,7 +464,6 @@ static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
 
 		imp_def_interrupts = dpn_prop->imp_def_interrupts;
 		simple_ch_prep_sm = dpn_prop->simple_ch_prep_sm;
-		ch_prep_timeout = dpn_prop->ch_prep_timeout;
 	} else {
 		struct sdw_dp0_prop *dp0_prop = s_rt->slave->prop.dp0_prop;
 
@@ -477,7 +474,6 @@ static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
 		}
 		imp_def_interrupts = dp0_prop->imp_def_interrupts;
 		simple_ch_prep_sm =  dp0_prop->simple_ch_prep_sm;
-		ch_prep_timeout = dp0_prop->ch_prep_timeout;
 	}
 
 	prep_ch.prepare = prep;
@@ -518,23 +514,16 @@ static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
 			return ret;
 		}
 
-		/* Wait for completion on port ready */
-		port_ready = &s_rt->slave->port_ready[prep_ch.num];
-		wait_for_completion_timeout(port_ready,
-			msecs_to_jiffies(ch_prep_timeout));
-
-		val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
-		if ((val < 0) || (val & p_rt->ch_mask)) {
-			ret = (val < 0) ? val : -ETIMEDOUT;
-			dev_err(&s_rt->slave->dev,
-				"Chn prep failed for port %d: %d\n", prep_ch.num, ret);
-			return ret;
-		}
+		/*
+		 * Defer wait for completion to allow all peripherals to
+		 * prepare in parallel.
+		 */
+	} else {
+		/* Inform slaves about ports prepared */
+		sdw_do_port_prep(s_rt, prep_ch,
+				 prep ? SDW_OPS_PORT_POST_PREP : SDW_OPS_PORT_POST_DEPREP);
 	}
 
-	/* Inform slaves about ports prepared */
-	sdw_do_port_prep(s_rt, prep_ch, prep ? SDW_OPS_PORT_POST_PREP : SDW_OPS_PORT_POST_DEPREP);
-
 	/* Disable interrupt after Port de-prepare */
 	if (!prep && intr)
 		ret = sdw_configure_dpn_intr(s_rt->slave, p_rt->num, prep,
@@ -543,6 +532,67 @@ static int sdw_prep_deprep_slave_ports(struct sdw_bus *bus,
 	return ret;
 }
 
+static int sdw_wait_prep_slave_ports(struct sdw_bus *bus,
+				     struct sdw_slave_runtime *s_rt,
+				     struct sdw_port_runtime *p_rt)
+{
+	struct completion *port_ready;
+	struct sdw_dpn_prop *dpn_prop;
+	struct sdw_dp0_prop *dp0_prop;
+	struct sdw_prepare_ch prep_ch;
+	bool simple_ch_prep_sm;
+	u32 ch_prep_timeout;
+	int ret, val;
+
+	if (p_rt->num) {
+		dpn_prop = sdw_get_slave_dpn_prop(s_rt->slave, s_rt->direction, p_rt->num);
+		simple_ch_prep_sm = dpn_prop->simple_ch_prep_sm;
+		ch_prep_timeout = dpn_prop->ch_prep_timeout;
+	} else {
+		dp0_prop = s_rt->slave->prop.dp0_prop;
+		simple_ch_prep_sm = dp0_prop->simple_ch_prep_sm;
+		ch_prep_timeout = dp0_prop->ch_prep_timeout;
+	}
+
+	if (simple_ch_prep_sm)
+		return 0;
+
+	/*
+	 * Check if already prepared. Avoid overhead of waiting for interrupt
+	 * and port_ready completion if we don't need to.
+	 */
+	val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
+	if (val < 0) {
+		ret = val;
+		goto err;
+	}
+
+	if (val & p_rt->ch_mask) {
+		/* Wait for completion on port ready */
+		port_ready = &s_rt->slave->port_ready[p_rt->num];
+		wait_for_completion_timeout(port_ready, msecs_to_jiffies(ch_prep_timeout));
+		val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
+		if ((val < 0) || (val & p_rt->ch_mask)) {
+			ret = (val < 0) ? val : -ETIMEDOUT;
+			goto err;
+		}
+	}
+
+	/* Inform slaves about ports prepared */
+	prep_ch.num = p_rt->num;
+	prep_ch.ch_mask = p_rt->ch_mask;
+	prep_ch.prepare = true;
+	prep_ch.bank = bus->params.next_bank;
+	sdw_do_port_prep(s_rt, prep_ch, SDW_OPS_PORT_POST_PREP);
+
+	return 0;
+
+err:
+	dev_err(&s_rt->slave->dev, "Chn prep failed for port %d: %d\n", p_rt->num, ret);
+
+	return ret;
+}
+
 static int sdw_prep_deprep_master_ports(struct sdw_master_runtime *m_rt,
 					struct sdw_port_runtime *p_rt,
 					bool prep)
@@ -594,6 +644,17 @@ static int sdw_prep_deprep_ports(struct sdw_master_runtime *m_rt, bool prep)
 		}
 	}
 
+	/* Wait for parallel CP_SM prepare completion */
+	if (prep) {
+		list_for_each_entry(s_rt, &m_rt->slave_rt_list, m_rt_node) {
+			list_for_each_entry(p_rt, &s_rt->port_list, port_node) {
+				ret = sdw_wait_prep_slave_ports(m_rt->bus, s_rt, p_rt);
+				if (ret < 0)
+					return ret;
+			}
+		}
+	}
+
 	/* Prepare/De-prepare Master port(s) */
 	list_for_each_entry(p_rt, &m_rt->port_list, port_node) {
 		ret = sdw_prep_deprep_master_ports(m_rt, p_rt, prep);
-- 
2.47.3

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Pierre-Louis Bossart 2 months ago

On 11/25/25 16:56, Richard Fitzgerald wrote:
> Issue DP prepare to all ports that use full CP_SM. Then wait for the
> prepare to complete. This allow all the DP to prepare in parallel to
> reduce the latency of starting an audio stream.
> 
> On a system with six CS35L56 amps, this reduces the startup latency,
> from runtime_resume to all amps ready to play, from ~160 ms to ~60 ms.
> 
> (Test hardware: UpXtreme i14, BIOS v1.2, Core Ultra 7 155H, 3x CS35L56
> on link 0, 3x CS35L56 on link 1).
> 
> An initial read of DPn_PREPARESTATUS is done before dropping into the wait,
> so that a quick exit can be made if the port is already prepared. Currently
> this is essential because the wait deadlocks - the stream setup takes
> bus_lock, which blocks the interrupt handler - so the wait for completion
> will always timeout.
> 
> However, an experiment of removing the bus_lock from stream setup, so that
> the interrupt will work, shows that wait for completion takes ~700..800 us
> but the quick-exit read takes 50..200 us. So the quick exit is still
> valuable even if the stream.c code was rewritten to allow the completion
> interrupt to work. Rewriting the code so it doesn't take bus_lock is risky.
> The deadlock only lasts until the wait times out so it's not a serious
> problem now that the DP prepare happens in parallel.

The only limitation I see with the stream mechanism is that we cannot start two or more streams at the same time, even if the hardware supports it. On paper it'd be interesting to e.g. start capture and playback with the same trigger (bank switch). Like you said I am not sure anyone is ready for now to test all the corner cases to remove this bus lock.

> 
> Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
> ---
> Changes in V3:
> - Removed duplicate deferencing of s_rt->slave->prop.dp0_prop.
>   V2 saved it into dp0_prop, so use that.
> 
> Changes in V2:
> +	if (simple_ch_prep_sm)
> +		return 0;
> +
> +	/*
> +	 * Check if already prepared. Avoid overhead of waiting for interrupt
> +	 * and port_ready completion if we don't need to.
> +	 */
> +	val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
> +	if (val < 0) {
> +		ret = val;
> +		goto err;
> +	}
> +
> +	if (val & p_rt->ch_mask) {

Can you explain why we don't use the ch_mask in the already-prepared case? I am missing something.

> +		/* Wait for completion on port ready */
> +		port_ready = &s_rt->slave->port_ready[p_rt->num];
> +		wait_for_completion_timeout(port_ready, msecs_to_jiffies(ch_prep_timeout));

I understand the code is the same as before but would there be any merit in checking the timeout before starting a read? If the device is already in the weeds, doing another read adds even more time before reporting an error.

> +		val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
> +		if ((val < 0) || (val & p_rt->ch_mask)) {
> +			ret = (val < 0) ? val : -ETIMEDOUT;
> +			goto err;
> +		}
> +	}
T

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Richard Fitzgerald 2 months ago

On 09/12/2025 1:04 pm, Pierre-Louis Bossart wrote:
> On 11/25/25 16:56, Richard Fitzgerald wrote:
>> Issue DP prepare to all ports that use full CP_SM. Then wait for the
>> prepare to complete. This allow all the DP to prepare in parallel to
>> reduce the latency of starting an audio stream.
>>
>> On a system with six CS35L56 amps, this reduces the startup latency,
>> from runtime_resume to all amps ready to play, from ~160 ms to ~60 ms.
>>
>> (Test hardware: UpXtreme i14, BIOS v1.2, Core Ultra 7 155H, 3x CS35L56
>> on link 0, 3x CS35L56 on link 1).
>>
>> An initial read of DPn_PREPARESTATUS is done before dropping into the wait,
>> so that a quick exit can be made if the port is already prepared. Currently
>> this is essential because the wait deadlocks - the stream setup takes
>> bus_lock, which blocks the interrupt handler - so the wait for completion
>> will always timeout.
>>
>> However, an experiment of removing the bus_lock from stream setup, so that
>> the interrupt will work, shows that wait for completion takes ~700..800 us
>> but the quick-exit read takes 50..200 us. So the quick exit is still
>> valuable even if the stream.c code was rewritten to allow the completion
>> interrupt to work. Rewriting the code so it doesn't take bus_lock is risky.
>> The deadlock only lasts until the wait times out so it's not a serious
>> problem now that the DP prepare happens in parallel.
> 
> The only limitation I see with the stream mechanism is that we cannot start two or more streams at the same time, even if the hardware supports it. On paper it'd be interesting to e.g. start capture and playback with the same trigger (bank switch). Like you said I am not sure anyone is ready for now to test all the corner cases to remove this bus lock.
> 
>>
>> Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
>> ---
>> Changes in V3:
>> - Removed duplicate deferencing of s_rt->slave->prop.dp0_prop.
>>    V2 saved it into dp0_prop, so use that.
>>
>> Changes in V2:
>> +	if (simple_ch_prep_sm)
>> +		return 0;
>> +
>> +	/*
>> +	 * Check if already prepared. Avoid overhead of waiting for interrupt
>> +	 * and port_ready completion if we don't need to.
>> +	 */
>> +	val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>> +	if (val < 0) {
>> +		ret = val;
>> +		goto err;
>> +	}
>> +
>> +	if (val & p_rt->ch_mask) {
> 
> Can you explain why we don't use the ch_mask in the already-prepared case? I am missing something.
> 
I'm not sure what you mean here. The if() immediately above your comment
uses ch_mask to check the already-prepared state.

>> +		/* Wait for completion on port ready */
>> +		port_ready = &s_rt->slave->port_ready[p_rt->num];
>> +		wait_for_completion_timeout(port_ready, msecs_to_jiffies(ch_prep_timeout));
> 
> I understand the code is the same as before but would there be any merit in checking the timeout before starting a read? If the device is already in the weeds, doing another read adds even more time before reporting an error.
> 
Do you mean save the system time when the DPN_PREPARE was written to
that peripheral and then check here whether the timeout period has
already elapsed?

If that's what you mean, I don't see much advantage in that. If the
hardware is working correctly, this will be detected by the read above
that checks if the peripheral has already prepared. If it has we skip
the wait_for_completion_timeout().

If the peripheral is "in the weeds", so that its prepare time has
already passed and it still isn't ready, we're no longer in a state
where we care about minimizing audio startup time because the hardware
is now broken. So it's probably not worth complicating the code to
take a few milliseconds off that case.

>> +		val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>> +		if ((val < 0) || (val & p_rt->ch_mask)) {
>> +			ret = (val < 0) ? val : -ETIMEDOUT;
>> +			goto err;
>> +		}
>> +	}
> T
>

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Pierre-Louis Bossart 2 months ago

>>> Changes in V2:
>>> +    if (simple_ch_prep_sm)
>>> +        return 0;
>>> +
>>> +    /*
>>> +     * Check if already prepared. Avoid overhead of waiting for interrupt
>>> +     * and port_ready completion if we don't need to.
>>> +     */

1.

>>> +    val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>>> +    if (val < 0) {
>>> +        ret = val;
>>> +        goto err;
>>> +    }
>>> +
>>> +    if (val & p_rt->ch_mask) {
>>
>> Can you explain why we don't use the ch_mask in the already-prepared case? I am missing something.
>>
> I'm not sure what you mean here. The if() immediately above your comment
> uses ch_mask to check the already-prepared state.

I was referring to the 1. above, you read the prepare status without checking for ch_mask first.

>>> +        /* Wait for completion on port ready */
>>> +        port_ready = &s_rt->slave->port_ready[p_rt->num];
>>> +        wait_for_completion_timeout(port_ready, msecs_to_jiffies(ch_prep_timeout));
>>
>> I understand the code is the same as before but would there be any merit in checking the timeout before starting a read? If the device is already in the weeds, doing another read adds even more time before reporting an error.
>>
> Do you mean save the system time when the DPN_PREPARE was written to
> that peripheral and then check here whether the timeout period has
> already elapsed?

I meant testing the return value of wait_for_completion_timeout(). If you already timed out at this point with a return value of zero, there's no point in checking the status any more, the system is in the weeds.

> If that's what you mean, I don't see much advantage in that. If the
> hardware is working correctly, this will be detected by the read above
> that checks if the peripheral has already prepared. If it has we skip
> the wait_for_completion_timeout().
> 
> If the peripheral is "in the weeds", so that its prepare time has
> already passed and it still isn't ready, we're no longer in a state
> where we care about minimizing audio startup time because the hardware
> is now broken. So it's probably not worth complicating the code to
> take a few milliseconds off that case.

I agree it's no longer about minimizing the start time but rather providing an error faster, without waiting for a second timeout on read.
 
>>> +        val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>>> +        if ((val < 0) || (val & p_rt->ch_mask)) {
>>> +            ret = (val < 0) ? val : -ETIMEDOUT;
>>> +            goto err;
>>> +        }
>>> +    }
>> T
>>
>

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Richard Fitzgerald 1 month, 4 weeks ago

On 9/12/25 16:41, Pierre-Louis Bossart wrote:
> 
>>>> Changes in V2:
>>>> +    if (simple_ch_prep_sm)
>>>> +        return 0;
>>>> +
>>>> +    /*
>>>> +     * Check if already prepared. Avoid overhead of waiting for interrupt
>>>> +     * and port_ready completion if we don't need to.
>>>> +     */
> 
> 1.
> 
>>>> +    val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>>>> +    if (val < 0) {
>>>> +        ret = val;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    if (val & p_rt->ch_mask) {
>>>
>>> Can you explain why we don't use the ch_mask in the already-prepared case? I am missing something.
>>>
>> I'm not sure what you mean here. The if() immediately above your comment
>> uses ch_mask to check the already-prepared state.
> 
> I was referring to the 1. above, you read the prepare status without checking for ch_mask first.
>

What would be the purpose of checking ch_mask before the read?

>>>> +        /* Wait for completion on port ready */
>>>> +        port_ready = &s_rt->slave->port_ready[p_rt->num];
>>>> +        wait_for_completion_timeout(port_ready, msecs_to_jiffies(ch_prep_timeout));
>>>
>>> I understand the code is the same as before but would there be any merit in checking the timeout before starting a read? If the device is already in the weeds, doing another read adds even more time before reporting an error.
>>>
>> Do you mean save the system time when the DPN_PREPARE was written to
>> that peripheral and then check here whether the timeout period has
>> already elapsed?
> 
> I meant testing the return value of wait_for_completion_timeout(). If you already timed out at this point with a return value of zero, there's no point in checking the status any more, the system is in the weeds.
> 

Wait completion will _always_ timeout because this code is holding the
bus lock, which blocks the ALERT handler from running and signalling
the completion. The wait_for_completion_timeout() is effectively
msleep(msecs_to_jiffies(ch_prep_timeout));
So we have to read the register afterwards to see whether the peripheral
actually prepared.

I've left the useless wait_for_completion_timeout() in the code so this
commit is only changing what it says it is changing, and nothing else.

What to do about the deadlocked wait_for_completion_timeout() is a
separate problem.

>> If that's what you mean, I don't see much advantage in that. If the
>> hardware is working correctly, this will be detected by the read above
>> that checks if the peripheral has already prepared. If it has we skip
>> the wait_for_completion_timeout().
>>
>> If the peripheral is "in the weeds", so that its prepare time has
>> already passed and it still isn't ready, we're no longer in a state
>> where we care about minimizing audio startup time because the hardware
>> is now broken. So it's probably not worth complicating the code to
>> take a few milliseconds off that case.
> 
> I agree it's no longer about minimizing the start time but rather providing an error faster, without waiting for a second timeout on read.
>   
>>>> +        val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>>>> +        if ((val < 0) || (val & p_rt->ch_mask)) {
>>>> +            ret = (val < 0) ? val : -ETIMEDOUT;
>>>> +            goto err;
>>>> +        }
>>>> +    }
>>> T
>>>
>>
>

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Pierre-Louis Bossart 1 month, 2 weeks ago

On 12/10/25 10:59, Richard Fitzgerald wrote:
> On 9/12/25 16:41, Pierre-Louis Bossart wrote:
>>
>>>>> Changes in V2:
>>>>> +    if (simple_ch_prep_sm)
>>>>> +        return 0;
>>>>> +
>>>>> +    /*
>>>>> +     * Check if already prepared. Avoid overhead of waiting for interrupt
>>>>> +     * and port_ready completion if we don't need to.
>>>>> +     */
>>
>> 1.
>>
>>>>> +    val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>>>>> +    if (val < 0) {
>>>>> +        ret = val;
>>>>> +        goto err;
>>>>> +    }
>>>>> +
>>>>> +    if (val & p_rt->ch_mask) {
>>>>
>>>> Can you explain why we don't use the ch_mask in the already-prepared case? I am missing something.
>>>>
>>> I'm not sure what you mean here. The if() immediately above your comment
>>> uses ch_mask to check the already-prepared state.
>>
>> I was referring to the 1. above, you read the prepare status without checking for ch_mask first.
>>
> 
> What would be the purpose of checking ch_mask before the read?

I don't know - why do we need to read it in the second case and not the first is all I am asking.
 
>>>>> +        /* Wait for completion on port ready */
>>>>> +        port_ready = &s_rt->slave->port_ready[p_rt->num];
>>>>> +        wait_for_completion_timeout(port_ready, msecs_to_jiffies(ch_prep_timeout));
>>>>
>>>> I understand the code is the same as before but would there be any merit in checking the timeout before starting a read? If the device is already in the weeds, doing another read adds even more time before reporting an error.
>>>>
>>> Do you mean save the system time when the DPN_PREPARE was written to
>>> that peripheral and then check here whether the timeout period has
>>> already elapsed?
>>
>> I meant testing the return value of wait_for_completion_timeout(). If you already timed out at this point with a return value of zero, there's no point in checking the status any more, the system is in the weeds.
>>
> 
> Wait completion will _always_ timeout because this code is holding the
> bus lock, which blocks the ALERT handler from running and signalling
> the completion. The wait_for_completion_timeout() is effectively
> msleep(msecs_to_jiffies(ch_prep_timeout));
> So we have to read the register afterwards to see whether the peripheral
> actually prepared.
> 
> I've left the useless wait_for_completion_timeout() in the code so this
> commit is only changing what it says it is changing, and nothing else.
> 
> What to do about the deadlocked wait_for_completion_timeout() is a
> separate problem.

Humm, what happens if you have a single peripheral, does this result in an increase of the prepare time all the way to the timeout value?
I can see how preparing all ports in parallel would reduce the total time compared to a serial approach, even with a timeout, but if we end-up always timing out even in the case where there is a single device it'd be quite odd.

In other words does this patch reduce the start latency only in the case of multiple devices, but adds a 'tax' for all other cases?

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Richard Fitzgerald 1 month, 2 weeks ago

On 20/12/25 11:15, Pierre-Louis Bossart wrote:
> On 12/10/25 10:59, Richard Fitzgerald wrote:
>> On 9/12/25 16:41, Pierre-Louis Bossart wrote:
>>>
>>>>>> Changes in V2:
>>>>>> +    if (simple_ch_prep_sm)
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    /*
>>>>>> +     * Check if already prepared. Avoid overhead of waiting for interrupt
>>>>>> +     * and port_ready completion if we don't need to.
>>>>>> +     */
>>>
>>> 1.
>>>
>>>>>> +    val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>>>>>> +    if (val < 0) {
>>>>>> +        ret = val;
>>>>>> +        goto err;
>>>>>> +    }
>>>>>> +
>>>>>> +    if (val & p_rt->ch_mask) {
>>>>>
>>>>> Can you explain why we don't use the ch_mask in the already-prepared case? I am missing something.
>>>>>
>>>> I'm not sure what you mean here. The if() immediately above your comment
>>>> uses ch_mask to check the already-prepared state.
>>>
>>> I was referring to the 1. above, you read the prepare status without checking for ch_mask first.
>>>
>>
>> What would be the purpose of checking ch_mask before the read?
> 
> I don't know - why do we need to read it in the second case and not the first is all I am asking.
>

What the code is doing is:

1. Read the current prepare status
2. Check if any channels we prepared are still reporting NOT_PREPARED
3. If there are any NOT_PREPARED channels we need to do the wait, else
    we can skip it

>>>>>> +        /* Wait for completion on port ready */
>>>>>> +        port_ready = &s_rt->slave->port_ready[p_rt->num];
>>>>>> +        wait_for_completion_timeout(port_ready, msecs_to_jiffies(ch_prep_timeout));
>>>>>
>>>>> I understand the code is the same as before but would there be any merit in checking the timeout before starting a read? If the device is already in the weeds, doing another read adds even more time before reporting an error.
>>>>>
>>>> Do you mean save the system time when the DPN_PREPARE was written to
>>>> that peripheral and then check here whether the timeout period has
>>>> already elapsed?
>>>
>>> I meant testing the return value of wait_for_completion_timeout(). If you already timed out at this point with a return value of zero, there's no point in checking the status any more, the system is in the weeds.
>>>
>>
>> Wait completion will _always_ timeout because this code is holding the
>> bus lock, which blocks the ALERT handler from running and signalling
>> the completion. The wait_for_completion_timeout() is effectively
>> msleep(msecs_to_jiffies(ch_prep_timeout));
>> So we have to read the register afterwards to see whether the peripheral
>> actually prepared.
>>
>> I've left the useless wait_for_completion_timeout() in the code so this
>> commit is only changing what it says it is changing, and nothing else.
>>
>> What to do about the deadlocked wait_for_completion_timeout() is a
>> separate problem.
> 
> Humm, what happens if you have a single peripheral, does this result in an increase of the prepare time all the way to the timeout value?
> I can see how preparing all ports in parallel would reduce the total time compared to a serial approach, even with a timeout, but if we end-up always timing out even in the case where there is a single device it'd be quite odd.

Yes, but that's a separate problem that needs fixing. The current code
in the kernel does the timeout for every amp. So if you have 4 amps with
a timeout of 10ms it takes >40ms to do the port prepare.

> In other words does this patch reduce the start latency only in the case of multiple devices, but adds a 'tax' for all other cases?

This only affects peripherals that use the full CP_SM (not simplified).

If you have only 1 peripheral there is either no change or a possibility
of skipping the wait. Before this change it would always wait the full
timeout before noticing that it is ready.

After this change, you might be able to skip the wait. If it prepared
immediately, so that it is already reporting prepared when the first
read tests for that, it will skip the wait.
(BTW I see that happening with CS35L56. It prepares so fast that it
is already done, without waiting for any of the timeouts.)

If it isn't ready that early, you get the timeout as before.

It isn't adding any penalty to single devices, unless you count the
trivial time it takes to do the register read, which is negligible
compared to getting deadlocked in the wait_for_completion_timeout().

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Pierre-Louis Bossart 1 month, 2 weeks ago

>>>>>>> +    val = sdw_read_no_pm(s_rt->slave, SDW_DPN_PREPARESTATUS(p_rt->num));
>>>>>>> +    if (val < 0) {
>>>>>>> +        ret = val;
>>>>>>> +        goto err;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    if (val & p_rt->ch_mask) {
>>>>>>
>>>>>> Can you explain why we don't use the ch_mask in the already-prepared case? I am missing something.
>>>>>>
>>>>> I'm not sure what you mean here. The if() immediately above your comment
>>>>> uses ch_mask to check the already-prepared state.
>>>>
>>>> I was referring to the 1. above, you read the prepare status without checking for ch_mask first.
>>>>
>>>
>>> What would be the purpose of checking ch_mask before the read?
>>
>> I don't know - why do we need to read it in the second case and not the first is all I am asking.
>>
> 
> What the code is doing is:
> 
> 1. Read the current prepare status
> 2. Check if any channels we prepared are still reporting NOT_PREPARED
> 3. If there are any NOT_PREPARED channels we need to do the wait, else
>    we can skip it

ok, sorry for being thick. The code is just fine, I misread it with the test split in two.

>>>>>>> +        /* Wait for completion on port ready */
>>>>>>> +        port_ready = &s_rt->slave->port_ready[p_rt->num];
>>>>>>> +        wait_for_completion_timeout(port_ready, msecs_to_jiffies(ch_prep_timeout));
>>>>>>
>>>>>> I understand the code is the same as before but would there be any merit in checking the timeout before starting a read? If the device is already in the weeds, doing another read adds even more time before reporting an error.
>>>>>>
>>>>> Do you mean save the system time when the DPN_PREPARE was written to
>>>>> that peripheral and then check here whether the timeout period has
>>>>> already elapsed?
>>>>
>>>> I meant testing the return value of wait_for_completion_timeout(). If you already timed out at this point with a return value of zero, there's no point in checking the status any more, the system is in the weeds.
>>>>
>>>
>>> Wait completion will _always_ timeout because this code is holding the
>>> bus lock, which blocks the ALERT handler from running and signalling
>>> the completion. The wait_for_completion_timeout() is effectively
>>> msleep(msecs_to_jiffies(ch_prep_timeout));
>>> So we have to read the register afterwards to see whether the peripheral
>>> actually prepared.
>>>
>>> I've left the useless wait_for_completion_timeout() in the code so this
>>> commit is only changing what it says it is changing, and nothing else.
>>>
>>> What to do about the deadlocked wait_for_completion_timeout() is a
>>> separate problem.
>>
>> Humm, what happens if you have a single peripheral, does this result in an increase of the prepare time all the way to the timeout value?
>> I can see how preparing all ports in parallel would reduce the total time compared to a serial approach, even with a timeout, but if we end-up always timing out even in the case where there is a single device it'd be quite odd.
> 
> Yes, but that's a separate problem that needs fixing. The current code
> in the kernel does the timeout for every amp. So if you have 4 amps with
> a timeout of 10ms it takes >40ms to do the port prepare.

so is this fair to say that by not testing the return value of wait_for_completion_timeout(), we never detected the 'always-timeout' behavior since the initial contribution circa 2018, even with a single device per link? 

Wow, that'd be a big 7-year old conceptual miss...
 
>> In other words does this patch reduce the start latency only in the case of multiple devices, but adds a 'tax' for all other cases?
> 
> This only affects peripherals that use the full CP_SM (not simplified).
> 
> If you have only 1 peripheral there is either no change or a possibility
> of skipping the wait. Before this change it would always wait the full
> timeout before noticing that it is ready.
> 
> After this change, you might be able to skip the wait. If it prepared
> immediately, so that it is already reporting prepared when the first
> read tests for that, it will skip the wait.
> (BTW I see that happening with CS35L56. It prepares so fast that it
> is already done, without waiting for any of the timeouts.)
> 
> If it isn't ready that early, you get the timeout as before.
> 
> It isn't adding any penalty to single devices, unless you count the
> trivial time it takes to do the register read, which is negligible
> compared to getting deadlocked in the wait_for_completion_timeout().

ok, now I get the point that the use of sdw_acquire_bus_lock() in sdw_prepare_stream() prevents the use of device-initiated interrupts.

One possible change is a departure from the use of the PORT_READY alert. This was meant to be a worst-case scenario for cases where the prepare time is in hundreds of ms - IIRC we thought devices would need to adjust their internal clock to that of the bus, which might change during a prepare operation. But as you mentioned it in practice devices are prepared much faster. So we could have a typical loop with multiple read/sleep instead. See for example cdns_clear_bit() as an example of what I have in mind, classic kernel code really.

That might speed-up the prepare time even for the single device case - and then that may be enough to avoid the deferred check on PORT_READY and make the code simpler for multiple devices as well.

At any rate you're onto something, thanks for submitting this and bearing with my questions.

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Richard Fitzgerald 1 month, 2 weeks ago

On 23/12/25 10:29, Pierre-Louis Bossart wrote:
<SNIP>

>>> I can see how preparing all ports in parallel would reduce the total time compared to a serial approach, even with a timeout, but if we end-up always timing out even in the case where there is a single device it'd be quite odd.
>>
>> Yes, but that's a separate problem that needs fixing. The current code
>> in the kernel does the timeout for every amp. So if you have 4 amps with
>> a timeout of 10ms it takes >40ms to do the port prepare.
> 
> so is this fair to say that by not testing the return value of wait_for_completion_timeout(), we never detected the 'always-timeout' behavior since the initial contribution circa 2018, even with a single device per link?
> 
> Wow, that'd be a big 7-year old conceptual miss...

It's just  a design flaw in the code. It wasn't found because it was
tested against Realtek codecs, which use the simplified CP_SM so the
code can skip the Prepare stage.

The original code checked for timeout and then always failed, because it
always timed out. So none of the Cirrus amps and codecs worked. I sent
the "temporary" workaround of reading the status register after the
wait_for_completion() and ignoring the timeout if the device is now
reporting prepared, until I could fix it properly. I've only just got
around to looking at this again.

>>> In other words does this patch reduce the start latency only in the case of multiple devices, but adds a 'tax' for all other cases?
>>
>> This only affects peripherals that use the full CP_SM (not simplified).
>>
>> If you have only 1 peripheral there is either no change or a possibility
>> of skipping the wait. Before this change it would always wait the full
>> timeout before noticing that it is ready.
>>
>> After this change, you might be able to skip the wait. If it prepared
>> immediately, so that it is already reporting prepared when the first
>> read tests for that, it will skip the wait.
>> (BTW I see that happening with CS35L56. It prepares so fast that it
>> is already done, without waiting for any of the timeouts.)
>>
>> If it isn't ready that early, you get the timeout as before.
>>
>> It isn't adding any penalty to single devices, unless you count the
>> trivial time it takes to do the register read, which is negligible
>> compared to getting deadlocked in the wait_for_completion_timeout().
> 
> ok, now I get the point that the use of sdw_acquire_bus_lock() in sdw_prepare_stream() prevents the use of device-initiated interrupts.
> 
> One possible change is a departure from the use of the PORT_READY alert. This was meant to be a worst-case scenario for cases where the prepare time is in hundreds of ms - IIRC we thought devices would need to adjust their internal clock to that of the bus, which might change during a prepare operation. But as you mentioned it in practice devices are prepared much faster. So we could have a typical loop with multiple read/sleep instead. See for example cdns_clear_bit() as an example of what I have in mind, classic kernel code really.
> 
> That might speed-up the prepare time even for the single device case - and then that may be enough to avoid the deferred check on PORT_READY and make the code simpler for multiple devices as well.
>

I've got an experimental patch to replace the
wait_for_completion_timeout() with a polling loop. It's a brute-force
approach but reduces the overhead without the scary rewrite of the bus
locking.

> At any rate you're onto something, thanks for submitting this and bearing with my questions.

I need to send a new version of this patch anyway because it will
need rebasing onto 
https://lore.kernel.org/linux-sound/20251219173830.407-1-niranjan.hy@ti.com/T/#u
That one is fixing a problem so makes sense to take it first.

I'll improve the commenting in V2 to explain better what the code
is doing.

Re: [PATCH v3] soundwire: stream: Prepare ports in parallel to reduce stream start latency

Posted by Pierre-Louis Bossart 1 month, 2 weeks ago

On 12/23/25 11:47, Richard Fitzgerald wrote:
> On 23/12/25 10:29, Pierre-Louis Bossart wrote:
> <SNIP>
> 
>>>> I can see how preparing all ports in parallel would reduce the total time compared to a serial approach, even with a timeout, but if we end-up always timing out even in the case where there is a single device it'd be quite odd.
>>>
>>> Yes, but that's a separate problem that needs fixing. The current code
>>> in the kernel does the timeout for every amp. So if you have 4 amps with
>>> a timeout of 10ms it takes >40ms to do the port prepare.
>>
>> so is this fair to say that by not testing the return value of wait_for_completion_timeout(), we never detected the 'always-timeout' behavior since the initial contribution circa 2018, even with a single device per link?
>>
>> Wow, that'd be a big 7-year old conceptual miss...
> 
> It's just  a design flaw in the code. It wasn't found because it was
> tested against Realtek codecs, which use the simplified CP_SM so the
> code can skip the Prepare stage.

ok, thanks for the precision.

> The original code checked for timeout and then always failed, because it
> always timed out. So none of the Cirrus amps and codecs worked. I sent
> the "temporary" workaround of reading the status register after the
> wait_for_completion() and ignoring the timeout if the device is now
> reporting prepared, until I could fix it properly. I've only just got
> around to looking at this again.

ok, so this is a follow-up to commit 3d3e88e from 2021, where the timeout check was removed?
This should probably be mentioned in the commit description, with a clear explanation that the timeout happens in all cases.

I personally misunderstood that earlier commit, which states 

'The previous implementation would always fail if the
wait_for_completion() timed out, even if the port was reporting
successful prepare.
'

 
>>>> In other words does this patch reduce the start latency only in the case of multiple devices, but adds a 'tax' for all other cases?
>>>
>>> This only affects peripherals that use the full CP_SM (not simplified).
>>>
>>> If you have only 1 peripheral there is either no change or a possibility
>>> of skipping the wait. Before this change it would always wait the full
>>> timeout before noticing that it is ready.
>>>
>>> After this change, you might be able to skip the wait. If it prepared
>>> immediately, so that it is already reporting prepared when the first
>>> read tests for that, it will skip the wait.
>>> (BTW I see that happening with CS35L56. It prepares so fast that it
>>> is already done, without waiting for any of the timeouts.)
>>>
>>> If it isn't ready that early, you get the timeout as before.
>>>
>>> It isn't adding any penalty to single devices, unless you count the
>>> trivial time it takes to do the register read, which is negligible
>>> compared to getting deadlocked in the wait_for_completion_timeout().
>>
>> ok, now I get the point that the use of sdw_acquire_bus_lock() in sdw_prepare_stream() prevents the use of device-initiated interrupts.
>>
>> One possible change is a departure from the use of the PORT_READY alert. This was meant to be a worst-case scenario for cases where the prepare time is in hundreds of ms - IIRC we thought devices would need to adjust their internal clock to that of the bus, which might change during a prepare operation. But as you mentioned it in practice devices are prepared much faster. So we could have a typical loop with multiple read/sleep instead. See for example cdns_clear_bit() as an example of what I have in mind, classic kernel code really.
>>
>> That might speed-up the prepare time even for the single device case - and then that may be enough to avoid the deferred check on PORT_READY and make the code simpler for multiple devices as well.
>>
> 
> I've got an experimental patch to replace the
> wait_for_completion_timeout() with a polling loop. It's a brute-force
> approach but reduces the overhead without the scary rewrite of the bus
> locking.

sounds good, it's not that bad in terms of brute-forcing and certainly less bad than a broken design where an expected interrupt cannot be serviced by construction.

>> At any rate you're onto something, thanks for submitting this and bearing with my questions.
> 
> I need to send a new version of this patch anyway because it will
> need rebasing onto https://lore.kernel.org/linux-sound/20251219173830.407-1-niranjan.hy@ti.com/T/#u
> That one is fixing a problem so makes sense to take it first.
> 
> I'll improve the commenting in V2 to explain better what the code
> is doing.

I'd recommend the 'brute-force' approach, this wait_for_completion_timeout() makes no sense - and it'd remove the need for the extra read before the nonsensical wait.

Probably something for 2026 now ;-)