[v1] remoteproc: xlnx: remote crash recovery

[PATCH v2 2/3] remoteproc: core: full attach detach during recovery

Posted by Tanmay Shah 2 months, 3 weeks ago

Current attach on recovery mechanism loads the clean resource table
during recovery, but doesn't re-allocate the resources. RPMsg
communication will fail after recovery due to this. Fix this
incorrect behavior by doing the full detach and attach of remote
processor during the recovery. This will load the clean resource table
and re-allocate all the resources, which will set up correct vring
information in the resource table.

Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
---

Changes in v2:
  - use rproc_boot instead of rproc_attach
  - move debug message early in the function

 drivers/remoteproc/remoteproc_core.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index aada2780b343..f65e8bc2d1e1 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1777,11 +1777,11 @@ static int rproc_attach_recovery(struct rproc *rproc)
 {
 	int ret;
 
-	ret = __rproc_detach(rproc);
+	ret = rproc_detach(rproc);
 	if (ret)
 		return ret;
 
-	return __rproc_attach(rproc);
+	return rproc_boot(rproc);
 }
 
 static int rproc_boot_recovery(struct rproc *rproc)
@@ -1829,6 +1829,11 @@ int rproc_trigger_recovery(struct rproc *rproc)
 	struct device *dev = &rproc->dev;
 	int ret;
 
+	dev_err(dev, "recovering %s\n", rproc->name);
+
+	if (rproc_has_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY))
+		return rproc_attach_recovery(rproc);
+
 	ret = mutex_lock_interruptible(&rproc->lock);
 	if (ret)
 		return ret;
@@ -1837,12 +1842,7 @@ int rproc_trigger_recovery(struct rproc *rproc)
 	if (rproc->state != RPROC_CRASHED)
 		goto unlock_mutex;
 
-	dev_err(dev, "recovering %s\n", rproc->name);
-
-	if (rproc_has_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY))
-		ret = rproc_attach_recovery(rproc);
-	else
-		ret = rproc_boot_recovery(rproc);
+	ret = rproc_boot_recovery(rproc);
 
 unlock_mutex:
 	mutex_unlock(&rproc->lock);
@@ -1860,6 +1860,7 @@ static void rproc_crash_handler_work(struct work_struct *work)
 {
 	struct rproc *rproc = container_of(work, struct rproc, crash_handler);
 	struct device *dev = &rproc->dev;
+	int ret;
 
 	dev_dbg(dev, "enter %s\n", __func__);
 
@@ -1883,8 +1884,11 @@ static void rproc_crash_handler_work(struct work_struct *work)
 
 	mutex_unlock(&rproc->lock);
 
-	if (!rproc->recovery_disabled)
-		rproc_trigger_recovery(rproc);
+	if (!rproc->recovery_disabled) {
+		ret = rproc_trigger_recovery(rproc);
+		if (ret)
+			dev_warn(dev, "rproc recovery failed, err %d\n", ret);
+	}
 
 out:
 	pm_relax(rproc->dev.parent);
@@ -2057,7 +2061,7 @@ int rproc_detach(struct rproc *rproc)
 		return ret;
 	}
 
-	if (rproc->state != RPROC_ATTACHED) {
+	if (rproc->state != RPROC_ATTACHED && rproc->state != RPROC_CRASHED) {
 		ret = -EINVAL;
 		goto out;
 	}
-- 
2.34.1

Re: [PATCH v2 2/3] remoteproc: core: full attach detach during recovery

Posted by Mathieu Poirier 2 months, 2 weeks ago

On Thu, Nov 13, 2025 at 07:44:03AM -0800, Tanmay Shah wrote:
> Current attach on recovery mechanism loads the clean resource table
> during recovery, but doesn't re-allocate the resources. RPMsg
> communication will fail after recovery due to this. Fix this
> incorrect behavior by doing the full detach and attach of remote
> processor during the recovery. This will load the clean resource table
> and re-allocate all the resources, which will set up correct vring
> information in the resource table.
> 
> Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
> ---
> 
> Changes in v2:
>   - use rproc_boot instead of rproc_attach
>   - move debug message early in the function
> 
>  drivers/remoteproc/remoteproc_core.c | 26 +++++++++++++++-----------
>  1 file changed, 15 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
> index aada2780b343..f65e8bc2d1e1 100644
> --- a/drivers/remoteproc/remoteproc_core.c
> +++ b/drivers/remoteproc/remoteproc_core.c
> @@ -1777,11 +1777,11 @@ static int rproc_attach_recovery(struct rproc *rproc)
>  {
>  	int ret;
>  
> -	ret = __rproc_detach(rproc);
> +	ret = rproc_detach(rproc);
>  	if (ret)
>  		return ret;
>  
> -	return __rproc_attach(rproc);
> +	return rproc_boot(rproc);
>  }
>  
>  static int rproc_boot_recovery(struct rproc *rproc)
> @@ -1829,6 +1829,11 @@ int rproc_trigger_recovery(struct rproc *rproc)
>  	struct device *dev = &rproc->dev;
>  	int ret;
>  
> +	dev_err(dev, "recovering %s\n", rproc->name);
> +
> +	if (rproc_has_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY))
> +		return rproc_attach_recovery(rproc);
> +

Humm... I find this a little messy.  Taking [1] as an example, I suggest moving
the "unlock_mutex" block to line 1846 and add mutex calls to
rproc_boot_recovery().  That way both rproc_attach_recovery() and
rproc_boot_recovery() are called the same way.

[1] https://elixir.bootlin.com/linux/v6.17.8/source/drivers/remoteproc/remoteproc_core.c#L1832

>  	ret = mutex_lock_interruptible(&rproc->lock);
>  	if (ret)
>  		return ret;
> @@ -1837,12 +1842,7 @@ int rproc_trigger_recovery(struct rproc *rproc)
>  	if (rproc->state != RPROC_CRASHED)
>  		goto unlock_mutex;
>  
> -	dev_err(dev, "recovering %s\n", rproc->name);
> -
> -	if (rproc_has_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY))
> -		ret = rproc_attach_recovery(rproc);
> -	else
> -		ret = rproc_boot_recovery(rproc);
> +	ret = rproc_boot_recovery(rproc);
>  
>  unlock_mutex:
>  	mutex_unlock(&rproc->lock);
> @@ -1860,6 +1860,7 @@ static void rproc_crash_handler_work(struct work_struct *work)
>  {
>  	struct rproc *rproc = container_of(work, struct rproc, crash_handler);
>  	struct device *dev = &rproc->dev;
> +	int ret;
>  
>  	dev_dbg(dev, "enter %s\n", __func__);
>  
> @@ -1883,8 +1884,11 @@ static void rproc_crash_handler_work(struct work_struct *work)
>  
>  	mutex_unlock(&rproc->lock);
>  
> -	if (!rproc->recovery_disabled)
> -		rproc_trigger_recovery(rproc);
> +	if (!rproc->recovery_disabled) {
> +		ret = rproc_trigger_recovery(rproc);
> +		if (ret)
> +			dev_warn(dev, "rproc recovery failed, err %d\n", ret);

I would prefer a patch on its own for this one.

I'm running out of time for today, I'll review patch 3/3 tomorrow.

Thanks,
Mathieu

> +	}
>  
>  out:
>  	pm_relax(rproc->dev.parent);
> @@ -2057,7 +2061,7 @@ int rproc_detach(struct rproc *rproc)
>  		return ret;
>  	}
>  
> -	if (rproc->state != RPROC_ATTACHED) {
> +	if (rproc->state != RPROC_ATTACHED && rproc->state != RPROC_CRASHED) {
>  		ret = -EINVAL;
>  		goto out;
>  	}
> -- 
> 2.34.1
>

Re: [PATCH v2 2/3] remoteproc: core: full attach detach during recovery

Posted by Tanmay Shah 2 months, 1 week ago


On 11/20/25 11:58 AM, Mathieu Poirier wrote:
> On Thu, Nov 13, 2025 at 07:44:03AM -0800, Tanmay Shah wrote:
>> Current attach on recovery mechanism loads the clean resource table
>> during recovery, but doesn't re-allocate the resources. RPMsg
>> communication will fail after recovery due to this. Fix this
>> incorrect behavior by doing the full detach and attach of remote
>> processor during the recovery. This will load the clean resource table
>> and re-allocate all the resources, which will set up correct vring
>> information in the resource table.
>>
>> Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
>> ---
>>
>> Changes in v2:
>>    - use rproc_boot instead of rproc_attach
>>    - move debug message early in the function
>>
>>   drivers/remoteproc/remoteproc_core.c | 26 +++++++++++++++-----------
>>   1 file changed, 15 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
>> index aada2780b343..f65e8bc2d1e1 100644
>> --- a/drivers/remoteproc/remoteproc_core.c
>> +++ b/drivers/remoteproc/remoteproc_core.c
>> @@ -1777,11 +1777,11 @@ static int rproc_attach_recovery(struct rproc *rproc)
>>   {
>>   	int ret;
>>   
>> -	ret = __rproc_detach(rproc);
>> +	ret = rproc_detach(rproc);
>>   	if (ret)
>>   		return ret;
>>   
>> -	return __rproc_attach(rproc);
>> +	return rproc_boot(rproc);
>>   }
>>   
>>   static int rproc_boot_recovery(struct rproc *rproc)
>> @@ -1829,6 +1829,11 @@ int rproc_trigger_recovery(struct rproc *rproc)
>>   	struct device *dev = &rproc->dev;
>>   	int ret;
>>   
>> +	dev_err(dev, "recovering %s\n", rproc->name);
>> +
>> +	if (rproc_has_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY))
>> +		return rproc_attach_recovery(rproc);
>> +
> 
> Humm... I find this a little messy.  Taking [1] as an example, I suggest moving
> the "unlock_mutex" block to line 1846 and add mutex calls to
> rproc_boot_recovery().  That way both rproc_attach_recovery() and
> rproc_boot_recovery() are called the same way.
> 
> [1] https://elixir.bootlin.com/linux/v6.17.8/source/drivers/remoteproc/remoteproc_core.c#L1832
> 

Sounds good. I will have to test it but I don't see problem with the 
suggestion made.

>>   	ret = mutex_lock_interruptible(&rproc->lock);
>>   	if (ret)
>>   		return ret;
>> @@ -1837,12 +1842,7 @@ int rproc_trigger_recovery(struct rproc *rproc)
>>   	if (rproc->state != RPROC_CRASHED)
>>   		goto unlock_mutex;
>>   
>> -	dev_err(dev, "recovering %s\n", rproc->name);
>> -
>> -	if (rproc_has_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY))
>> -		ret = rproc_attach_recovery(rproc);
>> -	else
>> -		ret = rproc_boot_recovery(rproc);
>> +	ret = rproc_boot_recovery(rproc);
>>   
>>   unlock_mutex:
>>   	mutex_unlock(&rproc->lock);
>> @@ -1860,6 +1860,7 @@ static void rproc_crash_handler_work(struct work_struct *work)
>>   {
>>   	struct rproc *rproc = container_of(work, struct rproc, crash_handler);
>>   	struct device *dev = &rproc->dev;
>> +	int ret;
>>   
>>   	dev_dbg(dev, "enter %s\n", __func__);
>>   
>> @@ -1883,8 +1884,11 @@ static void rproc_crash_handler_work(struct work_struct *work)
>>   
>>   	mutex_unlock(&rproc->lock);
>>   
>> -	if (!rproc->recovery_disabled)
>> -		rproc_trigger_recovery(rproc);
>> +	if (!rproc->recovery_disabled) {
>> +		ret = rproc_trigger_recovery(rproc);
>> +		if (ret)
>> +			dev_warn(dev, "rproc recovery failed, err %d\n", ret);
> 
> I would prefer a patch on its own for this one.
> 

Ack.

> I'm running out of time for today, I'll review patch 3/3 tomorrow.
> 
> Thanks,
> Mathieu
> 
>> +	}
>>   
>>   out:
>>   	pm_relax(rproc->dev.parent);
>> @@ -2057,7 +2061,7 @@ int rproc_detach(struct rproc *rproc)
>>   		return ret;
>>   	}
>>   
>> -	if (rproc->state != RPROC_ATTACHED) {
>> +	if (rproc->state != RPROC_ATTACHED && rproc->state != RPROC_CRASHED) {
>>   		ret = -EINVAL;
>>   		goto out;
>>   	}
>> -- 
>> 2.34.1
>>

[PATCH v2 1/3] remoteproc: xlnx: enable boot recovery
[PATCH v2 2/3] remoteproc: core: full attach detach during recovery
[PATCH v2 3/3] remoteproc: xlnx: add crash detection mechanism