[v1] remoteproc: xlnx: remote crash recovery

[PATCH v2 3/3] remoteproc: xlnx: add crash detection mechanism

Posted by Tanmay Shah 2 months, 3 weeks ago

The remote processor will report the crash reason via the resource table
and notify the host via kick. The host checks this crash reason on
every kick notification from the remote and reports it to the core
framework. Then the rproc core framework will start the recovery
process.

Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
---

Changes in v2:
  - clear attach recovery boot flag during detach and stop ops

 drivers/remoteproc/xlnx_r5_remoteproc.c | 56 +++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c b/drivers/remoteproc/xlnx_r5_remoteproc.c
index 8677b732ad14..5d04e8c0dc52 100644
--- a/drivers/remoteproc/xlnx_r5_remoteproc.c
+++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
@@ -108,6 +108,10 @@ struct rsc_tbl_data {
 	const uintptr_t rsc_tbl;
 } __packed;
 
+enum fw_vendor_rsc {
+	FW_RSC_VENDOR_CRASH_REASON = RSC_VENDOR_START,
+};
+
 /*
  * Hardcoded TCM bank values. This will stay in driver to maintain backward
  * compatibility with device-tree that does not have TCM information.
@@ -127,9 +131,21 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
 	{0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
 };
 
+/**
+ * struct xlnx_rproc_crash_report - resource to know crash status and reason
+ *
+ * @crash_state: if true, the rproc is notifying crash, time to recover
+ * @crash_reason: reason of crash
+ */
+struct xlnx_rproc_crash_report {
+	u32 crash_state;
+	u32 crash_reason;
+} __packed;
+
 /**
  * struct zynqmp_r5_core - remoteproc core's internal data
  *
+ * @crash_report: rproc crash state and reason
  * @rsc_tbl_va: resource table virtual address
  * @sram: Array of sram memories assigned to this core
  * @num_sram: number of sram for this core
@@ -143,6 +159,7 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
  * @ipi: pointer to mailbox information
  */
 struct zynqmp_r5_core {
+	struct xlnx_rproc_crash_report *crash_report;
 	void __iomem *rsc_tbl_va;
 	struct zynqmp_sram_bank *sram;
 	int num_sram;
@@ -227,10 +244,14 @@ static void handle_event_notified(struct work_struct *work)
 static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
 {
 	struct zynqmp_ipi_message *ipi_msg, *buf_msg;
+	struct zynqmp_r5_core *r5_core;
+	struct rproc *rproc;
 	struct mbox_info *ipi;
 	size_t len;
 
 	ipi = container_of(cl, struct mbox_info, mbox_cl);
+	r5_core = ipi->r5_core;
+	rproc = r5_core->rproc;
 
 	/* copy data from ipi buffer to r5_core */
 	ipi_msg = (struct zynqmp_ipi_message *)msg;
@@ -244,6 +265,13 @@ static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
 	buf_msg->len = len;
 	memcpy(buf_msg->data, ipi_msg->data, len);
 
+	/* Check for crash only if rproc crash is expected */
+	if (rproc->state == RPROC_ATTACHED || rproc->state == RPROC_RUNNING) {
+		if (r5_core->crash_report->crash_state)
+			rproc_report_crash(rproc,
+					   r5_core->crash_report->crash_reason);
+	}
+
 	/* received and processed interrupt ack */
 	if (mbox_send_message(ipi->rx_chan, NULL) < 0)
 		dev_err(cl->dev, "ack failed to mbox rx_chan\n");
@@ -397,6 +425,7 @@ static int zynqmp_r5_rproc_start(struct rproc *rproc)
 	if (ret)
 		dev_err(r5_core->dev,
 			"failed to start RPU = 0x%x\n", r5_core->pm_domain_id);
+
 	return ret;
 }
 
@@ -438,6 +467,8 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
 	if (ret)
 		dev_err(r5_core->dev, "core force power down failed\n");
 
+	test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
+
 	return ret;
 }
 
@@ -874,6 +905,8 @@ static int zynqmp_r5_get_rsc_table_va(struct zynqmp_r5_core *r5_core)
 
 static int zynqmp_r5_attach(struct rproc *rproc)
 {
+	rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
+
 	dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
 
 	return 0;
@@ -888,6 +921,8 @@ static int zynqmp_r5_detach(struct rproc *rproc)
 	 */
 	zynqmp_r5_rproc_kick(rproc, 0);
 
+	clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
+
 	return 0;
 }
 
@@ -896,6 +931,26 @@ static void zynqmp_r5_coredump(struct rproc *rproc)
 	(void)rproc;
 }
 
+static int zynqmp_r5_handle_crash_rsc(struct rproc *rproc, void *rsc,
+				      int offset, int avail)
+{
+	struct zynqmp_r5_core *r5_core = rproc->priv;
+
+	r5_core->crash_report =
+		(struct xlnx_rproc_crash_report *)(r5_core->rsc_tbl_va + offset);
+
+	return RSC_HANDLED;
+}
+
+static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc,
+				int offset, int avail)
+{
+	if (rsc_type == FW_RSC_VENDOR_CRASH_REASON)
+		return zynqmp_r5_handle_crash_rsc(rproc, rsc, offset, avail);
+
+	return RSC_IGNORED;
+}
+
 static const struct rproc_ops zynqmp_r5_rproc_ops = {
 	.prepare	= zynqmp_r5_rproc_prepare,
 	.unprepare	= zynqmp_r5_rproc_unprepare,
@@ -911,6 +966,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
 	.attach		= zynqmp_r5_attach,
 	.detach		= zynqmp_r5_detach,
 	.coredump	= zynqmp_r5_coredump,
+	.handle_rsc	= zynqmp_r5_handle_rsc,
 };
 
 /**
-- 
2.34.1

Re: [PATCH v2 3/3] remoteproc: xlnx: add crash detection mechanism

Posted by Mathieu Poirier 2 months, 2 weeks ago

On Thu, Nov 13, 2025 at 07:44:04AM -0800, Tanmay Shah wrote:
> Remote processor will report the crash reason via the resource table
> and notify the host via kick. The host checks this crash reason on
> every kick notification from the remote and report to the core
> framework. Then the rproc core framework will start the recovery
> process.

Please substitute the word "kick" for "mailbox notification".  I also have to
assume "core framework" and "rproc core framework" are the same.  Pick one and
stick with it.

> 
> Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
> ---
> 
> Changes in v2:
>   - clear attach recovery boot flag during detach and stop ops
> 
>  drivers/remoteproc/xlnx_r5_remoteproc.c | 56 +++++++++++++++++++++++++
>  1 file changed, 56 insertions(+)
> 
> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c b/drivers/remoteproc/xlnx_r5_remoteproc.c
> index 8677b732ad14..5d04e8c0dc52 100644
> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
> @@ -108,6 +108,10 @@ struct rsc_tbl_data {
>  	const uintptr_t rsc_tbl;
>  } __packed;
>  
> +enum fw_vendor_rsc {
> +	FW_RSC_VENDOR_CRASH_REASON = RSC_VENDOR_START,
> +};
> +
>  /*
>   * Hardcoded TCM bank values. This will stay in driver to maintain backward
>   * compatibility with device-tree that does not have TCM information.
> @@ -127,9 +131,21 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
>  	{0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
>  };
>  
> +/**
> + * struct xlnx_rproc_crash_report - resource to know crash status and reason
> + *
> + * @crash_state: if true, the rproc is notifying crash, time to recover
> + * @crash_reason: reason of crash
> + */
> +struct xlnx_rproc_crash_report {
> +	u32 crash_state;
> +	u32 crash_reason;
> +} __packed;
> +
>  /**
>   * struct zynqmp_r5_core - remoteproc core's internal data
>   *
> + * @crash_report: rproc crash state and reason
>   * @rsc_tbl_va: resource table virtual address
>   * @sram: Array of sram memories assigned to this core
>   * @num_sram: number of sram for this core
> @@ -143,6 +159,7 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
>   * @ipi: pointer to mailbox information
>   */
>  struct zynqmp_r5_core {
> +	struct xlnx_rproc_crash_report *crash_report;
>  	void __iomem *rsc_tbl_va;
>  	struct zynqmp_sram_bank *sram;
>  	int num_sram;
> @@ -227,10 +244,14 @@ static void handle_event_notified(struct work_struct *work)
>  static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
>  {
>  	struct zynqmp_ipi_message *ipi_msg, *buf_msg;
> +	struct zynqmp_r5_core *r5_core;
> +	struct rproc *rproc;
>  	struct mbox_info *ipi;
>  	size_t len;
>  
>  	ipi = container_of(cl, struct mbox_info, mbox_cl);
> +	r5_core = ipi->r5_core;
> +	rproc = r5_core->rproc;
>  
>  	/* copy data from ipi buffer to r5_core */
>  	ipi_msg = (struct zynqmp_ipi_message *)msg;
> @@ -244,6 +265,13 @@ static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
>  	buf_msg->len = len;
>  	memcpy(buf_msg->data, ipi_msg->data, len);
>  
> +	/* Check for crash only if rproc crash is expected */
> +	if (rproc->state == RPROC_ATTACHED || rproc->state == RPROC_RUNNING) {
> +		if (r5_core->crash_report->crash_state)
> +			rproc_report_crash(rproc,
> +					   r5_core->crash_report->crash_reason);

At this stage ->crash_state indicates that a crash occurred, but how is it reset
once the crash has been handled?  How do we make sure the next mailbox
notification won't trigger another crash report?

> +	}
> +
>  	/* received and processed interrupt ack */
>  	if (mbox_send_message(ipi->rx_chan, NULL) < 0)
>  		dev_err(cl->dev, "ack failed to mbox rx_chan\n");
> @@ -397,6 +425,7 @@ static int zynqmp_r5_rproc_start(struct rproc *rproc)
>  	if (ret)
>  		dev_err(r5_core->dev,
>  			"failed to start RPU = 0x%x\n", r5_core->pm_domain_id);
> +

Spurious change

>  	return ret;
>  }
>  
> @@ -438,6 +467,8 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
>  	if (ret)
>  		dev_err(r5_core->dev, "core force power down failed\n");
>  
> +	test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
> +
>  	return ret;
>  }
>  
> @@ -874,6 +905,8 @@ static int zynqmp_r5_get_rsc_table_va(struct zynqmp_r5_core *r5_core)
>  
>  static int zynqmp_r5_attach(struct rproc *rproc)
>  {
> +	rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
> +

Why can't this be set in probe() and left alone from thereon?

>  	dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
>  
>  	return 0;
> @@ -888,6 +921,8 @@ static int zynqmp_r5_detach(struct rproc *rproc)
>  	 */
>  	zynqmp_r5_rproc_kick(rproc, 0);
>  
> +	clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
> +

I'm not sure why this needs to be done, same comment for zynqmp_r5_rproc_stop().

>  	return 0;
>  }
>  
> @@ -896,6 +931,26 @@ static void zynqmp_r5_coredump(struct rproc *rproc)
>  	(void)rproc;
>  }
>  
> +static int zynqmp_r5_handle_crash_rsc(struct rproc *rproc, void *rsc,
> +				      int offset, int avail)
> +{
> +	struct zynqmp_r5_core *r5_core = rproc->priv;
> +
> +	r5_core->crash_report =
> +		(struct xlnx_rproc_crash_report *)(r5_core->rsc_tbl_va + offset);
> +

This function is so simple that I would fold it in zynqmp_r5_handle_rsc() below.

Thanks,
Mathieu

> +	return RSC_HANDLED;
> +}
> +
> +static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc,
> +				int offset, int avail)
> +{
> +	if (rsc_type == FW_RSC_VENDOR_CRASH_REASON)
> +		return zynqmp_r5_handle_crash_rsc(rproc, rsc, offset, avail);
> +
> +	return RSC_IGNORED;
> +}
> +
>  static const struct rproc_ops zynqmp_r5_rproc_ops = {
>  	.prepare	= zynqmp_r5_rproc_prepare,
>  	.unprepare	= zynqmp_r5_rproc_unprepare,
> @@ -911,6 +966,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
>  	.attach		= zynqmp_r5_attach,
>  	.detach		= zynqmp_r5_detach,
>  	.coredump	= zynqmp_r5_coredump,
> +	.handle_rsc	= zynqmp_r5_handle_rsc,
>  };
>  
>  /**
> -- 
> 2.34.1
>

Re: [PATCH v2 3/3] remoteproc: xlnx: add crash detection mechanism

Posted by Tanmay Shah 2 months, 1 week ago


On 11/21/25 9:37 AM, Mathieu Poirier wrote:
> On Thu, Nov 13, 2025 at 07:44:04AM -0800, Tanmay Shah wrote:
>> Remote processor will report the crash reason via the resource table
>> and notify the host via kick. The host checks this crash reason on
>> every kick notification from the remote and report to the core
>> framework. Then the rproc core framework will start the recovery
>> process.
> 
> Please substitute the word "kick" for "mailbox notification".  I also have to
> assume "core framework" and "rproc core framework" are the same.  Pick one and
> stick with it.
> 

Ack.

>>
>> Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
>> ---
>>
>> Changes in v2:
>>    - clear attach recovery boot flag during detach and stop ops
>>
>>   drivers/remoteproc/xlnx_r5_remoteproc.c | 56 +++++++++++++++++++++++++
>>   1 file changed, 56 insertions(+)
>>
>> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> index 8677b732ad14..5d04e8c0dc52 100644
>> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
>> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> @@ -108,6 +108,10 @@ struct rsc_tbl_data {
>>   	const uintptr_t rsc_tbl;
>>   } __packed;
>>   
>> +enum fw_vendor_rsc {
>> +	FW_RSC_VENDOR_CRASH_REASON = RSC_VENDOR_START,
>> +};
>> +
>>   /*
>>    * Hardcoded TCM bank values. This will stay in driver to maintain backward
>>    * compatibility with device-tree that does not have TCM information.
>> @@ -127,9 +131,21 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
>>   	{0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
>>   };
>>   
>> +/**
>> + * struct xlnx_rproc_crash_report - resource to know crash status and reason
>> + *
>> + * @crash_state: if true, the rproc is notifying crash, time to recover
>> + * @crash_reason: reason of crash
>> + */
>> +struct xlnx_rproc_crash_report {
>> +	u32 crash_state;
>> +	u32 crash_reason;
>> +} __packed;
>> +
>>   /**
>>    * struct zynqmp_r5_core - remoteproc core's internal data
>>    *
>> + * @crash_report: rproc crash state and reason
>>    * @rsc_tbl_va: resource table virtual address
>>    * @sram: Array of sram memories assigned to this core
>>    * @num_sram: number of sram for this core
>> @@ -143,6 +159,7 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
>>    * @ipi: pointer to mailbox information
>>    */
>>   struct zynqmp_r5_core {
>> +	struct xlnx_rproc_crash_report *crash_report;
>>   	void __iomem *rsc_tbl_va;
>>   	struct zynqmp_sram_bank *sram;
>>   	int num_sram;
>> @@ -227,10 +244,14 @@ static void handle_event_notified(struct work_struct *work)
>>   static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
>>   {
>>   	struct zynqmp_ipi_message *ipi_msg, *buf_msg;
>> +	struct zynqmp_r5_core *r5_core;
>> +	struct rproc *rproc;
>>   	struct mbox_info *ipi;
>>   	size_t len;
>>   
>>   	ipi = container_of(cl, struct mbox_info, mbox_cl);
>> +	r5_core = ipi->r5_core;
>> +	rproc = r5_core->rproc;
>>   
>>   	/* copy data from ipi buffer to r5_core */
>>   	ipi_msg = (struct zynqmp_ipi_message *)msg;
>> @@ -244,6 +265,13 @@ static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
>>   	buf_msg->len = len;
>>   	memcpy(buf_msg->data, ipi_msg->data, len);
>>   
>> +	/* Check for crash only if rproc crash is expected */
>> +	if (rproc->state == RPROC_ATTACHED || rproc->state == RPROC_RUNNING) {
>> +		if (r5_core->crash_report->crash_state)
>> +			rproc_report_crash(rproc,
>> +					   r5_core->crash_report->crash_reason);
> 
> At this stage ->crash_state indicates that a crash occured, but how is it reset
> once the crash has been handle?  How do we make sure the next mailbox
> notification won't trigger another crash report?
> 

I was counting on the remote firmware to reset the crash_state once it
reboots, before sending the next mailbox notification.

If that's not the best idea, I can reset the crash_state field at the end
of the start() or attach() callback. That will indicate that the remote
firmware has started successfully.

>> +	}
>> +
>>   	/* received and processed interrupt ack */
>>   	if (mbox_send_message(ipi->rx_chan, NULL) < 0)
>>   		dev_err(cl->dev, "ack failed to mbox rx_chan\n");
>> @@ -397,6 +425,7 @@ static int zynqmp_r5_rproc_start(struct rproc *rproc)
>>   	if (ret)
>>   		dev_err(r5_core->dev,
>>   			"failed to start RPU = 0x%x\n", r5_core->pm_domain_id);
>> +
> 
> Spurious change
> 

Ack, will remove it.

>>   	return ret;
>>   }
>>   
>> @@ -438,6 +467,8 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
>>   	if (ret)
>>   		dev_err(r5_core->dev, "core force power down failed\n");
>>   
>> +	test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
>> +
>>   	return ret;
>>   }
>>   
>> @@ -874,6 +905,8 @@ static int zynqmp_r5_get_rsc_table_va(struct zynqmp_r5_core *r5_core)
>>   
>>   static int zynqmp_r5_attach(struct rproc *rproc)
>>   {
>> +	rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
>> +
> 
> Why can't this be set in probe() and left alone from thereon?
> 

Right now no specific reason. But I wanted to enable recovery only if 
attach() callback is successful. If execution fails anytime before that, 
then no point in enabling it.

>>   	dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
>>   
>>   	return 0;
>> @@ -888,6 +921,8 @@ static int zynqmp_r5_detach(struct rproc *rproc)
>>   	 */
>>   	zynqmp_r5_rproc_kick(rproc, 0);
>>   
>> +	clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
>> +
> 
> I'm not sure why this needs to be done, same comment for zynqmp_r5_rproc_stop().
> 

I think for detach() maybe it's not needed. I added it as a cleanup
sequence, i.e. the reverse of what's done in the attach() callback.

For stop it is needed in the following case:

attach() -> stop () -> load fw () -> start ().

In this sequence we need to make sure that if recovery is requested 
after start(), then we execute "boot recovery" and not "attach recovery".


Thanks,
Tanmay



>>   	return 0;
>>   }
>>   
>> @@ -896,6 +931,26 @@ static void zynqmp_r5_coredump(struct rproc *rproc)
>>   	(void)rproc;
>>   }
>>   
>> +static int zynqmp_r5_handle_crash_rsc(struct rproc *rproc, void *rsc,
>> +				      int offset, int avail)
>> +{
>> +	struct zynqmp_r5_core *r5_core = rproc->priv;
>> +
>> +	r5_core->crash_report =
>> +		(struct xlnx_rproc_crash_report *)(r5_core->rsc_tbl_va + offset);
>> +
> 
> This function is so simple that I would fold it in zynqmp_r5_handle_rsc() below.
> 

Ack.

> Thanks,
> Mathieu
> 
>> +	return RSC_HANDLED;
>> +}
>> +
>> +static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc,
>> +				int offset, int avail)
>> +{
>> +	if (rsc_type == FW_RSC_VENDOR_CRASH_REASON)
>> +		return zynqmp_r5_handle_crash_rsc(rproc, rsc, offset, avail);
>> +
>> +	return RSC_IGNORED;
>> +}
>> +
>>   static const struct rproc_ops zynqmp_r5_rproc_ops = {
>>   	.prepare	= zynqmp_r5_rproc_prepare,
>>   	.unprepare	= zynqmp_r5_rproc_unprepare,
>> @@ -911,6 +966,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
>>   	.attach		= zynqmp_r5_attach,
>>   	.detach		= zynqmp_r5_detach,
>>   	.coredump	= zynqmp_r5_coredump,
>> +	.handle_rsc	= zynqmp_r5_handle_rsc,
>>   };
>>   
>>   /**
>> -- 
>> 2.34.1
>>

Re: [PATCH v2 3/3] remoteproc: xlnx: add crash detection mechanism

Posted by Mathieu Poirier 2 months, 1 week ago

On Mon, 1 Dec 2025 at 22:04, Tanmay Shah <tanmay.shah@amd.com> wrote:
>
>
>
> On 11/21/25 9:37 AM, Mathieu Poirier wrote:
> > On Thu, Nov 13, 2025 at 07:44:04AM -0800, Tanmay Shah wrote:
> >> Remote processor will report the crash reason via the resource table
> >> and notify the host via kick. The host checks this crash reason on
> >> every kick notification from the remote and report to the core
> >> framework. Then the rproc core framework will start the recovery
> >> process.
> >
> > Please substitute the word "kick" for "mailbox notification".  I also have to
> > assume "core framework" and "rproc core framework" are the same.  Pick one and
> > stick with it.
> >
>
> Ack.
>
> >>
> >> Signed-off-by: Tanmay Shah <tanmay.shah@amd.com>
> >> ---
> >>
> >> Changes in v2:
> >>    - clear attach recovery boot flag during detach and stop ops
> >>
> >>   drivers/remoteproc/xlnx_r5_remoteproc.c | 56 +++++++++++++++++++++++++
> >>   1 file changed, 56 insertions(+)
> >>
> >> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c b/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> index 8677b732ad14..5d04e8c0dc52 100644
> >> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
> >> @@ -108,6 +108,10 @@ struct rsc_tbl_data {
> >>      const uintptr_t rsc_tbl;
> >>   } __packed;
> >>
> >> +enum fw_vendor_rsc {
> >> +    FW_RSC_VENDOR_CRASH_REASON = RSC_VENDOR_START,
> >> +};
> >> +
> >>   /*
> >>    * Hardcoded TCM bank values. This will stay in driver to maintain backward
> >>    * compatibility with device-tree that does not have TCM information.
> >> @@ -127,9 +131,21 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
> >>      {0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
> >>   };
> >>
> >> +/**
> >> + * struct xlnx_rproc_crash_report - resource to know crash status and reason
> >> + *
> >> + * @crash_state: if true, the rproc is notifying crash, time to recover
> >> + * @crash_reason: reason of crash
> >> + */
> >> +struct xlnx_rproc_crash_report {
> >> +    u32 crash_state;
> >> +    u32 crash_reason;
> >> +} __packed;
> >> +
> >>   /**
> >>    * struct zynqmp_r5_core - remoteproc core's internal data
> >>    *
> >> + * @crash_report: rproc crash state and reason
> >>    * @rsc_tbl_va: resource table virtual address
> >>    * @sram: Array of sram memories assigned to this core
> >>    * @num_sram: number of sram for this core
> >> @@ -143,6 +159,7 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
> >>    * @ipi: pointer to mailbox information
> >>    */
> >>   struct zynqmp_r5_core {
> >> +    struct xlnx_rproc_crash_report *crash_report;
> >>      void __iomem *rsc_tbl_va;
> >>      struct zynqmp_sram_bank *sram;
> >>      int num_sram;
> >> @@ -227,10 +244,14 @@ static void handle_event_notified(struct work_struct *work)
> >>   static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
> >>   {
> >>      struct zynqmp_ipi_message *ipi_msg, *buf_msg;
> >> +    struct zynqmp_r5_core *r5_core;
> >> +    struct rproc *rproc;
> >>      struct mbox_info *ipi;
> >>      size_t len;
> >>
> >>      ipi = container_of(cl, struct mbox_info, mbox_cl);
> >> +    r5_core = ipi->r5_core;
> >> +    rproc = r5_core->rproc;
> >>
> >>      /* copy data from ipi buffer to r5_core */
> >>      ipi_msg = (struct zynqmp_ipi_message *)msg;
> >> @@ -244,6 +265,13 @@ static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
> >>      buf_msg->len = len;
> >>      memcpy(buf_msg->data, ipi_msg->data, len);
> >>
> >> +    /* Check for crash only if rproc crash is expected */
> >> +    if (rproc->state == RPROC_ATTACHED || rproc->state == RPROC_RUNNING) {
> >> +            if (r5_core->crash_report->crash_state)
> >> +                    rproc_report_crash(rproc,
> >> +                                       r5_core->crash_report->crash_reason);
> >
> > At this stage ->crash_state indicates that a crash occured, but how is it reset
> > once the crash has been handle?  How do we make sure the next mailbox
> > notification won't trigger another crash report?
> >
>
> I was counting on the remote firmware to reset the crash_state once it
> reboots before sending the next mailbox notification.
>
> If it's not the best idea, I can reset the crash_state field in start()
> callback or attach() callback at the end. That will indicate that remote
> firmware has started successfully.

I think this is a better solution.  That way we don't rely on
something that may or may not happen.

>
> >> +    }
> >> +
> >>      /* received and processed interrupt ack */
> >>      if (mbox_send_message(ipi->rx_chan, NULL) < 0)
> >>              dev_err(cl->dev, "ack failed to mbox rx_chan\n");
> >> @@ -397,6 +425,7 @@ static int zynqmp_r5_rproc_start(struct rproc *rproc)
> >>      if (ret)
> >>              dev_err(r5_core->dev,
> >>                      "failed to start RPU = 0x%x\n", r5_core->pm_domain_id);
> >> +
> >
> > Spurious change
> >
>
> Ack will remove it.
>
> >>      return ret;
> >>   }
> >>
> >> @@ -438,6 +467,8 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
> >>      if (ret)
> >>              dev_err(r5_core->dev, "core force power down failed\n");
> >>
> >> +    test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
> >> +
> >>      return ret;
> >>   }
> >>
> >> @@ -874,6 +905,8 @@ static int zynqmp_r5_get_rsc_table_va(struct zynqmp_r5_core *r5_core)
> >>
> >>   static int zynqmp_r5_attach(struct rproc *rproc)
> >>   {
> >> +    rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
> >> +
> >
> > Why can't this be set in probe() and left alone from thereon?
> >
>
> Right now no specific reason. But I wanted to enable recovery only if
> attach() callback is successful. If execution fails anytime before that,
> then no point in enabling it.
>
> >>      dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
> >>
> >>      return 0;
> >> @@ -888,6 +921,8 @@ static int zynqmp_r5_detach(struct rproc *rproc)
> >>       */
> >>      zynqmp_r5_rproc_kick(rproc, 0);
> >>
> >> +    clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
> >> +
> >
> > I'm not sure why this needs to be done, same comment for zynqmp_r5_rproc_stop().
> >
>
> I think for detach() may be it's not needed. I added it as a cleanup
> sequence i.e. reverse of what's done in the attach() callback.
>
> For stop it is needed in the following case:
>
> attach() -> stop () -> load fw () -> start ().
>
> In this sequence we need to make sure that if recovery is requested
> after start(), then we execute "boot recovery" and not "attach recovery".
>

I think this is a valid reason, just make sure it is documented in the
code here and for _attach() above.

>
> Thanks,
> Tanmay
>
>
>
> >>      return 0;
> >>   }
> >>
> >> @@ -896,6 +931,26 @@ static void zynqmp_r5_coredump(struct rproc *rproc)
> >>      (void)rproc;
> >>   }
> >>
> >> +static int zynqmp_r5_handle_crash_rsc(struct rproc *rproc, void *rsc,
> >> +                                  int offset, int avail)
> >> +{
> >> +    struct zynqmp_r5_core *r5_core = rproc->priv;
> >> +
> >> +    r5_core->crash_report =
> >> +            (struct xlnx_rproc_crash_report *)(r5_core->rsc_tbl_va + offset);
> >> +
> >
> > This function is so simple that I would fold it in zynqmp_r5_handle_rsc() below.
> >
>
> Ack.
>
> > Thanks,
> > Mathieu
> >
> >> +    return RSC_HANDLED;
> >> +}
> >> +
> >> +static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc,
> >> +                            int offset, int avail)
> >> +{
> >> +    if (rsc_type == FW_RSC_VENDOR_CRASH_REASON)
> >> +            return zynqmp_r5_handle_crash_rsc(rproc, rsc, offset, avail);
> >> +
> >> +    return RSC_IGNORED;
> >> +}
> >> +
> >>   static const struct rproc_ops zynqmp_r5_rproc_ops = {
> >>      .prepare        = zynqmp_r5_rproc_prepare,
> >>      .unprepare      = zynqmp_r5_rproc_unprepare,
> >> @@ -911,6 +966,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
> >>      .attach         = zynqmp_r5_attach,
> >>      .detach         = zynqmp_r5_detach,
> >>      .coredump       = zynqmp_r5_coredump,
> >> +    .handle_rsc     = zynqmp_r5_handle_rsc,
> >>   };
> >>
> >>   /**
> >> --
> >> 2.34.1
> >>
>

[PATCH v2 1/3] remoteproc: xlnx: enable boot recovery
[PATCH v2 2/3] remoteproc: core: full attach detach during recovery
[PATCH v2 3/3] remoteproc: xlnx: add crash detection mechanism