[PATCH V1] accel/amdxdna: Fix NULL pointer dereference of mgmt_chann

Lizhi Hou posted 1 patch 1 month, 1 week ago
drivers/accel/amdxdna/aie2_message.c | 21 ++++++++++++++++-----
drivers/accel/amdxdna/aie2_pci.c     |  7 ++-----
drivers/accel/amdxdna/aie2_pci.h     |  1 +
3 files changed, 19 insertions(+), 10 deletions(-)
[PATCH V1] accel/amdxdna: Fix NULL pointer dereference of mgmt_chann
Posted by Lizhi Hou 1 month, 1 week ago
mgmt_chann may be set to NULL if the firmware returns an unexpected
error in aie2_send_mgmt_msg_wait(). This can later lead to a NULL
pointer dereference in aie2_hw_stop().

Fix this by introducing a dedicated helper to destroy mgmt_chann
and by adding proper NULL checks before accessing it.

Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
---
 drivers/accel/amdxdna/aie2_message.c | 21 ++++++++++++++++-----
 drivers/accel/amdxdna/aie2_pci.c     |  7 ++-----
 drivers/accel/amdxdna/aie2_pci.h     |  1 +
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index 277a27bce850..22e1a85a7ae0 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -40,11 +40,8 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev,
 		return -ENODEV;
 
 	ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg);
-	if (ret == -ETIME) {
-		xdna_mailbox_stop_channel(ndev->mgmt_chann);
-		xdna_mailbox_destroy_channel(ndev->mgmt_chann);
-		ndev->mgmt_chann = NULL;
-	}
+	if (ret == -ETIME)
+		aie2_destroy_mgmt_chann(ndev);
 
 	if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) {
 		XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
@@ -914,6 +911,20 @@ void aie2_msg_init(struct amdxdna_dev_hdl *ndev)
 		ndev->exec_msg_ops = &legacy_exec_message_ops;
 }
 
+void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev)
+{
+	struct amdxdna_dev *xdna = ndev->xdna;
+
+	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+
+	if (!ndev->mgmt_chann)
+		return;
+
+	xdna_mailbox_stop_channel(ndev->mgmt_chann);
+	xdna_mailbox_destroy_channel(ndev->mgmt_chann);
+	ndev->mgmt_chann = NULL;
+}
+
 static inline struct amdxdna_gem_obj *
 aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
 {
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index 85079b6fc5d9..977ce21eaf9f 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -330,9 +330,7 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna)
 
 	aie2_runtime_cfg(ndev, AIE2_RT_CFG_CLK_GATING, NULL);
 	aie2_mgmt_fw_fini(ndev);
-	xdna_mailbox_stop_channel(ndev->mgmt_chann);
-	xdna_mailbox_destroy_channel(ndev->mgmt_chann);
-	ndev->mgmt_chann = NULL;
+	aie2_destroy_mgmt_chann(ndev);
 	drmm_kfree(&xdna->ddev, ndev->mbox);
 	ndev->mbox = NULL;
 	aie2_psp_stop(ndev->psp_hdl);
@@ -441,8 +439,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna)
 	return 0;
 
 destroy_mgmt_chann:
-	xdna_mailbox_stop_channel(ndev->mgmt_chann);
-	xdna_mailbox_destroy_channel(ndev->mgmt_chann);
+	aie2_destroy_mgmt_chann(ndev);
 stop_psp:
 	aie2_psp_stop(ndev->psp_hdl);
 fini_smu:
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index b20a3661078c..e72311c77996 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -303,6 +303,7 @@ int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev,
 
 /* aie2_message.c */
 void aie2_msg_init(struct amdxdna_dev_hdl *ndev);
+void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev);
 int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
 int aie2_resume_fw(struct amdxdna_dev_hdl *ndev);
 int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, u64 value);
-- 
2.34.1
Re: [PATCH V1] accel/amdxdna: Fix NULL pointer dereference of mgmt_chann
Posted by Mario Limonciello 1 month, 1 week ago
On 2/26/26 3:38 PM, Lizhi Hou wrote:
> mgmt_chann may be set to NULL if the firmware returns an unexpected
> error in aie2_send_mgmt_msg_wait(). This can later lead to a NULL
> pointer dereference in aie2_hw_stop().
> 
> Fix this by introducing a dedicated helper to destroy mgmt_chann
> and by adding proper NULL checks before accessing it.
> 

Is this actually going to fix it?  It sounds like a concurrency problem, no?

Do you want a mutex in the helper perhaps?

> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> ---
>   drivers/accel/amdxdna/aie2_message.c | 21 ++++++++++++++++-----
>   drivers/accel/amdxdna/aie2_pci.c     |  7 ++-----
>   drivers/accel/amdxdna/aie2_pci.h     |  1 +
>   3 files changed, 19 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
> index 277a27bce850..22e1a85a7ae0 100644
> --- a/drivers/accel/amdxdna/aie2_message.c
> +++ b/drivers/accel/amdxdna/aie2_message.c
> @@ -40,11 +40,8 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev,
>   		return -ENODEV;
>   
>   	ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg);
> -	if (ret == -ETIME) {
> -		xdna_mailbox_stop_channel(ndev->mgmt_chann);
> -		xdna_mailbox_destroy_channel(ndev->mgmt_chann);
> -		ndev->mgmt_chann = NULL;
> -	}
> +	if (ret == -ETIME)
> +		aie2_destroy_mgmt_chann(ndev);
>   
>   	if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) {
>   		XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
> @@ -914,6 +911,20 @@ void aie2_msg_init(struct amdxdna_dev_hdl *ndev)
>   		ndev->exec_msg_ops = &legacy_exec_message_ops;
>   }
>   
> +void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev)
> +{
> +	struct amdxdna_dev *xdna = ndev->xdna;
> +
> +	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> +
> +	if (!ndev->mgmt_chann)
> +		return;
> +
> +	xdna_mailbox_stop_channel(ndev->mgmt_chann);
> +	xdna_mailbox_destroy_channel(ndev->mgmt_chann);
> +	ndev->mgmt_chann = NULL;
> +}
> +
>   static inline struct amdxdna_gem_obj *
>   aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
>   {
> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
> index 85079b6fc5d9..977ce21eaf9f 100644
> --- a/drivers/accel/amdxdna/aie2_pci.c
> +++ b/drivers/accel/amdxdna/aie2_pci.c
> @@ -330,9 +330,7 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna)
>   
>   	aie2_runtime_cfg(ndev, AIE2_RT_CFG_CLK_GATING, NULL);
>   	aie2_mgmt_fw_fini(ndev);
> -	xdna_mailbox_stop_channel(ndev->mgmt_chann);
> -	xdna_mailbox_destroy_channel(ndev->mgmt_chann);
> -	ndev->mgmt_chann = NULL;
> +	aie2_destroy_mgmt_chann(ndev);
>   	drmm_kfree(&xdna->ddev, ndev->mbox);
>   	ndev->mbox = NULL;
>   	aie2_psp_stop(ndev->psp_hdl);
> @@ -441,8 +439,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna)
>   	return 0;
>   
>   destroy_mgmt_chann:
> -	xdna_mailbox_stop_channel(ndev->mgmt_chann);
> -	xdna_mailbox_destroy_channel(ndev->mgmt_chann);
> +	aie2_destroy_mgmt_chann(ndev);
>   stop_psp:
>   	aie2_psp_stop(ndev->psp_hdl);
>   fini_smu:
> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> index b20a3661078c..e72311c77996 100644
> --- a/drivers/accel/amdxdna/aie2_pci.h
> +++ b/drivers/accel/amdxdna/aie2_pci.h
> @@ -303,6 +303,7 @@ int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev,
>   
>   /* aie2_message.c */
>   void aie2_msg_init(struct amdxdna_dev_hdl *ndev);
> +void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev);
>   int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
>   int aie2_resume_fw(struct amdxdna_dev_hdl *ndev);
>   int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, u64 value);
Re: [PATCH V1] accel/amdxdna: Fix NULL pointer dereference of mgmt_chann
Posted by Lizhi Hou 1 month, 1 week ago
On 2/27/26 12:11, Mario Limonciello wrote:
> On 2/26/26 3:38 PM, Lizhi Hou wrote:
>> mgmt_chann may be set to NULL if the firmware returns an unexpected
>> error in aie2_send_mgmt_msg_wait(). This can later lead to a NULL
>> pointer dereference in aie2_hw_stop().
>>
>> Fix this by introducing a dedicated helper to destroy mgmt_chann
>> and by adding proper NULL checks before accessing it.
>>
>
> Is this actually going to fix it?  It sounds like a concurrency 
> problem, no?

It is not concurrency issue. The code is protected by dev_lock. That is 
why I added

       drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));

in my fix.

This is actually error code path issue. Normally, xdna_send_msg_wait() 
will not time out. This only happens with broken firmware. That is why 
it was not found before.

Lizhi

>
> Do you want a mutex in the helper perhaps?
>
>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> ---
>>   drivers/accel/amdxdna/aie2_message.c | 21 ++++++++++++++++-----
>>   drivers/accel/amdxdna/aie2_pci.c     |  7 ++-----
>>   drivers/accel/amdxdna/aie2_pci.h     |  1 +
>>   3 files changed, 19 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/aie2_message.c 
>> b/drivers/accel/amdxdna/aie2_message.c
>> index 277a27bce850..22e1a85a7ae0 100644
>> --- a/drivers/accel/amdxdna/aie2_message.c
>> +++ b/drivers/accel/amdxdna/aie2_message.c
>> @@ -40,11 +40,8 @@ static int aie2_send_mgmt_msg_wait(struct 
>> amdxdna_dev_hdl *ndev,
>>           return -ENODEV;
>>         ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg);
>> -    if (ret == -ETIME) {
>> -        xdna_mailbox_stop_channel(ndev->mgmt_chann);
>> -        xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>> -        ndev->mgmt_chann = NULL;
>> -    }
>> +    if (ret == -ETIME)
>> +        aie2_destroy_mgmt_chann(ndev);
>>         if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) {
>>           XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
>> @@ -914,6 +911,20 @@ void aie2_msg_init(struct amdxdna_dev_hdl *ndev)
>>           ndev->exec_msg_ops = &legacy_exec_message_ops;
>>   }
>>   +void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev)
>> +{
>> +    struct amdxdna_dev *xdna = ndev->xdna;
>> +
>> +    drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
>> +
>> +    if (!ndev->mgmt_chann)
>> +        return;
>> +
>> +    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>> +    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>> +    ndev->mgmt_chann = NULL;
>> +}
>> +
>>   static inline struct amdxdna_gem_obj *
>>   aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
>>   {
>> diff --git a/drivers/accel/amdxdna/aie2_pci.c 
>> b/drivers/accel/amdxdna/aie2_pci.c
>> index 85079b6fc5d9..977ce21eaf9f 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.c
>> +++ b/drivers/accel/amdxdna/aie2_pci.c
>> @@ -330,9 +330,7 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna)
>>         aie2_runtime_cfg(ndev, AIE2_RT_CFG_CLK_GATING, NULL);
>>       aie2_mgmt_fw_fini(ndev);
>> -    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>> -    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>> -    ndev->mgmt_chann = NULL;
>> +    aie2_destroy_mgmt_chann(ndev);
>>       drmm_kfree(&xdna->ddev, ndev->mbox);
>>       ndev->mbox = NULL;
>>       aie2_psp_stop(ndev->psp_hdl);
>> @@ -441,8 +439,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna)
>>       return 0;
>>     destroy_mgmt_chann:
>> -    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>> -    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>> +    aie2_destroy_mgmt_chann(ndev);
>>   stop_psp:
>>       aie2_psp_stop(ndev->psp_hdl);
>>   fini_smu:
>> diff --git a/drivers/accel/amdxdna/aie2_pci.h 
>> b/drivers/accel/amdxdna/aie2_pci.h
>> index b20a3661078c..e72311c77996 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.h
>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>> @@ -303,6 +303,7 @@ int aie2_get_array_async_error(struct 
>> amdxdna_dev_hdl *ndev,
>>     /* aie2_message.c */
>>   void aie2_msg_init(struct amdxdna_dev_hdl *ndev);
>> +void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev);
>>   int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
>>   int aie2_resume_fw(struct amdxdna_dev_hdl *ndev);
>>   int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, 
>> u64 value);
>
Re: [PATCH V1] accel/amdxdna: Fix NULL pointer dereference of mgmt_chann
Posted by Mario Limonciello 1 month ago
On 2/27/26 9:16 PM, Lizhi Hou wrote:
> 
> On 2/27/26 12:11, Mario Limonciello wrote:
>> On 2/26/26 3:38 PM, Lizhi Hou wrote:
>>> mgmt_chann may be set to NULL if the firmware returns an unexpected
>>> error in aie2_send_mgmt_msg_wait(). This can later lead to a NULL
>>> pointer dereference in aie2_hw_stop().
>>>
>>> Fix this by introducing a dedicated helper to destroy mgmt_chann
>>> and by adding proper NULL checks before accessing it.
>>>
>>
>> Is this actually going to fix it?  It sounds like a concurrency 
>> problem, no?
> 
> It is not concurrency issue. The code is protected by dev_lock. That is 
> why I added
> 
>        drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> 
> in my fix.
> 
> This is actually error code path issue. Normally, xdna_send_msg_wait() 
> will not time out. This only happens with broken firmware. That is why 
> it was not found before.
> 
> Lizhi

Got it thanks.

Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
> 
>>
>> Do you want a mutex in the helper perhaps?
>>
>>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>> ---
>>>   drivers/accel/amdxdna/aie2_message.c | 21 ++++++++++++++++-----
>>>   drivers/accel/amdxdna/aie2_pci.c     |  7 ++-----
>>>   drivers/accel/amdxdna/aie2_pci.h     |  1 +
>>>   3 files changed, 19 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/ 
>>> amdxdna/aie2_message.c
>>> index 277a27bce850..22e1a85a7ae0 100644
>>> --- a/drivers/accel/amdxdna/aie2_message.c
>>> +++ b/drivers/accel/amdxdna/aie2_message.c
>>> @@ -40,11 +40,8 @@ static int aie2_send_mgmt_msg_wait(struct 
>>> amdxdna_dev_hdl *ndev,
>>>           return -ENODEV;
>>>         ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg);
>>> -    if (ret == -ETIME) {
>>> -        xdna_mailbox_stop_channel(ndev->mgmt_chann);
>>> -        xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>>> -        ndev->mgmt_chann = NULL;
>>> -    }
>>> +    if (ret == -ETIME)
>>> +        aie2_destroy_mgmt_chann(ndev);
>>>         if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) {
>>>           XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
>>> @@ -914,6 +911,20 @@ void aie2_msg_init(struct amdxdna_dev_hdl *ndev)
>>>           ndev->exec_msg_ops = &legacy_exec_message_ops;
>>>   }
>>>   +void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev)
>>> +{
>>> +    struct amdxdna_dev *xdna = ndev->xdna;
>>> +
>>> +    drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
>>> +
>>> +    if (!ndev->mgmt_chann)
>>> +        return;
>>> +
>>> +    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>>> +    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>>> +    ndev->mgmt_chann = NULL;
>>> +}
>>> +
>>>   static inline struct amdxdna_gem_obj *
>>>   aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
>>>   {
>>> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/ 
>>> amdxdna/aie2_pci.c
>>> index 85079b6fc5d9..977ce21eaf9f 100644
>>> --- a/drivers/accel/amdxdna/aie2_pci.c
>>> +++ b/drivers/accel/amdxdna/aie2_pci.c
>>> @@ -330,9 +330,7 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna)
>>>         aie2_runtime_cfg(ndev, AIE2_RT_CFG_CLK_GATING, NULL);
>>>       aie2_mgmt_fw_fini(ndev);
>>> -    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>>> -    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>>> -    ndev->mgmt_chann = NULL;
>>> +    aie2_destroy_mgmt_chann(ndev);
>>>       drmm_kfree(&xdna->ddev, ndev->mbox);
>>>       ndev->mbox = NULL;
>>>       aie2_psp_stop(ndev->psp_hdl);
>>> @@ -441,8 +439,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna)
>>>       return 0;
>>>     destroy_mgmt_chann:
>>> -    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>>> -    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>>> +    aie2_destroy_mgmt_chann(ndev);
>>>   stop_psp:
>>>       aie2_psp_stop(ndev->psp_hdl);
>>>   fini_smu:
>>> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/ 
>>> amdxdna/aie2_pci.h
>>> index b20a3661078c..e72311c77996 100644
>>> --- a/drivers/accel/amdxdna/aie2_pci.h
>>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>>> @@ -303,6 +303,7 @@ int aie2_get_array_async_error(struct 
>>> amdxdna_dev_hdl *ndev,
>>>     /* aie2_message.c */
>>>   void aie2_msg_init(struct amdxdna_dev_hdl *ndev);
>>> +void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev);
>>>   int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
>>>   int aie2_resume_fw(struct amdxdna_dev_hdl *ndev);
>>>   int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, 
>>> u64 value);
>>

Re: [PATCH V1] accel/amdxdna: Fix NULL pointer dereference of mgmt_chann
Posted by Lizhi Hou 1 month ago
Applied to drm-misc-fixes

On 3/2/26 08:57, Mario Limonciello wrote:
> On 2/27/26 9:16 PM, Lizhi Hou wrote:
>>
>> On 2/27/26 12:11, Mario Limonciello wrote:
>>> On 2/26/26 3:38 PM, Lizhi Hou wrote:
>>>> mgmt_chann may be set to NULL if the firmware returns an unexpected
>>>> error in aie2_send_mgmt_msg_wait(). This can later lead to a NULL
>>>> pointer dereference in aie2_hw_stop().
>>>>
>>>> Fix this by introducing a dedicated helper to destroy mgmt_chann
>>>> and by adding proper NULL checks before accessing it.
>>>>
>>>
>>> Is this actually going to fix it?  It sounds like a concurrency 
>>> problem, no?
>>
>> It is not concurrency issue. The code is protected by dev_lock. That 
>> is why I added
>>
>>        drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
>>
>> in my fix.
>>
>> This is actually error code path issue. Normally, 
>> xdna_send_msg_wait() will not time out. This only happens with broken 
>> firmware. That is why it was not found before.
>>
>> Lizhi
>
> Got it thanks.
>
> Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
>>
>>>
>>> Do you want a mutex in the helper perhaps?
>>>
>>>> Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox")
>>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>>> ---
>>>>   drivers/accel/amdxdna/aie2_message.c | 21 ++++++++++++++++-----
>>>>   drivers/accel/amdxdna/aie2_pci.c     |  7 ++-----
>>>>   drivers/accel/amdxdna/aie2_pci.h     |  1 +
>>>>   3 files changed, 19 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/ 
>>>> amdxdna/aie2_message.c
>>>> index 277a27bce850..22e1a85a7ae0 100644
>>>> --- a/drivers/accel/amdxdna/aie2_message.c
>>>> +++ b/drivers/accel/amdxdna/aie2_message.c
>>>> @@ -40,11 +40,8 @@ static int aie2_send_mgmt_msg_wait(struct 
>>>> amdxdna_dev_hdl *ndev,
>>>>           return -ENODEV;
>>>>         ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg);
>>>> -    if (ret == -ETIME) {
>>>> -        xdna_mailbox_stop_channel(ndev->mgmt_chann);
>>>> -        xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>>>> -        ndev->mgmt_chann = NULL;
>>>> -    }
>>>> +    if (ret == -ETIME)
>>>> +        aie2_destroy_mgmt_chann(ndev);
>>>>         if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) {
>>>>           XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
>>>> @@ -914,6 +911,20 @@ void aie2_msg_init(struct amdxdna_dev_hdl *ndev)
>>>>           ndev->exec_msg_ops = &legacy_exec_message_ops;
>>>>   }
>>>>   +void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev)
>>>> +{
>>>> +    struct amdxdna_dev *xdna = ndev->xdna;
>>>> +
>>>> +    drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
>>>> +
>>>> +    if (!ndev->mgmt_chann)
>>>> +        return;
>>>> +
>>>> +    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>>>> +    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>>>> +    ndev->mgmt_chann = NULL;
>>>> +}
>>>> +
>>>>   static inline struct amdxdna_gem_obj *
>>>>   aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job)
>>>>   {
>>>> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/ 
>>>> amdxdna/aie2_pci.c
>>>> index 85079b6fc5d9..977ce21eaf9f 100644
>>>> --- a/drivers/accel/amdxdna/aie2_pci.c
>>>> +++ b/drivers/accel/amdxdna/aie2_pci.c
>>>> @@ -330,9 +330,7 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna)
>>>>         aie2_runtime_cfg(ndev, AIE2_RT_CFG_CLK_GATING, NULL);
>>>>       aie2_mgmt_fw_fini(ndev);
>>>> -    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>>>> -    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>>>> -    ndev->mgmt_chann = NULL;
>>>> +    aie2_destroy_mgmt_chann(ndev);
>>>>       drmm_kfree(&xdna->ddev, ndev->mbox);
>>>>       ndev->mbox = NULL;
>>>>       aie2_psp_stop(ndev->psp_hdl);
>>>> @@ -441,8 +439,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna)
>>>>       return 0;
>>>>     destroy_mgmt_chann:
>>>> -    xdna_mailbox_stop_channel(ndev->mgmt_chann);
>>>> -    xdna_mailbox_destroy_channel(ndev->mgmt_chann);
>>>> +    aie2_destroy_mgmt_chann(ndev);
>>>>   stop_psp:
>>>>       aie2_psp_stop(ndev->psp_hdl);
>>>>   fini_smu:
>>>> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/ 
>>>> amdxdna/aie2_pci.h
>>>> index b20a3661078c..e72311c77996 100644
>>>> --- a/drivers/accel/amdxdna/aie2_pci.h
>>>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>>>> @@ -303,6 +303,7 @@ int aie2_get_array_async_error(struct 
>>>> amdxdna_dev_hdl *ndev,
>>>>     /* aie2_message.c */
>>>>   void aie2_msg_init(struct amdxdna_dev_hdl *ndev);
>>>> +void aie2_destroy_mgmt_chann(struct amdxdna_dev_hdl *ndev);
>>>>   int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
>>>>   int aie2_resume_fw(struct amdxdna_dev_hdl *ndev);
>>>>   int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, 
>>>> u64 value);
>>>
>