Provide a vfio error handling callback, that can be used by devices to
handle PCI errors for passthrough devices.
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
hw/vfio/pci.c | 8 ++++++++
hw/vfio/pci.h | 1 +
2 files changed, 9 insertions(+)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index bc0b4c4d56..b02a974954 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
static void vfio_err_notifier_handler(void *opaque)
{
VFIOPCIDevice *vdev = opaque;
+ Error *err = NULL;
if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
return;
}
+ if (vdev->err_handler) {
+ if (vdev->err_handler(vdev, &err)) {
+ return;
+ }
+ error_report_err(err);
+ }
+
/*
* TBD. Retrieve the error details and decide what action
* needs to be taken. One of the actions could be to pass
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index e0aef82a89..faadce487c 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -146,6 +146,7 @@ struct VFIOPCIDevice {
EventNotifier err_notifier;
EventNotifier req_notifier;
int (*resetfn)(struct VFIOPCIDevice *);
+ bool (*err_handler)(struct VFIOPCIDevice *, Error **);
uint32_t vendor_id;
uint32_t device_id;
uint32_t sub_vendor_id;
--
2.43.0
Farhan Ali <alifm@linux.ibm.com> writes:
> Provide a vfio error handling callback, that can be used by devices to
> handle PCI errors for passthrough devices.
>
> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
> ---
> hw/vfio/pci.c | 8 ++++++++
> hw/vfio/pci.h | 1 +
> 2 files changed, 9 insertions(+)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index bc0b4c4d56..b02a974954 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
> static void vfio_err_notifier_handler(void *opaque)
> {
> VFIOPCIDevice *vdev = opaque;
> + Error *err = NULL;
>
> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
> return;
> }
>
> + if (vdev->err_handler) {
> + if (vdev->err_handler(vdev, &err)) {
> + return;
> + }
> + error_report_err(err);
> + }
This is unusual.
Functions taking an Error ** argument usually do so to report errors.
The rules spelled out in qapi/error.h apply. In particular:
* - On success, the function should not touch *errp. On failure, it
* should set a new error, e.g. with error_setg(errp, ...), or
* propagate an existing one, e.g. with error_propagate(errp, ...).
*
* - Whenever practical, also return a value that indicates success /
* failure. This can make the error checking more concise, and can
* avoid useless error object creation and destruction. Note that
* we still have many functions returning void. We recommend
* • bool-valued functions return true on success / false on failure,
If ->err_handler() behaved that way, it @err would be null after it
returns false. We'd call error_report_err(NULL), and crash.
Functions with unusual behavior need a contract: a comment spelling out
their behavior.
What is the intended behavior of the err_handler() callback?
> +
> /*
> * TBD. Retrieve the error details and decide what action
> * needs to be taken. One of the actions could be to pass
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index e0aef82a89..faadce487c 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -146,6 +146,7 @@ struct VFIOPCIDevice {
> EventNotifier err_notifier;
> EventNotifier req_notifier;
> int (*resetfn)(struct VFIOPCIDevice *);
> + bool (*err_handler)(struct VFIOPCIDevice *, Error **);
> uint32_t vendor_id;
> uint32_t device_id;
> uint32_t sub_vendor_id;
On 9/25/2025 9:57 PM, Markus Armbruster wrote:
> Farhan Ali <alifm@linux.ibm.com> writes:
>
>> Provide a vfio error handling callback, that can be used by devices to
>> handle PCI errors for passthrough devices.
>>
>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>> ---
>> hw/vfio/pci.c | 8 ++++++++
>> hw/vfio/pci.h | 1 +
>> 2 files changed, 9 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index bc0b4c4d56..b02a974954 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>> static void vfio_err_notifier_handler(void *opaque)
>> {
>> VFIOPCIDevice *vdev = opaque;
>> + Error *err = NULL;
>>
>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>> return;
>> }
>>
>> + if (vdev->err_handler) {
>> + if (vdev->err_handler(vdev, &err)) {
>> + return;
>> + }
>> + error_report_err(err);
>> + }
> This is unusual.
>
> Functions taking an Error ** argument usually do so to report errors.
> The rules spelled out in qapi/error.h apply. In particular:
>
> * - On success, the function should not touch *errp. On failure, it
> * should set a new error, e.g. with error_setg(errp, ...), or
> * propagate an existing one, e.g. with error_propagate(errp, ...).
> *
> * - Whenever practical, also return a value that indicates success /
> * failure. This can make the error checking more concise, and can
> * avoid useless error object creation and destruction. Note that
> * we still have many functions returning void. We recommend
> * • bool-valued functions return true on success / false on failure,
>
> If ->err_handler() behaved that way, it @err would be null after it
> returns false. We'd call error_report_err(NULL), and crash.
>
> Functions with unusual behavior need a contract: a comment spelling out
> their behavior.
>
> What is the intended behavior of the err_handler() callback?
Hi Markus,
Thanks for reviewing! The intended behavior for err_handler() is to set
errp and report the error on false/failure. With the above code, I also
intended fall through to vm_stop() when err_handler() fails.
I think I misunderstood the errp error handling, it seems like the
correct way to do what I intended would be
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index b02a974954..630de46c90 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3070,10 +3070,11 @@ static void vfio_err_notifier_handler(void *opaque)
}
if (vdev->err_handler) {
- if (vdev->err_handler(vdev, &err)) {
+ if (!vdev->err_handler(vdev, &err)) {
+ error_report_err(err);
+ } else {
return;
}
- error_report_err(err);
}
Please correct me if I missed anything.
Thanks
Farhan
>
>> +
>> /*
>> * TBD. Retrieve the error details and decide what action
>> * needs to be taken. One of the actions could be to pass
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index e0aef82a89..faadce487c 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -146,6 +146,7 @@ struct VFIOPCIDevice {
>> EventNotifier err_notifier;
>> EventNotifier req_notifier;
>> int (*resetfn)(struct VFIOPCIDevice *);
>> + bool (*err_handler)(struct VFIOPCIDevice *, Error **);
>> uint32_t vendor_id;
>> uint32_t device_id;
>> uint32_t sub_vendor_id;
>
Farhan Ali <alifm@linux.ibm.com> writes:
> On 9/25/2025 9:57 PM, Markus Armbruster wrote:
>> Farhan Ali <alifm@linux.ibm.com> writes:
>>
>>> Provide a vfio error handling callback, that can be used by devices to
>>> handle PCI errors for passthrough devices.
>>>
>>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>>> ---
>>> hw/vfio/pci.c | 8 ++++++++
>>> hw/vfio/pci.h | 1 +
>>> 2 files changed, 9 insertions(+)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index bc0b4c4d56..b02a974954 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>>> static void vfio_err_notifier_handler(void *opaque)
>>> {
>>> VFIOPCIDevice *vdev = opaque;
>>> + Error *err = NULL;
>>>
>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>> return;
>>> }
>>>
>>> + if (vdev->err_handler) {
>>> + if (vdev->err_handler(vdev, &err)) {
>>> + return;
>>> + }
>>> + error_report_err(err);
>>> + }
>>
>> This is unusual.
>>
>> Functions taking an Error ** argument usually do so to report errors.
>> The rules spelled out in qapi/error.h apply. In particular:
>>
>> * - On success, the function should not touch *errp. On failure, it
>> * should set a new error, e.g. with error_setg(errp, ...), or
>> * propagate an existing one, e.g. with error_propagate(errp, ...).
>> *
>> * - Whenever practical, also return a value that indicates success /
>> * failure. This can make the error checking more concise, and can
>> * avoid useless error object creation and destruction. Note that
>> * we still have many functions returning void. We recommend
>> * • bool-valued functions return true on success / false on failure,
>>
>> If ->err_handler() behaved that way, it @err would be null after it
>> returns false. We'd call error_report_err(NULL), and crash.
>>
>> Functions with unusual behavior need a contract: a comment spelling out
>> their behavior.
>>
>> What is the intended behavior of the err_handler() callback?
>
> Hi Markus,
>
> Thanks for reviewing! The intended behavior for err_handler() is to set errp and report the error on false/failure. With the above code, I also intended fall through to vm_stop() when err_handler() fails.
>
> I think I misunderstood the errp error handling, it seems like the correct way to do what I intended would be
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index b02a974954..630de46c90 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3070,10 +3070,11 @@ static void vfio_err_notifier_handler(void *opaque)
> }
>
> if (vdev->err_handler) {
> - if (vdev->err_handler(vdev, &err)) {
> + if (!vdev->err_handler(vdev, &err)) {
> + error_report_err(err);
> + } else {
> return;
> }
> - error_report_err(err);
> }
>
> Please correct me if I missed anything.
Resulting function:
static void vfio_err_notifier_handler(void *opaque)
{
VFIOPCIDevice *vdev = opaque;
Error *err = NULL;
if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
return;
}
if (vdev->err_handler) {
if (!vdev->err_handler(vdev, &err)) {
error_report_err(err);
} else {
return;
}
}
/*
* TBD. Retrieve the error details and decide what action
* needs to be taken. One of the actions could be to pass
* the error to the guest and have the guest driver recover
* from the error. This requires that PCIe capabilities be
* exposed to the guest. For now, we just terminate the
* guest to contain the error.
*/
error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
vm_stop(RUN_STATE_INTERNAL_ERROR);
}
Slighly rearranged for clearer control flow:
static void vfio_err_notifier_handler(void *opaque)
{
VFIOPCIDevice *vdev = opaque;
Error *err = NULL;
if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
return;
}
if (vdev->err_handler) {
if (vdev->err_handler(vdev, &err)) {
/* Error successfully handled */
return;
}
error_report_err(err);
}
/*
* TBD. Retrieve the error details and decide what action
* needs to be taken. One of the actions could be to pass
* the error to the guest and have the guest driver recover
* from the error. This requires that PCIe capabilities be
* exposed to the guest. For now, we just terminate the
* guest to contain the error.
*/
error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
vm_stop(RUN_STATE_INTERNAL_ERROR);
}
Questions / issues:
* Is the comment still accurate?
* When ->err_handler() fails, we report the error twice. Would it make
sense to combine the two error reports into one?
* Preexisting: the second error message is ugly.
Error messages should be short and to the point: single phrase, with
no newline or trailing punctuation. The "please collect ..." part
does not belong to the error message proper, it's advice on what to
do. Better: report the error, then print advice:
error_report("%s(%s) Unrecoverable error detected",
__func__, vdev->vbasedev.name);
error_printf("Please collect any data possible and then kill the guest.");
Including __func__ in an error message is an anti-pattern. Look at
vfio_err_notifier_handler(fred) Unrecoverable error detected
with a user's eyes: "vfio_err_notifier_handler" is programmer
gobbledygook, the device name "fred" is useful once you realize what
it is, "Unrecoverable error detected" lacks detail.
[...]
On 9/27/25 07:59, Markus Armbruster wrote:
> Farhan Ali <alifm@linux.ibm.com> writes:
>
>> On 9/25/2025 9:57 PM, Markus Armbruster wrote:
>>> Farhan Ali <alifm@linux.ibm.com> writes:
>>>
>>>> Provide a vfio error handling callback, that can be used by devices to
>>>> handle PCI errors for passthrough devices.
>>>>
>>>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>>>> ---
>>>> hw/vfio/pci.c | 8 ++++++++
>>>> hw/vfio/pci.h | 1 +
>>>> 2 files changed, 9 insertions(+)
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index bc0b4c4d56..b02a974954 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>>>> static void vfio_err_notifier_handler(void *opaque)
>>>> {
>>>> VFIOPCIDevice *vdev = opaque;
>>>> + Error *err = NULL;
>>>>
>>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>>> return;
>>>> }
>>>>
>>>> + if (vdev->err_handler) {
>>>> + if (vdev->err_handler(vdev, &err)) {
>>>> + return;
>>>> + }
>>>> + error_report_err(err);
>>>> + }
>>>
>>> This is unusual.
>>>
>>> Functions taking an Error ** argument usually do so to report errors.
>>> The rules spelled out in qapi/error.h apply. In particular:
>>>
>>> * - On success, the function should not touch *errp. On failure, it
>>> * should set a new error, e.g. with error_setg(errp, ...), or
>>> * propagate an existing one, e.g. with error_propagate(errp, ...).
>>> *
>>> * - Whenever practical, also return a value that indicates success /
>>> * failure. This can make the error checking more concise, and can
>>> * avoid useless error object creation and destruction. Note that
>>> * we still have many functions returning void. We recommend
>>> * • bool-valued functions return true on success / false on failure,
>>>
>>> If ->err_handler() behaved that way, it @err would be null after it
>>> returns false. We'd call error_report_err(NULL), and crash.
>>>
>>> Functions with unusual behavior need a contract: a comment spelling out
>>> their behavior.
>>>
>>> What is the intended behavior of the err_handler() callback?
>>
>> Hi Markus,
>>
>> Thanks for reviewing! The intended behavior for err_handler() is to set errp and report the error on false/failure. With the above code, I also intended fall through to vm_stop() when err_handler() fails.
>>
>> I think I misunderstood the errp error handling, it seems like the correct way to do what I intended would be
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index b02a974954..630de46c90 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -3070,10 +3070,11 @@ static void vfio_err_notifier_handler(void *opaque)
>> }
>>
>> if (vdev->err_handler) {
>> - if (vdev->err_handler(vdev, &err)) {
>> + if (!vdev->err_handler(vdev, &err)) {
>> + error_report_err(err);
>> + } else {
>> return;
>> }
>> - error_report_err(err);
>> }
>>
>> Please correct me if I missed anything.
>
> Resulting function:
>
> static void vfio_err_notifier_handler(void *opaque)
> {
> VFIOPCIDevice *vdev = opaque;
> Error *err = NULL;
>
> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
> return;
> }
>
> if (vdev->err_handler) {
> if (!vdev->err_handler(vdev, &err)) {
> error_report_err(err);
> } else {
> return;
> }
> }
>
> /*
> * TBD. Retrieve the error details and decide what action
> * needs to be taken. One of the actions could be to pass
> * the error to the guest and have the guest driver recover
> * from the error. This requires that PCIe capabilities be
> * exposed to the guest. For now, we just terminate the
> * guest to contain the error.
> */
>
> error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
>
> vm_stop(RUN_STATE_INTERNAL_ERROR);
> }
>
> Slighly rearranged for clearer control flow:
>
> static void vfio_err_notifier_handler(void *opaque)
> {
> VFIOPCIDevice *vdev = opaque;
> Error *err = NULL;
>
> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
> return;
> }
>
> if (vdev->err_handler) {
> if (vdev->err_handler(vdev, &err)) {
> /* Error successfully handled */
> return;
> }
> error_report_err(err);
> }
>
> /*
> * TBD. Retrieve the error details and decide what action
> * needs to be taken. One of the actions could be to pass
> * the error to the guest and have the guest driver recover
> * from the error. This requires that PCIe capabilities be
> * exposed to the guest. For now, we just terminate the
> * guest to contain the error.
> */
>
> error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
>
> vm_stop(RUN_STATE_INTERNAL_ERROR);
> }
>
> Questions / issues:
>
> * Is the comment still accurate?
>
> * When ->err_handler() fails, we report the error twice. Would it make
> sense to combine the two error reports into one?
Yes. It was my request too.
Thanks,
C.
> * Preexisting: the second error message is ugly.
>
> Error messages should be short and to the point: single phrase, with
> no newline or trailing punctuation. The "please collect ..." part
> does not belong to the error message proper, it's advice on what to
> do. Better: report the error, then print advice:
>
> error_report("%s(%s) Unrecoverable error detected",
> __func__, vdev->vbasedev.name);
> error_printf("Please collect any data possible and then kill the guest.");
>
> Including __func__ in an error message is an anti-pattern. Look at
>
> vfio_err_notifier_handler(fred) Unrecoverable error detected
>
> with a user's eyes: "vfio_err_notifier_handler" is programmer
> gobbledygook, the device name "fred" is useful once you realize what
> it is, "Unrecoverable error detected" lacks detail.
>
> [...]
>
On 9/27/2025 12:05 AM, Cédric Le Goater wrote:
> On 9/27/25 07:59, Markus Armbruster wrote:
>> Farhan Ali <alifm@linux.ibm.com> writes:
>>
>>> On 9/25/2025 9:57 PM, Markus Armbruster wrote:
>>>> Farhan Ali <alifm@linux.ibm.com> writes:
>>>>
>>>>> Provide a vfio error handling callback, that can be used by
>>>>> devices to
>>>>> handle PCI errors for passthrough devices.
>>>>>
>>>>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>>>>> ---
>>>>> hw/vfio/pci.c | 8 ++++++++
>>>>> hw/vfio/pci.h | 1 +
>>>>> 2 files changed, 9 insertions(+)
>>>>>
>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>> index bc0b4c4d56..b02a974954 100644
>>>>> --- a/hw/vfio/pci.c
>>>>> +++ b/hw/vfio/pci.c
>>>>> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>>>>> static void vfio_err_notifier_handler(void *opaque)
>>>>> {
>>>>> VFIOPCIDevice *vdev = opaque;
>>>>> + Error *err = NULL;
>>>>>
>>>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>>>> return;
>>>>> }
>>>>>
>>>>> + if (vdev->err_handler) {
>>>>> + if (vdev->err_handler(vdev, &err)) {
>>>>> + return;
>>>>> + }
>>>>> + error_report_err(err);
>>>>> + }
>>>>
>>>> This is unusual.
>>>>
>>>> Functions taking an Error ** argument usually do so to report errors.
>>>> The rules spelled out in qapi/error.h apply. In particular:
>>>>
>>>> * - On success, the function should not touch *errp. On
>>>> failure, it
>>>> * should set a new error, e.g. with error_setg(errp, ...), or
>>>> * propagate an existing one, e.g. with error_propagate(errp,
>>>> ...).
>>>> *
>>>> * - Whenever practical, also return a value that indicates
>>>> success /
>>>> * failure. This can make the error checking more concise, and
>>>> can
>>>> * avoid useless error object creation and destruction. Note that
>>>> * we still have many functions returning void. We recommend
>>>> * • bool-valued functions return true on success / false on
>>>> failure,
>>>>
>>>> If ->err_handler() behaved that way, it @err would be null after it
>>>> returns false. We'd call error_report_err(NULL), and crash.
>>>>
>>>> Functions with unusual behavior need a contract: a comment spelling
>>>> out
>>>> their behavior.
>>>>
>>>> What is the intended behavior of the err_handler() callback?
>>>
>>> Hi Markus,
>>>
>>> Thanks for reviewing! The intended behavior for err_handler() is to
>>> set errp and report the error on false/failure. With the above code,
>>> I also intended fall through to vm_stop() when err_handler() fails.
>>>
>>> I think I misunderstood the errp error handling, it seems like the
>>> correct way to do what I intended would be
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index b02a974954..630de46c90 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -3070,10 +3070,11 @@ static void vfio_err_notifier_handler(void
>>> *opaque)
>>> }
>>>
>>> if (vdev->err_handler) {
>>> - if (vdev->err_handler(vdev, &err)) {
>>> + if (!vdev->err_handler(vdev, &err)) {
>>> + error_report_err(err);
>>> + } else {
>>> return;
>>> }
>>> - error_report_err(err);
>>> }
>>>
>>> Please correct me if I missed anything.
>>
>> Resulting function:
>>
>> static void vfio_err_notifier_handler(void *opaque)
>> {
>> VFIOPCIDevice *vdev = opaque;
>> Error *err = NULL;
>>
>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>> return;
>> }
>>
>> if (vdev->err_handler) {
>> if (!vdev->err_handler(vdev, &err)) {
>> error_report_err(err);
>> } else {
>> return;
>> }
>> }
>>
>> /*
>> * TBD. Retrieve the error details and decide what action
>> * needs to be taken. One of the actions could be to pass
>> * the error to the guest and have the guest driver recover
>> * from the error. This requires that PCIe capabilities be
>> * exposed to the guest. For now, we just terminate the
>> * guest to contain the error.
>> */
>>
>> error_report("%s(%s) Unrecoverable error detected. Please
>> collect any data possible and then kill the guest", __func__,
>> vdev->vbasedev.name);
>>
>> vm_stop(RUN_STATE_INTERNAL_ERROR);
>> }
>>
>> Slighly rearranged for clearer control flow:
>>
>> static void vfio_err_notifier_handler(void *opaque)
>> {
>> VFIOPCIDevice *vdev = opaque;
>> Error *err = NULL;
>>
>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>> return;
>> }
>>
>> if (vdev->err_handler) {
>> if (vdev->err_handler(vdev, &err)) {
>> /* Error successfully handled */
>> return;
>> }
>> error_report_err(err);
>> }
Yes, this is what i intended to do with my patch and provide a clearer
flow. Though the compiler error reported by Cedric, is a little
confusing, need to understand why that happens.
>>
>> /*
>> * TBD. Retrieve the error details and decide what action
>> * needs to be taken. One of the actions could be to pass
>> * the error to the guest and have the guest driver recover
>> * from the error. This requires that PCIe capabilities be
>> * exposed to the guest. For now, we just terminate the
>> * guest to contain the error.
>> */
>>
>> error_report("%s(%s) Unrecoverable error detected. Please
>> collect any data possible and then kill the guest", __func__,
>> vdev->vbasedev.name);
>>
>> vm_stop(RUN_STATE_INTERNAL_ERROR);
>> }
>>
>> Questions / issues:
>>
>> * Is the comment still accurate?
This comment would still apply for vfio-pci devices on other
architectures except for s390x. We are trying to change this behavior
for s390x.
>>
>> * When ->err_handler() fails, we report the error twice. Would it make
>> sense to combine the two error reports into one?
>
> Yes. It was my request too.
>
> Thanks,
>
> C.
I was a little hesitant about changing the existing error message as its
been there for almost 12 years (since commit 7b4b0e9eda ("vfio:
QEMU-AER: Qemu changes to support AER for VFIO-PCI devices")). Nothing
should ever dependent on specific error messages, but still.. .If the
preference is to combine/change the message I can do that.
>
>
>
>> * Preexisting: the second error message is ugly.
>>
>> Error messages should be short and to the point: single phrase, with
>> no newline or trailing punctuation. The "please collect ..." part
>> does not belong to the error message proper, it's advice on what to
>> do. Better: report the error, then print advice:
>>
>> error_report("%s(%s) Unrecoverable error detected",
>> __func__, vdev->vbasedev.name);
>> error_printf("Please collect any data possible and then kill
>> the guest.");
>>
>> Including __func__ in an error message is an anti-pattern. Look at
>>
>> vfio_err_notifier_handler(fred) Unrecoverable error detected
>>
>> with a user's eyes: "vfio_err_notifier_handler" is programmer
>> gobbledygook, the device name "fred" is useful once you realize what
>> it is, "Unrecoverable error detected" lacks detail.
>>
>> [...]
>>
>
How about "(device) Unrecoverable PCIe error detected for device"
Thanks
Farhan
Farhan Ali <alifm@linux.ibm.com> writes:
> On 9/27/2025 12:05 AM, Cédric Le Goater wrote:
>> On 9/27/25 07:59, Markus Armbruster wrote:
>>> Farhan Ali <alifm@linux.ibm.com> writes:
>>>
>>>> On 9/25/2025 9:57 PM, Markus Armbruster wrote:
>>>>> Farhan Ali <alifm@linux.ibm.com> writes:
>>>>>
>>>>>> Provide a vfio error handling callback, that can be used by devices to
>>>>>> handle PCI errors for passthrough devices.
>>>>>>
>>>>>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>>>>>> ---
>>>>>> hw/vfio/pci.c | 8 ++++++++
>>>>>> hw/vfio/pci.h | 1 +
>>>>>> 2 files changed, 9 insertions(+)
>>>>>>
>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>> index bc0b4c4d56..b02a974954 100644
>>>>>> --- a/hw/vfio/pci.c
>>>>>> +++ b/hw/vfio/pci.c
>>>>>> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>>>>>> static void vfio_err_notifier_handler(void *opaque)
>>>>>> {
>>>>>> VFIOPCIDevice *vdev = opaque;
>>>>>> + Error *err = NULL;
>>>>>>
>>>>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>>>>> return;
>>>>>> }
>>>>>>
>>>>>> + if (vdev->err_handler) {
>>>>>> + if (vdev->err_handler(vdev, &err)) {
>>>>>> + return;
>>>>>> + }
>>>>>> + error_report_err(err);
>>>>>> + }
>>>>>
>>>>> This is unusual.
>>>>>
>>>>> Functions taking an Error ** argument usually do so to report errors.
>>>>> The rules spelled out in qapi/error.h apply. In particular:
>>>>>
>>>>> * - On success, the function should not touch *errp. On failure, it
>>>>> * should set a new error, e.g. with error_setg(errp, ...), or
>>>>> * propagate an existing one, e.g. with error_propagate(errp, ...).
>>>>> *
>>>>> * - Whenever practical, also return a value that indicates success /
>>>>> * failure. This can make the error checking more concise, and can
>>>>> * avoid useless error object creation and destruction. Note that
>>>>> * we still have many functions returning void. We recommend
>>>>> * • bool-valued functions return true on success / false on failure,
>>>>>
>>>>> If ->err_handler() behaved that way, it @err would be null after it
>>>>> returns false. We'd call error_report_err(NULL), and crash.
>>>>>
>>>>> Functions with unusual behavior need a contract: a comment spelling out
>>>>> their behavior.
>>>>>
>>>>> What is the intended behavior of the err_handler() callback?
>>>>
>>>> Hi Markus,
>>>>
>>>> Thanks for reviewing! The intended behavior for err_handler() is to set errp and report the error on false/failure. With the above code, I also intended fall through to vm_stop() when err_handler() fails.
>>>>
>>>> I think I misunderstood the errp error handling, it seems like the correct way to do what I intended would be
>>>>
>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>> index b02a974954..630de46c90 100644
>>>> --- a/hw/vfio/pci.c
>>>> +++ b/hw/vfio/pci.c
>>>> @@ -3070,10 +3070,11 @@ static void vfio_err_notifier_handler(void *opaque)
>>>> }
>>>>
>>>> if (vdev->err_handler) {
>>>> - if (vdev->err_handler(vdev, &err)) {
>>>> + if (!vdev->err_handler(vdev, &err)) {
>>>> + error_report_err(err);
>>>> + } else {
>>>> return;
>>>> }
>>>> - error_report_err(err);
>>>> }
>>>>
>>>> Please correct me if I missed anything.
>>>
>>> Resulting function:
>>>
>>> static void vfio_err_notifier_handler(void *opaque)
>>> {
>>> VFIOPCIDevice *vdev = opaque;
>>> Error *err = NULL;
>>>
>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>> return;
>>> }
>>>
>>> if (vdev->err_handler) {
>>> if (!vdev->err_handler(vdev, &err)) {
>>> error_report_err(err);
>>> } else {
>>> return;
>>> }
>>> }
>>>
>>> /*
>>> * TBD. Retrieve the error details and decide what action
>>> * needs to be taken. One of the actions could be to pass
>>> * the error to the guest and have the guest driver recover
>>> * from the error. This requires that PCIe capabilities be
>>> * exposed to the guest. For now, we just terminate the
>>> * guest to contain the error.
>>> */
>>>
>>> error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
>>>
>>> vm_stop(RUN_STATE_INTERNAL_ERROR);
>>> }
>>>
>>> Slighly rearranged for clearer control flow:
>>>
>>> static void vfio_err_notifier_handler(void *opaque)
>>> {
>>> VFIOPCIDevice *vdev = opaque;
>>> Error *err = NULL;
>>>
>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>> return;
>>> }
>>>
>>> if (vdev->err_handler) {
>>> if (vdev->err_handler(vdev, &err)) {
>>> /* Error successfully handled */
>>> return;
>>> }
>>> error_report_err(err);
>>> }
>
> Yes, this is what i intended to do with my patch and provide a clearer flow. Though the compiler error reported by Cedric, is a little confusing, need to understand why that happens.
>
>
>>>
>>> /*
>>> * TBD. Retrieve the error details and decide what action
>>> * needs to be taken. One of the actions could be to pass
>>> * the error to the guest and have the guest driver recover
>>> * from the error. This requires that PCIe capabilities be
>>> * exposed to the guest. For now, we just terminate the
>>> * guest to contain the error.
>>> */
>>>
>>> error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
>>>
>>> vm_stop(RUN_STATE_INTERNAL_ERROR);
>>> }
>>>
>>> Questions / issues:
>>>
>>> * Is the comment still accurate?
>
> This comment would still apply for vfio-pci devices on other architectures except for s390x. We are trying to change this behavior for s390x.
The comment is about things that should be done to handle the error.
Would these things be done here, or in a suitable ->err_handler()?
>>>
>>> * When ->err_handler() fails, we report the error twice. Would it make
>>> sense to combine the two error reports into one?
>>
>> Yes. It was my request too.
>>
>> Thanks,
>>
>> C.
>
> I was a little hesitant about changing the existing error message as its been there for almost 12 years (since commit 7b4b0e9eda ("vfio: QEMU-AER: Qemu changes to support AER for VFIO-PCI devices")). Nothing should ever dependent on specific error messages, but still.. .If the preference is to combine/change the message I can do that.
Don't hesitate to improve error messages.
>>> * Preexisting: the second error message is ugly.
>>>
>>> Error messages should be short and to the point: single phrase, with
>>> no newline or trailing punctuation. The "please collect ..." part
>>> does not belong to the error message proper, it's advice on what to
>>> do. Better: report the error, then print advice:
>>>
>>> error_report("%s(%s) Unrecoverable error detected",
>>> __func__, vdev->vbasedev.name);
>>> error_printf("Please collect any data possible and then kill the guest.");
>>>
>>> Including __func__ in an error message is an anti-pattern. Look at
>>>
>>> vfio_err_notifier_handler(fred) Unrecoverable error detected
>>>
>>> with a user's eyes: "vfio_err_notifier_handler" is programmer
>>> gobbledygook, the device name "fred" is useful once you realize what
>>> it is, "Unrecoverable error detected" lacks detail.
>>>
>>> [...]
>>>
>>
> How about "(device) Unrecoverable PCIe error detected for device"
Suggest "for device %s", where %s identifies the device to the user.
On 9/30/2025 2:20 AM, Markus Armbruster wrote:
> Farhan Ali <alifm@linux.ibm.com> writes:
>
>> On 9/27/2025 12:05 AM, Cédric Le Goater wrote:
>>> On 9/27/25 07:59, Markus Armbruster wrote:
>>>> Farhan Ali <alifm@linux.ibm.com> writes:
>>>>
>>>>> On 9/25/2025 9:57 PM, Markus Armbruster wrote:
>>>>>> Farhan Ali <alifm@linux.ibm.com> writes:
>>>>>>
>>>>>>> Provide a vfio error handling callback, that can be used by devices to
>>>>>>> handle PCI errors for passthrough devices.
>>>>>>>
>>>>>>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>>>>>>> ---
>>>>>>> hw/vfio/pci.c | 8 ++++++++
>>>>>>> hw/vfio/pci.h | 1 +
>>>>>>> 2 files changed, 9 insertions(+)
>>>>>>>
>>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>>> index bc0b4c4d56..b02a974954 100644
>>>>>>> --- a/hw/vfio/pci.c
>>>>>>> +++ b/hw/vfio/pci.c
>>>>>>> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>>>>>>> static void vfio_err_notifier_handler(void *opaque)
>>>>>>> {
>>>>>>> VFIOPCIDevice *vdev = opaque;
>>>>>>> + Error *err = NULL;
>>>>>>>
>>>>>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>>>>>> return;
>>>>>>> }
>>>>>>>
>>>>>>> + if (vdev->err_handler) {
>>>>>>> + if (vdev->err_handler(vdev, &err)) {
>>>>>>> + return;
>>>>>>> + }
>>>>>>> + error_report_err(err);
>>>>>>> + }
>>>>>> This is unusual.
>>>>>>
>>>>>> Functions taking an Error ** argument usually do so to report errors.
>>>>>> The rules spelled out in qapi/error.h apply. In particular:
>>>>>>
>>>>>> * - On success, the function should not touch *errp. On failure, it
>>>>>> * should set a new error, e.g. with error_setg(errp, ...), or
>>>>>> * propagate an existing one, e.g. with error_propagate(errp, ...).
>>>>>> *
>>>>>> * - Whenever practical, also return a value that indicates success /
>>>>>> * failure. This can make the error checking more concise, and can
>>>>>> * avoid useless error object creation and destruction. Note that
>>>>>> * we still have many functions returning void. We recommend
>>>>>> * • bool-valued functions return true on success / false on failure,
>>>>>>
>>>>>> If ->err_handler() behaved that way, it @err would be null after it
>>>>>> returns false. We'd call error_report_err(NULL), and crash.
>>>>>>
>>>>>> Functions with unusual behavior need a contract: a comment spelling out
>>>>>> their behavior.
>>>>>>
>>>>>> What is the intended behavior of the err_handler() callback?
>>>>> Hi Markus,
>>>>>
>>>>> Thanks for reviewing! The intended behavior for err_handler() is to set errp and report the error on false/failure. With the above code, I also intended fall through to vm_stop() when err_handler() fails.
>>>>>
>>>>> I think I misunderstood the errp error handling, it seems like the correct way to do what I intended would be
>>>>>
>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>> index b02a974954..630de46c90 100644
>>>>> --- a/hw/vfio/pci.c
>>>>> +++ b/hw/vfio/pci.c
>>>>> @@ -3070,10 +3070,11 @@ static void vfio_err_notifier_handler(void *opaque)
>>>>> }
>>>>>
>>>>> if (vdev->err_handler) {
>>>>> - if (vdev->err_handler(vdev, &err)) {
>>>>> + if (!vdev->err_handler(vdev, &err)) {
>>>>> + error_report_err(err);
>>>>> + } else {
>>>>> return;
>>>>> }
>>>>> - error_report_err(err);
>>>>> }
>>>>>
>>>>> Please correct me if I missed anything.
>>>> Resulting function:
>>>>
>>>> static void vfio_err_notifier_handler(void *opaque)
>>>> {
>>>> VFIOPCIDevice *vdev = opaque;
>>>> Error *err = NULL;
>>>>
>>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>>> return;
>>>> }
>>>>
>>>> if (vdev->err_handler) {
>>>> if (!vdev->err_handler(vdev, &err)) {
>>>> error_report_err(err);
>>>> } else {
>>>> return;
>>>> }
>>>> }
>>>>
>>>> /*
>>>> * TBD. Retrieve the error details and decide what action
>>>> * needs to be taken. One of the actions could be to pass
>>>> * the error to the guest and have the guest driver recover
>>>> * from the error. This requires that PCIe capabilities be
>>>> * exposed to the guest. For now, we just terminate the
>>>> * guest to contain the error.
>>>> */
>>>>
>>>> error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
>>>>
>>>> vm_stop(RUN_STATE_INTERNAL_ERROR);
>>>> }
>>>>
>>>> Slighly rearranged for clearer control flow:
>>>>
>>>> static void vfio_err_notifier_handler(void *opaque)
>>>> {
>>>> VFIOPCIDevice *vdev = opaque;
>>>> Error *err = NULL;
>>>>
>>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>>> return;
>>>> }
>>>>
>>>> if (vdev->err_handler) {
>>>> if (vdev->err_handler(vdev, &err)) {
>>>> /* Error successfully handled */
>>>> return;
>>>> }
>>>> error_report_err(err);
>>>> }
>> Yes, this is what i intended to do with my patch and provide a clearer flow. Though the compiler error reported by Cedric, is a little confusing, need to understand why that happens.
>>
>>
>>>> /*
>>>> * TBD. Retrieve the error details and decide what action
>>>> * needs to be taken. One of the actions could be to pass
>>>> * the error to the guest and have the guest driver recover
>>>> * from the error. This requires that PCIe capabilities be
>>>> * exposed to the guest. For now, we just terminate the
>>>> * guest to contain the error.
>>>> */
>>>>
>>>> error_report("%s(%s) Unrecoverable error detected. Please collect any data possible and then kill the guest", __func__, vdev->vbasedev.name);
>>>>
>>>> vm_stop(RUN_STATE_INTERNAL_ERROR);
>>>> }
>>>>
>>>> Questions / issues:
>>>>
>>>> * Is the comment still accurate?
>> This comment would still apply for vfio-pci devices on other architectures except for s390x. We are trying to change this behavior for s390x.
> The comment is about things that should be done to handle the error.
> Would these things be done here, or in a suitable ->err_handler()?
Ideally in the err_handler(). And for s390x we try do what the comment
mentions, which is inject the error into the guest through s390x
architecture specific mechanism. I can remove the comment block.
>
>>>> * When ->err_handler() fails, we report the error twice. Would it make
>>>> sense to combine the two error reports into one?
>>> Yes. It was my request too.
>>>
>>> Thanks,
>>>
>>> C.
>> I was a little hesitant about changing the existing error message as its been there for almost 12 years (since commit 7b4b0e9eda ("vfio: QEMU-AER: Qemu changes to support AER for VFIO-PCI devices")). Nothing should ever dependent on specific error messages, but still.. .If the preference is to combine/change the message I can do that.
> Don't hesitate to improve error messages.
>
>>>> * Preexisting: the second error message is ugly.
>>>>
>>>> Error messages should be short and to the point: single phrase, with
>>>> no newline or trailing punctuation. The "please collect ..." part
>>>> does not belong to the error message proper, it's advice on what to
>>>> do. Better: report the error, then print advice:
>>>>
>>>> error_report("%s(%s) Unrecoverable error detected",
>>>> __func__, vdev->vbasedev.name);
>>>> error_printf("Please collect any data possible and then kill the guest.");
>>>>
>>>> Including __func__ in an error message is an anti-pattern. Look at
>>>>
>>>> vfio_err_notifier_handler(fred) Unrecoverable error detected
>>>>
>>>> with a user's eyes: "vfio_err_notifier_handler" is programmer
>>>> gobbledygook, the device name "fred" is useful once you realize what
>>>> it is, "Unrecoverable error detected" lacks detail.
>>>>
>>>> [...]
>>>>
>> How about "(device) Unrecoverable PCIe error detected for device"
> Suggest "for device %s", where %s identifies the device to the user.
Sure, I can make the change.
Thanks
Farhan
>
Farhan Ali <alifm@linux.ibm.com> writes: > On 9/30/2025 2:20 AM, Markus Armbruster wrote: >> Farhan Ali <alifm@linux.ibm.com> writes: >> >>> On 9/27/2025 12:05 AM, Cédric Le Goater wrote: >>>> On 9/27/25 07:59, Markus Armbruster wrote: [...] >>>>> * Is the comment still accurate? >>> >>> This comment would still apply for vfio-pci devices on other architectures except for s390x. We are trying to change this behavior for s390x. >> >> The comment is about things that should be done to handle the error. >> Would these things be done here, or in a suitable ->err_handler()? > > Ideally in the err_handler(). And for s390x we try do what the comment mentions, which is inject the error into the guest through s390x architecture specific mechanism. I can remove the comment block. Well, if there's stuff left to do, a comment outlining it is desirable. If I understand you correctly, then the one we have is no longer accurate. Could you update it, so it is? [...]
On 9/30/2025 9:52 PM, Markus Armbruster wrote: > Farhan Ali <alifm@linux.ibm.com> writes: > >> On 9/30/2025 2:20 AM, Markus Armbruster wrote: >>> Farhan Ali <alifm@linux.ibm.com> writes: >>> >>>> On 9/27/2025 12:05 AM, Cédric Le Goater wrote: >>>>> On 9/27/25 07:59, Markus Armbruster wrote: > [...] > >>>>>> * Is the comment still accurate? >>>> This comment would still apply for vfio-pci devices on other architectures except for s390x. We are trying to change this behavior for s390x. >>> The comment is about things that should be done to handle the error. >>> Would these things be done here, or in a suitable ->err_handler()? >> Ideally in the err_handler(). And for s390x we try do what the comment mentions, which is inject the error into the guest through s390x architecture specific mechanism. I can remove the comment block. > Well, if there's stuff left to do, a comment outlining it is desirable. > If I understand you correctly, then the one we have is no longer > accurate. Could you update it, so it is? > > [...] > How about something like this? We can retrieve the error details and decide what action needs to be taken in err_handler(). One of the actions could be to pass the error to the guest and have the guest driver recover from the error. This requires that PCIe capabilities be exposed to the guest. If err_handler() is not implemented/fails, we just terminate the guest to contain the error. Thanks Farhan
Farhan Ali <alifm@linux.ibm.com> writes: > On 9/30/2025 9:52 PM, Markus Armbruster wrote: >> Farhan Ali <alifm@linux.ibm.com> writes: >> >>> On 9/30/2025 2:20 AM, Markus Armbruster wrote: >>>> Farhan Ali <alifm@linux.ibm.com> writes: >>>> >>>>> On 9/27/2025 12:05 AM, Cédric Le Goater wrote: >>>>>> On 9/27/25 07:59, Markus Armbruster wrote: >> >> [...] >> >>>>>>> * Is the comment still accurate? >>>>> >>>>> This comment would still apply for vfio-pci devices on other architectures except for s390x. We are trying to change this behavior for s390x. >>> >>>> The comment is about things that should be done to handle the error. >>>> Would these things be done here, or in a suitable ->err_handler()? >>> >>> Ideally in the err_handler(). And for s390x we try do what the comment mentions, which is inject the error into the guest through s390x architecture specific mechanism. I can remove the comment block. >> >> Well, if there's stuff left to do, a comment outlining it is desirable. >> If I understand you correctly, then the one we have is no longer >> accurate. Could you update it, so it is? >> >> [...] >> > How about something like this? > > We can retrieve the error details and decide what action needs to be taken in err_handler(). One of the actions could be to pass the error to the guest and have the guest driver recover from the error. This requires that PCIe capabilities be exposed to the guest. If err_handler() is not implemented/fails, we just terminate the guest to contain the error. Looks good to me. I lack the expertise to really judge.
On 9/26/25 06:57, Markus Armbruster wrote:
> Farhan Ali <alifm@linux.ibm.com> writes:
>
>> Provide a vfio error handling callback, that can be used by devices to
>> handle PCI errors for passthrough devices.
>>
>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>> ---
>> hw/vfio/pci.c | 8 ++++++++
>> hw/vfio/pci.h | 1 +
>> 2 files changed, 9 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index bc0b4c4d56..b02a974954 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>> static void vfio_err_notifier_handler(void *opaque)
>> {
>> VFIOPCIDevice *vdev = opaque;
>> + Error *err = NULL;
>>
>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>> return;
>> }
>>
>> + if (vdev->err_handler) {
>> + if (vdev->err_handler(vdev, &err)) {
>> + return;
>> + }
>> + error_report_err(err);
>> + }
>
> This is unusual.
and the compiler complains :
../hw/vfio/pci.c: In function ‘vfio_err_notifier_handler’:
../hw/vfio/pci.c:3076:9: error: dangling pointer to ‘err’ may be used [-Werror=dangling-pointer=]
3076 | error_report_err(err);
| ^~~~~~~~~~~~~~~~~~~~~
../hw/vfio/pci.c:3066:12: note: ‘err’ declared here
3066 | Error *err = NULL;
| ^~~
cc1: all warnings being treated as errors
C.
>
> Functions taking an Error ** argument usually do so to report errors.
> The rules spelled out in qapi/error.h apply. In particular:
>
> * - On success, the function should not touch *errp. On failure, it
> * should set a new error, e.g. with error_setg(errp, ...), or
> * propagate an existing one, e.g. with error_propagate(errp, ...).
> *
> * - Whenever practical, also return a value that indicates success /
> * failure. This can make the error checking more concise, and can
> * avoid useless error object creation and destruction. Note that
> * we still have many functions returning void. We recommend
> * • bool-valued functions return true on success / false on failure,
>
> If ->err_handler() behaved that way, it @err would be null after it
> returns false. We'd call error_report_err(NULL), and crash.
>
> Functions with unusual behavior need a contract: a comment spelling out
> their behavior.
>
> What is the intended behavior of the err_handler() callback?
>
>> +
>> /*
>> * TBD. Retrieve the error details and decide what action
>> * needs to be taken. One of the actions could be to pass
>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>> index e0aef82a89..faadce487c 100644
>> --- a/hw/vfio/pci.h
>> +++ b/hw/vfio/pci.h
>> @@ -146,6 +146,7 @@ struct VFIOPCIDevice {
>> EventNotifier err_notifier;
>> EventNotifier req_notifier;
>> int (*resetfn)(struct VFIOPCIDevice *);
>> + bool (*err_handler)(struct VFIOPCIDevice *, Error **);
>> uint32_t vendor_id;
>> uint32_t device_id;
>> uint32_t sub_vendor_id;
>
On 9/26/2025 12:40 AM, Cédric Le Goater wrote:
> On 9/26/25 06:57, Markus Armbruster wrote:
>> Farhan Ali <alifm@linux.ibm.com> writes:
>>
>>> Provide a vfio error handling callback, that can be used by devices to
>>> handle PCI errors for passthrough devices.
>>>
>>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>>> ---
>>> hw/vfio/pci.c | 8 ++++++++
>>> hw/vfio/pci.h | 1 +
>>> 2 files changed, 9 insertions(+)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index bc0b4c4d56..b02a974954 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -3063,11 +3063,19 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
>>> static void vfio_err_notifier_handler(void *opaque)
>>> {
>>> VFIOPCIDevice *vdev = opaque;
>>> + Error *err = NULL;
>>> if (!event_notifier_test_and_clear(&vdev->err_notifier)) {
>>> return;
>>> }
>>> + if (vdev->err_handler) {
>>> + if (vdev->err_handler(vdev, &err)) {
>>> + return;
>>> + }
>>> + error_report_err(err);
>>> + }
>>
>> This is unusual.
>
> and the compiler complains :
>
> ../hw/vfio/pci.c: In function ‘vfio_err_notifier_handler’:
> ../hw/vfio/pci.c:3076:9: error: dangling pointer to ‘err’ may be used
> [-Werror=dangling-pointer=]
> 3076 | error_report_err(err);
> | ^~~~~~~~~~~~~~~~~~~~~
> ../hw/vfio/pci.c:3066:12: note: ‘err’ declared here
> 3066 | Error *err = NULL;
> | ^~~
> cc1: all warnings being treated as errors
>
>
> C.
Compiling on s390x didn't cause this compiler error, but indeed
compiling on x86 it did.
Thanks
Farhan
>
>
>>
>> Functions taking an Error ** argument usually do so to report errors.
>> The rules spelled out in qapi/error.h apply. In particular:
>>
>> * - On success, the function should not touch *errp. On failure, it
>> * should set a new error, e.g. with error_setg(errp, ...), or
>> * propagate an existing one, e.g. with error_propagate(errp, ...).
>> *
>> * - Whenever practical, also return a value that indicates success /
>> * failure. This can make the error checking more concise, and can
>> * avoid useless error object creation and destruction. Note that
>> * we still have many functions returning void. We recommend
>> * • bool-valued functions return true on success / false on failure,
>>
>> If ->err_handler() behaved that way, it @err would be null after it
>> returns false. We'd call error_report_err(NULL), and crash.
>>
>> Functions with unusual behavior need a contract: a comment spelling out
>> their behavior.
>>
>> What is the intended behavior of the err_handler() callback?
>>
>>> +
>>> /*
>>> * TBD. Retrieve the error details and decide what action
>>> * needs to be taken. One of the actions could be to pass
>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>>> index e0aef82a89..faadce487c 100644
>>> --- a/hw/vfio/pci.h
>>> +++ b/hw/vfio/pci.h
>>> @@ -146,6 +146,7 @@ struct VFIOPCIDevice {
>>> EventNotifier err_notifier;
>>> EventNotifier req_notifier;
>>> int (*resetfn)(struct VFIOPCIDevice *);
>>> + bool (*err_handler)(struct VFIOPCIDevice *, Error **);
>>> uint32_t vendor_id;
>>> uint32_t device_id;
>>> uint32_t sub_vendor_id;
>>
>
>
© 2016 - 2026 Red Hat, Inc.