For a passthrough device we need co-operation from user space to recover
the device. This requires bubbling up any error information to user
space. Let's store this error information for passthrough devices, so it
can be retrieved later.
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
arch/s390/include/asm/pci.h | 28 ++++++++++
arch/s390/pci/pci.c | 1 +
arch/s390/pci/pci_event.c | 95 +++++++++++++++++++-------------
drivers/vfio/pci/vfio_pci_zdev.c | 2 +
4 files changed, 88 insertions(+), 38 deletions(-)
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index f47f62fc3bfd..72e05af90e08 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -116,6 +116,31 @@ struct zpci_bus {
enum pci_bus_speed max_bus_speed;
};
+/* Content Code Description for PCI Function Error */
+struct zpci_ccdf_err {
+ u32 reserved1;
+ u32 fh; /* function handle */
+ u32 fid; /* function id */
+ u32 ett : 4; /* expected table type */
+ u32 mvn : 12; /* MSI vector number */
+ u32 dmaas : 8; /* DMA address space */
+ u32 reserved2 : 6;
+ u32 q : 1; /* event qualifier */
+ u32 rw : 1; /* read/write */
+ u64 faddr; /* failing address */
+ u32 reserved3;
+ u16 reserved4;
+ u16 pec; /* PCI event code */
+} __packed;
+
+#define ZPCI_ERR_PENDING_MAX 16
+struct zpci_ccdf_pending {
+ size_t count;
+ int head;
+ int tail;
+ struct zpci_ccdf_err err[ZPCI_ERR_PENDING_MAX];
+};
+
/* Private data per function */
struct zpci_dev {
struct zpci_bus *zbus;
@@ -191,6 +216,8 @@ struct zpci_dev {
struct iommu_domain *s390_domain; /* attached IOMMU domain */
struct kvm_zdev *kzdev;
struct mutex kzdev_lock;
+ struct zpci_ccdf_pending pending_errs;
+ struct mutex pending_errs_lock;
spinlock_t dom_lock; /* protect s390_domain change */
};
@@ -316,6 +343,7 @@ void zpci_debug_exit_device(struct zpci_dev *);
int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *);
int zpci_clear_error_state(struct zpci_dev *zdev);
int zpci_reset_load_store_blocked(struct zpci_dev *zdev);
+void zpci_cleanup_pending_errors(struct zpci_dev *zdev);
#ifdef CONFIG_NUMA
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 5baeb5f6f674..02c7b06cfd0e 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -896,6 +896,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
mutex_init(&zdev->state_lock);
mutex_init(&zdev->fmb_lock);
mutex_init(&zdev->kzdev_lock);
+ mutex_init(&zdev->pending_errs_lock);
return zdev;
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index 541d536be052..ac527410812b 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -18,23 +18,6 @@
#include "pci_bus.h"
#include "pci_report.h"
-/* Content Code Description for PCI Function Error */
-struct zpci_ccdf_err {
- u32 reserved1;
- u32 fh; /* function handle */
- u32 fid; /* function id */
- u32 ett : 4; /* expected table type */
- u32 mvn : 12; /* MSI vector number */
- u32 dmaas : 8; /* DMA address space */
- u32 : 6;
- u32 q : 1; /* event qualifier */
- u32 rw : 1; /* read/write */
- u64 faddr; /* failing address */
- u32 reserved3;
- u16 reserved4;
- u16 pec; /* PCI event code */
-} __packed;
-
/* Content Code Description for PCI Function Availability */
struct zpci_ccdf_avail {
u32 reserved1;
@@ -76,6 +59,41 @@ static bool is_driver_supported(struct pci_driver *driver)
return true;
}
+static void zpci_store_pci_error(struct pci_dev *pdev,
+ struct zpci_ccdf_err *ccdf)
+{
+ struct zpci_dev *zdev = to_zpci(pdev);
+ int i;
+
+ mutex_lock(&zdev->pending_errs_lock);
+ if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
+ pr_err("%s: Cannot store PCI error info for device",
+ pci_name(pdev));
+ mutex_unlock(&zdev->pending_errs_lock);
+ return;
+ }
+
+ i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
+ memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
+ zdev->pending_errs.tail++;
+ zdev->pending_errs.count++;
+ mutex_unlock(&zdev->pending_errs_lock);
+}
+
+void zpci_cleanup_pending_errors(struct zpci_dev *zdev)
+{
+ struct pci_dev *pdev = NULL;
+
+ mutex_lock(&zdev->pending_errs_lock);
+ pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
+ if (zdev->pending_errs.count)
+ pr_err("%s: Unhandled PCI error events count=%zu",
+ pci_name(pdev), zdev->pending_errs.count);
+ memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
+ mutex_unlock(&zdev->pending_errs_lock);
+}
+EXPORT_SYMBOL_GPL(zpci_cleanup_pending_errors);
+
static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
struct pci_driver *driver)
{
@@ -169,7 +187,8 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
* and the platform determines which functions are affected for
* multi-function devices.
*/
-static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
+static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev,
+ struct zpci_ccdf_err *ccdf)
{
pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
struct zpci_dev *zdev = to_zpci(pdev);
@@ -188,13 +207,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
}
pdev->error_state = pci_channel_io_frozen;
- if (needs_mediated_recovery(pdev)) {
- pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
- pci_name(pdev));
- status_str = "failed (pass-through)";
- goto out_unlock;
- }
-
driver = to_pci_driver(pdev->dev.driver);
if (!is_driver_supported(driver)) {
if (!driver) {
@@ -210,12 +222,22 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
goto out_unlock;
}
+ if (needs_mediated_recovery(pdev))
+ zpci_store_pci_error(pdev, ccdf);
+
ers_res = zpci_event_notify_error_detected(pdev, driver);
if (ers_result_indicates_abort(ers_res)) {
status_str = "failed (abort on detection)";
goto out_unlock;
}
+ if (needs_mediated_recovery(pdev)) {
+ pr_info("%s: Recovering passthrough device\n", pci_name(pdev));
+ ers_res = PCI_ERS_RESULT_RECOVERED;
+ status_str = "in progress";
+ goto out_unlock;
+ }
+
if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
ers_res = zpci_event_do_error_state_clear(pdev, driver);
if (ers_result_indicates_abort(ers_res)) {
@@ -258,25 +280,20 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
* @pdev: PCI function for which to report
* @es: PCI channel failure state to report
*/
-static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
+static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
+ struct zpci_ccdf_err *ccdf)
{
struct pci_driver *driver;
pci_dev_lock(pdev);
pdev->error_state = es;
- /**
- * While vfio-pci's error_detected callback notifies user-space QEMU
- * reacts to this by freezing the guest. In an s390 environment PCI
- * errors are rarely fatal so this is overkill. Instead in the future
- * we will inject the error event and let the guest recover the device
- * itself.
- */
+
if (needs_mediated_recovery(pdev))
- goto out;
+ zpci_store_pci_error(pdev, ccdf);
driver = to_pci_driver(pdev->dev.driver);
if (driver && driver->err_handler && driver->err_handler->error_detected)
driver->err_handler->error_detected(pdev, pdev->error_state);
-out:
+
pci_dev_unlock(pdev);
}
@@ -312,6 +329,7 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n",
pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
+
if (!pdev)
goto no_pdev;
@@ -322,12 +340,13 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
break;
case 0x0040: /* Service Action or Error Recovery Failed */
case 0x003b:
- zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
+ zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf);
break;
default: /* PCI function left in the error state attempt to recover */
- ers_res = zpci_event_attempt_error_recovery(pdev);
+ ers_res = zpci_event_attempt_error_recovery(pdev, ccdf);
if (ers_res != PCI_ERS_RESULT_RECOVERED)
- zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
+ zpci_event_io_failure(pdev, pci_channel_io_perm_failure,
+ ccdf);
break;
}
pci_dev_put(pdev);
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index a7bc23ce8483..2be37eab9279 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -168,6 +168,8 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
zdev->mediated_recovery = false;
+ zpci_cleanup_pending_errors(zdev);
+
if (!vdev->vdev.kvm)
return;
--
2.43.0
On Thu, 2025-09-11 at 11:33 -0700, Farhan Ali wrote:
> For a passthrough device we need co-operation from user space to recover
> the device. This would require to bubble up any error information to user
> space. Let's store this error information for passthrough devices, so it
> can be retrieved later.
>
> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
> ---

--- snip ---

> +#define ZPCI_ERR_PENDING_MAX 16

16 pending error events sounds like a lot for a single device. This also
means that the array alone already spans more than 2 cache lines (256
byte on s390x). I can't imagine that we'd ever have that many errors
pending. This is especially true since a device already in an error
state would be the least likely to cause more errors. We have seen cases
of 2 errors in the past, so maybe 4 would give us good head room?

> +struct zpci_ccdf_pending {
> + size_t count;
> + int head;
> + int tail;
> + struct zpci_ccdf_err err[ZPCI_ERR_PENDING_MAX];
> +};

--- snip ---

> +static void zpci_store_pci_error(struct pci_dev *pdev,
> + struct zpci_ccdf_err *ccdf)
> +{
> + struct zpci_dev *zdev = to_zpci(pdev);
> + int i;
> +
> + mutex_lock(&zdev->pending_errs_lock);
> + if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
> + pr_err("%s: Cannot store PCI error info for device",
> + pci_name(pdev));
> + mutex_unlock(&zdev->pending_errs_lock);

I think the error message should state that the maximum number of
pending error events has been queued. As with ZPCI_ERR_PENDING_MAX I
really don't think we would reach this even at 4 vs 16 max pending, but
if we do I agree that having the first couple of errors saved is
probably nice for analysis.

> + return;
> + }
> +
> + i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
> + memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
> + zdev->pending_errs.tail++;
> + zdev->pending_errs.count++;

With tail being int this would be undefined behavior if it ever
overflowed. Since the array is of fixed length that is always smaller
than 256, how about making tail, head, and count u8? The memory saving
doesn't matter but at least overflow becomes well defined.

> + mutex_unlock(&zdev->pending_errs_lock);
> +}
> +
> +void zpci_cleanup_pending_errors(struct zpci_dev *zdev)
> +{
> + struct pci_dev *pdev = NULL;
> +
> + mutex_lock(&zdev->pending_errs_lock);
> + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
> + if (zdev->pending_errs.count)
> + pr_err("%s: Unhandled PCI error events count=%zu",
> + pci_name(pdev), zdev->pending_errs.count);

I think this could be a zpci_dbg(). That way you also don't need the
pci_get_slot(), which is also buggy as it misses a pci_dev_put(). The
message also doesn't seem useful for the user. As I understand it this
would happen if a vfio-pci user dies without handling all the error
events, but then vfio-pci will also reset the slot on closing of the
fds, no? So the device will get reset anyway.

> + memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));

If this goes wrong and we subsequently crash or take a live memory dump
I'd prefer to have bread crumbs such as the errors that weren't cleaned
up. Wouldn't it be enough to just set the count to zero? For debug the
original count will be in s390dbf. Also maybe it would make sense to
pull the zdev->mediated_recovery clearing in here?

--- snip ---

> + if (needs_mediated_recovery(pdev)) {
> + pr_info("%s: Recovering passthrough device\n", pci_name(pdev));

I'd say technically we're not recovering the device here but rather
leaving it alone so user-space can take over the recovery. Maybe this
could be made explicit in the message. Something like:

"%s: Leaving recovery of pass-through device to user-space\n"

--- snip ---

> pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n",
> pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid);
>
> +
> if (!pdev)

Nit, stray empty line.

--- snip ---
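[Editor's note: for reference, a minimal sketch of the storage path with
the suggestions above folded in (an array size of 4, u8 bookkeeping
fields, and a more explicit overflow message). This is only an
illustration of the review comments, not the posted patch; the final
field names, size, and wording remain the author's call.]

#define ZPCI_ERR_PENDING_MAX 4

struct zpci_ccdf_pending {
	u8 count;	/* queued events, <= ZPCI_ERR_PENDING_MAX */
	u8 head;	/* next entry to hand out */
	u8 tail;	/* next free slot */
	struct zpci_ccdf_err err[ZPCI_ERR_PENDING_MAX];
};

static void zpci_store_pci_error(struct pci_dev *pdev,
				 struct zpci_ccdf_err *ccdf)
{
	struct zpci_dev *zdev = to_zpci(pdev);
	u8 i;

	mutex_lock(&zdev->pending_errs_lock);
	if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
		pr_err("%s: Maximum number of pending PCI error events already queued\n",
		       pci_name(pdev));
		mutex_unlock(&zdev->pending_errs_lock);
		return;
	}
	i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
	memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(*ccdf));
	/*
	 * u8 arithmetic wraps at 256; with a power-of-two array size the
	 * masked index stays consistent across the wrap, so the increment
	 * is well defined.
	 */
	zdev->pending_errs.tail++;
	zdev->pending_errs.count++;
	mutex_unlock(&zdev->pending_errs_lock);
}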
On 9/15/2025 4:42 AM, Niklas Schnelle wrote:
> On Thu, 2025-09-11 at 11:33 -0700, Farhan Ali wrote:
>> For a passthrough device we need co-operation from user space to recover
>> the device. This would require to bubble up any error information to user
>> space. Let's store this error information for passthrough devices, so it
>> can be retrieved later.
>>
>> Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
>> ---

--- snip ---

>> +#define ZPCI_ERR_PENDING_MAX 16
> 16 pending error events sounds like a lot for a single device. This
> also means that the array alone already spans more than 2 cache lines
> (256 byte on s390x). I can't imagine that we'd ever have that many
> errors pending. This is especially true since a device already in an
> error state would be the least likely to cause more errors. We have
> seen cases of 2 errors in the past, so maybe 4 would give us good head
> room?

Yeah, I wasn't sure how much headroom we would need. As you said, having
more than 2 is rare, so 4 should give us enough room.

--- snip ---

>> + if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
>> + pr_err("%s: Cannot store PCI error info for device",
>> + pci_name(pdev));
>> + mutex_unlock(&zdev->pending_errs_lock);
> I think the error message should state that the maximum number of
> pending error events has been queued. As with ZPCI_ERR_PENDING_MAX I
> really don't think we would reach this even at 4 vs 16 max pending, but
> if we do I agree that having the first couple of errors saved is
> probably nice for analysis.

Ack, will change the error message.

>> + i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
>> + memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
>> + zdev->pending_errs.tail++;
>> + zdev->pending_errs.count++;
> With tail being int this would be undefined behavior if it ever
> overflowed. Since the array is of fixed length that is always smaller
> than 256, how about making tail, head, and count u8? The memory saving
> doesn't matter but at least overflow becomes well defined.

Yeah, this makes sense, will update this.

>> +void zpci_cleanup_pending_errors(struct zpci_dev *zdev)
>> +{
>> + struct pci_dev *pdev = NULL;
>> +
>> + mutex_lock(&zdev->pending_errs_lock);
>> + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
>> + if (zdev->pending_errs.count)
>> + pr_err("%s: Unhandled PCI error events count=%zu",
>> + pci_name(pdev), zdev->pending_errs.count);
> I think this could be a zpci_dbg(). That way you also don't need the
> pci_get_slot(), which is also buggy as it misses a pci_dev_put(). The
> message also doesn't seem useful for the user. As I understand it this
> would happen if a vfio-pci user dies without handling all the error
> events, but then vfio-pci will also reset the slot on closing of the
> fds, no? So the device will get reset anyway.

Right, the device will reset anyway. But I wanted to at least give an
indication to the user that some events were not handled correctly.
Maybe pr_err is a little extreme, so I can convert it to a warn? This
should also be rare, as well-behaving applications shouldn't do this. I
am fine with zpci_dbg as well, it's just that the kernel needs to be in
debug mode for us to get this info.

>> + memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
> If this goes wrong and we subsequently crash or take a live memory dump
> I'd prefer to have bread crumbs such as the errors that weren't cleaned
> up. Wouldn't it be enough to just set the count to zero? For debug the
> original count will be in s390dbf.

I think setting count to zero should be enough, but I am wary about
keeping stale state around. How about just logging the count that was
not handled in s390dbf? I think we already dump the ccdf in s390dbf if
we get any error event. So it should be enough for us to trace back the
unhandled error events?

> Also maybe it would make sense
> to pull the zdev->mediated_recovery clearing in here?

I would like to keep the mediated_recovery flag separate from just
cleaning up the errors. The flag gets initialized when we open the vfio
device, and so having the flag cleared on close makes it easier to track
this IMHO.

--- snip ---

>> + if (needs_mediated_recovery(pdev)) {
>> + pr_info("%s: Recovering passthrough device\n", pci_name(pdev));
> I'd say technically we're not recovering the device here but rather
> leaving it alone so user-space can take over the recovery. Maybe this
> could be made explicit in the message. Something like:
>
> "%s: Leaving recovery of pass-through device to user-space\n"

Sure, will update.

Thanks
Farhan

--- snip ---
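[Editor's note: to make the direction discussed above concrete, a rough
sketch of the cleanup that only resets the bookkeeping (so the stored
entries stay around as bread crumbs for dumps) and logs by FID, avoiding
the pci_get_slot()/pci_dev_put() pair entirely. The pr_warn() level and
message wording are placeholders, since that part is still being
discussed in the follow-up below.]

void zpci_cleanup_pending_errors(struct zpci_dev *zdev)
{
	mutex_lock(&zdev->pending_errs_lock);
	if (zdev->pending_errs.count)
		pr_warn("zPCI function %08x: %u unhandled PCI error events on close\n",
			zdev->fid, zdev->pending_errs.count);
	/*
	 * Reset only the bookkeeping; the stale ccdf entries remain in
	 * memory as bread crumbs for crash or live-dump analysis.
	 */
	zdev->pending_errs.count = 0;
	zdev->pending_errs.head = 0;
	zdev->pending_errs.tail = 0;
	mutex_unlock(&zdev->pending_errs_lock);
}
EXPORT_SYMBOL_GPL(zpci_cleanup_pending_errors);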
On Mon, 2025-09-15 at 11:12 -0700, Farhan Ali wrote:
> On 9/15/2025 4:42 AM, Niklas Schnelle wrote:
> > On Thu, 2025-09-11 at 11:33 -0700, Farhan Ali wrote:
> > > For a passthrough device we need co-operation from user space to recover
> > > the device. This would require to bubble up any error information to user
> > > space. Let's store this error information for passthrough devices, so it
> > > can be retrieved later.
> > >
> > > Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
> > > ---

--- snip ---

> > > + mutex_unlock(&zdev->pending_errs_lock);
> > > +}
> > > +
> > > +void zpci_cleanup_pending_errors(struct zpci_dev *zdev)
> > > +{
> > > + struct pci_dev *pdev = NULL;
> > > +
> > > + mutex_lock(&zdev->pending_errs_lock);
> > > + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
> > > + if (zdev->pending_errs.count)
> > > + pr_err("%s: Unhandled PCI error events count=%zu",
> > > + pci_name(pdev), zdev->pending_errs.count);
> > I think this could be a zpci_dbg(). That way you also don't need the
> > pci_get_slot(), which is also buggy as it misses a pci_dev_put(). The
> > message also doesn't seem useful for the user. As I understand it this
> > would happen if a vfio-pci user dies without handling all the error
> > events, but then vfio-pci will also reset the slot on closing of the
> > fds, no? So the device will get reset anyway.
>
> Right, the device will reset anyway. But I wanted to at least give an
> indication to the user that some events were not handled correctly.
> Maybe pr_err is a little extreme, so I can convert it to a warn? This
> should also be rare, as well-behaving applications shouldn't do this. I
> am fine with zpci_dbg as well, it's just that the kernel needs to be in
> debug mode for us to get this info.

No, zpci_dbg() logs to /sys/kernel/debug/s390dbf/pci_msg/sprintf without
need for debug mode. I'm also ok with a pr_warn() or maybe even
pr_info(). I can see your argument that this may be useful to have in
dmesg, e.g. when debugging a user-space driver without having to know
about s390-specific debug aids.

> > > + memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
> > If this goes wrong and we subsequently crash or take a live memory dump
> > I'd prefer to have bread crumbs such as the errors that weren't cleaned
> > up. Wouldn't it be enough to just set the count to zero? For debug the
> > original count will be in s390dbf.
>
> I think setting count to zero should be enough, but I am wary about
> keeping stale state around. How about just logging the count that was
> not handled in s390dbf? I think we already dump the ccdf in s390dbf if
> we get any error event. So it should be enough for us to trace back the
> unhandled error events?
>
> > Also maybe it would make sense
> > to pull the zdev->mediated_recovery clearing in here?
>
> I would like to keep the mediated_recovery flag separate from just
> cleaning up the errors. The flag gets initialized when we open the vfio
> device, and so having the flag cleared on close makes it easier to track
> this IMHO.

Ok, yeah, I can see the symmetry argument.
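[Editor's note: if the zpci_dbg() route is chosen instead of, or in
addition to, a pr_warn(), the unhandled count could go to the pci_msg
s390dbf area along the lines of the fragment below. This assumes the
existing zpci_dbg(level, fmt, ...) helper from asm/pci_debug.h and is
only an illustration of the suggestion, not part of the patch.]

	if (zdev->pending_errs.count)
		zpci_dbg(3, "fid:%x unhandled pending errors:%u\n",
			 zdev->fid, zdev->pending_errs.count);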