[PATCH v1 4/6] vfio-pci/zdev: Setup a zpci memory region for error information

Farhan Ali posted 6 patches 1 month, 3 weeks ago
There is a newer version of this series
[PATCH v1 4/6] vfio-pci/zdev: Setup a zpci memory region for error information
Posted by Farhan Ali 1 month, 3 weeks ago
For zpci devices, we have platform specific error information.
To enable recovery for passthrough devices, we want to expose
this error information to user space. We can expose this information
via a device specific (read only) memory region.

Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
 drivers/vfio/pci/vfio_pci_zdev.c | 76 ++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        |  2 +
 include/uapi/linux/vfio_zdev.h   |  5 +++
 3 files changed, 83 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index 2be37eab9279..818235b28caa 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -141,15 +141,91 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 	return ret;
 }
 
+static ssize_t vfio_pci_zdev_read_err_region(struct vfio_pci_core_device *vdev,
+					    char __user *buf, size_t count,
+					    loff_t *ppos, bool iswrite)
+{
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+	struct zpci_ccdf_err err;
+	struct vfio_device_zpci_err_region *region;
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	int head = 0;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	region = vdev->region[i].data;
+
+	if (!zdev)
+		return -ENODEV;
+
+	if (pos + count > vdev->region[i].size || iswrite)
+		return -EINVAL;
+
+	mutex_lock(&zdev->pending_errs_lock);
+	if (zdev->pending_errs.count) {
+		head = zdev->pending_errs.head % ZPCI_ERR_PENDING_MAX;
+		err = zdev->pending_errs.err[head];
+		region->pec = err.pec;
+		zdev->pending_errs.head++;
+		zdev->pending_errs.count--;
+		region->pending_errors = zdev->pending_errs.count;
+	}
+	mutex_unlock(&zdev->pending_errs_lock);
+
+	if (copy_to_user(buf, (void *)region + pos, count))
+		count = -EFAULT;
+
+	return count;
+}
+
+static void vfio_pci_zdev_release_err_region(struct vfio_pci_core_device *vdev,
+					     struct vfio_pci_region *region)
+{
+	struct vfio_device_zpci_err_region *err_region = region->data;
+
+	kfree(err_region);
+}
+
+static const struct vfio_pci_regops vfio_pci_zdev_err_regops = {
+	.rw = vfio_pci_zdev_read_err_region,
+	.release = vfio_pci_zdev_release_err_region
+};
+
+static int vfio_pci_zdev_setup_err_region(struct vfio_pci_core_device *vdev)
+{
+	struct vfio_device_zpci_err_region *region;
+	int ret = 0;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	ret = vfio_pci_core_register_dev_region(vdev,
+		PCI_VENDOR_ID_IBM | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_IBM_ZPCI_ERROR_REGION,
+		&vfio_pci_zdev_err_regops,
+		sizeof(*region), VFIO_REGION_INFO_FLAG_READ, region);
+
+	if (ret)
+		kfree(region);
+
+
+	return ret;
+}
+
 int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
 {
 	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+	int ret;
 
 	if (!zdev)
 		return -ENODEV;
 
 	zdev->mediated_recovery = true;
 
+	ret = vfio_pci_zdev_setup_err_region(vdev);
+	if (ret)
+		return ret;
+
 	if (!vdev->vdev.kvm)
 		return 0;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 75100bf009ba..452b87f3672e 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -369,6 +369,8 @@ struct vfio_region_info_cap_type {
  */
 #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
 
+#define VFIO_REGION_SUBTYPE_IBM_ZPCI_ERROR_REGION (2)
+
 /* sub-types for VFIO_REGION_TYPE_GFX */
 #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
 
diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
index 77f2aff1f27e..bcd06f334a42 100644
--- a/include/uapi/linux/vfio_zdev.h
+++ b/include/uapi/linux/vfio_zdev.h
@@ -82,4 +82,9 @@ struct vfio_device_info_cap_zpci_pfip {
 	__u8 pfip[];
 };
 
+struct vfio_device_zpci_err_region {
+	__u16 pec;
+	int pending_errors;
+};
+
 #endif
-- 
2.43.0
Re: [PATCH v1 4/6] vfio-pci/zdev: Setup a zpci memory region for error information
Posted by Alex Williamson 1 month, 3 weeks ago
On Wed, 13 Aug 2025 10:08:18 -0700
Farhan Ali <alifm@linux.ibm.com> wrote:
> diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
> index 77f2aff1f27e..bcd06f334a42 100644
> --- a/include/uapi/linux/vfio_zdev.h
> +++ b/include/uapi/linux/vfio_zdev.h
> @@ -82,4 +82,9 @@ struct vfio_device_info_cap_zpci_pfip {
>  	__u8 pfip[];
>  };
>  
> +struct vfio_device_zpci_err_region {
> +	__u16 pec;
> +	int pending_errors;
> +};
> +
>  #endif

If this is uapi it would hopefully include some description, but if
this is the extent of what can be read from the device specific region,
why not just return it via a DEVICE_FEATURE ioctl?  Thanks,

Alex
Re: [PATCH v1 4/6] vfio-pci/zdev: Setup a zpci memory region for error information
Posted by Farhan Ali 1 month, 3 weeks ago
On 8/13/2025 1:30 PM, Alex Williamson wrote:
> On Wed, 13 Aug 2025 10:08:18 -0700
> Farhan Ali <alifm@linux.ibm.com> wrote:
>> diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
>> index 77f2aff1f27e..bcd06f334a42 100644
>> --- a/include/uapi/linux/vfio_zdev.h
>> +++ b/include/uapi/linux/vfio_zdev.h
>> @@ -82,4 +82,9 @@ struct vfio_device_info_cap_zpci_pfip {
>>   	__u8 pfip[];
>>   };
>>   
>> +struct vfio_device_zpci_err_region {
>> +	__u16 pec;
>> +	int pending_errors;
>> +};
>> +
>>   #endif
> If this is uapi it would hopefully include some description, but if
> this is the extent of what can be read from the device specific region,
> why not just return it via a DEVICE_FEATURE ioctl?  Thanks,
>
> Alex
>
Yes, will add more details about the uapi. My thinking was based on how 
we expose some other vfio device information on s390x, such as SCHIB for 
vfio-ccw device.

I didn't think about the DEVICE_FEATURE ioctl. But looking into it, it 
looks like we would have to define a device feature (for eg: 
VFIO_DEVICE_FEATURE_ZPCI_ERROR), and expose this information via 
GET_FEATURE? If the preference is to use the DEVICE_FEATURE ioctl I can 
try that. Curious, any specific reason you prefer the DEVICE_FEATURE 
ioctl to the memory region?

Thanks
Farhan
Re: [PATCH v1 4/6] vfio-pci/zdev: Setup a zpci memory region for error information
Posted by Alex Williamson 1 month, 3 weeks ago
On Wed, 13 Aug 2025 14:25:59 -0700
Farhan Ali <alifm@linux.ibm.com> wrote:

> On 8/13/2025 1:30 PM, Alex Williamson wrote:
> > On Wed, 13 Aug 2025 10:08:18 -0700
> > Farhan Ali <alifm@linux.ibm.com> wrote:  
> >> diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
> >> index 77f2aff1f27e..bcd06f334a42 100644
> >> --- a/include/uapi/linux/vfio_zdev.h
> >> +++ b/include/uapi/linux/vfio_zdev.h
> >> @@ -82,4 +82,9 @@ struct vfio_device_info_cap_zpci_pfip {
> >>   	__u8 pfip[];
> >>   };
> >>   
> >> +struct vfio_device_zpci_err_region {
> >> +	__u16 pec;
> >> +	int pending_errors;
> >> +};
> >> +
> >>   #endif  
> > If this is uapi it would hopefully include some description, but if
> > this is the extent of what can be read from the device specific region,
> > why not just return it via a DEVICE_FEATURE ioctl?  Thanks,
> >
> > Alex
> >  
> Yes, will add more details about the uapi. My thinking was based on how 
> we expose some other vfio device information on s390x, such as SCHIB for 
> vfio-ccw device.
> 
> I didn't think about the DEVICE_FEATURE ioctl. But looking into it, it 
> looks like we would have to define a device feature (for eg: 
> VFIO_DEVICE_FEATURE_ZPCI_ERROR), and expose this information via 
> GET_FEATURE?

Yes, and there's a probe capability built-in to determine support.

> If the preference is to use the DEVICE_FEATURE ioctl I can 
> try that. Curious, any specific reason you prefer the DEVICE_FEATURE 
> ioctl to the memory region?

Given our current segmenting of the vfio device fd, we're using 40-bits
of address space for a 6-byte structure.  We're returning structured
data that has no requirement to be read at arbitrary offsets and
lengths.  For example, does this series really even handle a short
read?  We adjust counters for any read, it's more prone to those sorts
of errors.

Maybe if you actually wanted to allow the user to mmap the error array
buffer and move the head as they read data while the kernel
asynchronously fills from the tail, it might make sense to use a
region, but as used here I don't think it's the right interface.
Thanks,

Alex