Use the associated vfio feature ioctl to enable interpretation for devices
when requested. As part of this process, we must use the host function
handle rather than a QEMU-generated one -- this is provided as part of the
ioctl payload.
Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++-
hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++-
hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++
include/hw/s390x/s390-pci-bus.h | 1 +
include/hw/s390x/s390-pci-vfio.h | 15 +++++++
5 files changed, 199 insertions(+), 2 deletions(-)
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 01b58ebc70..a39ccfee05 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -971,12 +971,58 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr)
}
}
+static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev)
+{
+ uint32_t idx;
+ int rc;
+
+ rc = s390_pci_probe_interp(pbdev);
+ if (rc) {
+ return rc;
+ }
+
+ rc = s390_pci_update_passthrough_fh(pbdev);
+ if (rc) {
+ return rc;
+ }
+
+ /*
+ * The host device is already in an enabled state, but we always present
+ * the initial device state to the guest as disabled (ZPCI_FS_DISABLED).
+ * Therefore, mask off the enable bit from the passthrough handle until
+ * the guest issues a CLP SET PCI FN later to enable the device.
+ */
+ pbdev->fh &= ~FH_MASK_ENABLE;
+
+ /* Next, see if the idx is already in-use */
+ idx = pbdev->fh & FH_MASK_INDEX;
+ if (pbdev->idx != idx) {
+ if (s390_pci_find_dev_by_idx(s, idx)) {
+ return -EINVAL;
+ }
+ /*
+ * Update the idx entry with the passed through idx
+ * If the relinquished idx is lower than next_idx, use it
+ * to replace next_idx
+ */
+ g_hash_table_remove(s->zpci_table, &pbdev->idx);
+ if (idx < s->next_idx) {
+ s->next_idx = idx;
+ }
+ pbdev->idx = idx;
+ g_hash_table_insert(s->zpci_table, &pbdev->idx, pbdev);
+ }
+
+ return 0;
+}
+
static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
Error **errp)
{
S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev);
PCIDevice *pdev = NULL;
S390PCIBusDevice *pbdev = NULL;
+ int rc;
if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
PCIBridge *pb = PCI_BRIDGE(dev);
@@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
set_pbdev_info(pbdev);
if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
- pbdev->fh |= FH_SHM_VFIO;
+ /*
+ * By default, interpretation is always requested; if the available
+ * facilities indicate it is not available, fallback to the
+ * intercept model.
+ */
+ if (pbdev->interp && !s390_has_feat(S390_FEAT_ZPCI_INTERP)) {
+ DPRINTF("zPCI interpretation facilities missing.\n");
+ pbdev->interp = false;
+ }
+ if (pbdev->interp) {
+ rc = s390_pci_interp_plug(s, pbdev);
+ if (rc) {
+ error_setg(errp, "zpci interp plug failed: %d", rc);
+ return;
+ }
+ }
pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev);
/* Fill in CLP information passed via the vfio region */
s390_pci_get_clp_info(pbdev);
+ if (!pbdev->interp) {
+ /* Do vfio passthrough but intercept for I/O */
+ pbdev->fh |= FH_SHM_VFIO;
+ }
} else {
pbdev->fh |= FH_SHM_EMUL;
+ /* Always intercept emulated devices */
+ pbdev->interp = false;
}
if (s390_pci_msix_init(pbdev)) {
@@ -1360,6 +1427,7 @@ static Property s390_pci_device_properties[] = {
DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED),
DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid),
DEFINE_PROP_STRING("target", S390PCIBusDevice, target),
+ DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 6d400d4147..e9a0dc12e4 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -18,6 +18,7 @@
#include "sysemu/hw_accel.h"
#include "hw/s390x/s390-pci-inst.h"
#include "hw/s390x/s390-pci-bus.h"
+#include "hw/s390x/s390-pci-vfio.h"
#include "hw/s390x/tod.h"
#ifndef DEBUG_S390PCI_INST
@@ -156,6 +157,47 @@ out:
return rc;
}
+static int clp_enable_interp(S390PCIBusDevice *pbdev)
+{
+ int rc;
+
+ rc = s390_pci_set_interp(pbdev, true);
+ if (rc) {
+ DPRINTF("Failed to enable interpretation\n");
+ return rc;
+ }
+ rc = s390_pci_update_passthrough_fh(pbdev);
+ if (rc) {
+ DPRINTF("Failed to update passthrough fh\n");
+ return rc;
+ }
+ if (!(pbdev->fh & FH_MASK_ENABLE)) {
+ DPRINTF("Passthrough handle is not enabled\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int clp_disable_interp(S390PCIBusDevice *pbdev)
+{
+ int rc;
+
+ rc = s390_pci_set_interp(pbdev, false);
+ if (rc) {
+ DPRINTF("Failed to disable interpretation\n");
+ return rc;
+ }
+
+ rc = s390_pci_update_passthrough_fh(pbdev);
+ if (rc) {
+ DPRINTF("Failed to update passthrough fh\n");
+ return rc;
+ }
+
+ return 0;
+}
+
int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra)
{
ClpReqHdr *reqh;
@@ -246,7 +288,19 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra)
goto out;
}
- pbdev->fh |= FH_MASK_ENABLE;
+ /*
+ * If interpretation is specified, attempt to enable this now and
+ * update with the host fh
+ */
+ if (pbdev->interp) {
+ if (clp_enable_interp(pbdev)) {
+ stw_p(&ressetpci->hdr.rsp, CLP_RC_SETPCIFN_ERR);
+ goto out;
+ }
+ } else {
+ pbdev->fh |= FH_MASK_ENABLE;
+ }
+
pbdev->state = ZPCI_FS_ENABLED;
stl_p(&ressetpci->fh, pbdev->fh);
stw_p(&ressetpci->hdr.rsp, CLP_RC_OK);
@@ -257,6 +311,13 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra)
goto out;
}
device_legacy_reset(DEVICE(pbdev));
+ if (pbdev->interp) {
+ if (clp_disable_interp(pbdev)) {
+ stw_p(&ressetpci->hdr.rsp, CLP_RC_SETPCIFN_ERR);
+ goto out;
+ }
+ }
+ /* Mask off the enabled bit for interpreted devices too */
pbdev->fh &= ~FH_MASK_ENABLE;
pbdev->state = ZPCI_FS_DISABLED;
stl_p(&ressetpci->fh, pbdev->fh);
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index 6f80a47e29..2cab3a9e89 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -97,6 +97,58 @@ void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt)
}
}
+int s390_pci_probe_interp(S390PCIBusDevice *pbdev)
+{
+ VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev);
+ struct vfio_device_feature feat = {
+ .argsz = sizeof(struct vfio_device_feature),
+ .flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_ZPCI_INTERP
+ };
+
+ return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, &feat);
+}
+
+int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable)
+{
+ VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev);
+ struct vfio_device_zpci_interp *data;
+ int size = sizeof(struct vfio_device_feature) + sizeof(*data);
+ g_autofree struct vfio_device_feature *feat = g_malloc0(size);
+
+ feat->argsz = size;
+ feat->flags = VFIO_DEVICE_FEATURE_SET + VFIO_DEVICE_FEATURE_ZPCI_INTERP;
+
+ data = (struct vfio_device_zpci_interp *)&feat->data;
+ if (enable) {
+ data->flags = VFIO_DEVICE_ZPCI_FLAG_INTERP;
+ } else {
+ data->flags = 0;
+ }
+
+ return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat);
+}
+
+int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev)
+{
+ VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev);
+ struct vfio_device_zpci_interp *data;
+ int size = sizeof(struct vfio_device_feature) + sizeof(*data);
+ g_autofree struct vfio_device_feature *feat = g_malloc0(size);
+ int rc;
+
+ feat->argsz = size;
+ feat->flags = VFIO_DEVICE_FEATURE_GET + VFIO_DEVICE_FEATURE_ZPCI_INTERP;
+
+ rc = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat);
+ if (rc) {
+ return rc;
+ }
+
+ data = (struct vfio_device_zpci_interp *)&feat->data;
+ pbdev->fh = data->fh;
+ return 0;
+}
+
static void s390_pci_read_base(S390PCIBusDevice *pbdev,
struct vfio_device_info *info)
{
diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
index da3cde2bb4..a9843dfe97 100644
--- a/include/hw/s390x/s390-pci-bus.h
+++ b/include/hw/s390x/s390-pci-bus.h
@@ -350,6 +350,7 @@ struct S390PCIBusDevice {
IndAddr *indicator;
bool pci_unplug_request_processed;
bool unplug_requested;
+ bool interp;
QTAILQ_ENTRY(S390PCIBusDevice) link;
};
diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h
index ff708aef50..42533e38f7 100644
--- a/include/hw/s390x/s390-pci-vfio.h
+++ b/include/hw/s390x/s390-pci-vfio.h
@@ -20,6 +20,9 @@ bool s390_pci_update_dma_avail(int fd, unsigned int *avail);
S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
S390PCIBusDevice *pbdev);
void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt);
+int s390_pci_probe_interp(S390PCIBusDevice *pbdev);
+int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable);
+int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev);
void s390_pci_get_clp_info(S390PCIBusDevice *pbdev);
#else
static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail)
@@ -33,6 +36,18 @@ static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
}
static inline void s390_pci_end_dma_count(S390pciState *s,
S390PCIDMACount *cnt) { }
+int s390_pci_probe_interp(S390PCIBusDevice *pbdev)
+{
+ return -EINVAL;
+}
+static inline int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable)
+{
+ return -EINVAL;
+}
+static inline int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev)
+{
+ return -EINVAL;
+}
static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { }
#endif
--
2.27.0
On 14/01/2022 21.38, Matthew Rosato wrote:
> Use the associated vfio feature ioctl to enable interpretation for devices
> when requested. As part of this process, we must use the host function
> handle rather than a QEMU-generated one -- this is provided as part of the
> ioctl payload.
>
> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
> hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++-
> hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++-
> hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++
> include/hw/s390x/s390-pci-bus.h | 1 +
> include/hw/s390x/s390-pci-vfio.h | 15 +++++++
> 5 files changed, 199 insertions(+), 2 deletions(-)
>
> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
> index 01b58ebc70..a39ccfee05 100644
> --- a/hw/s390x/s390-pci-bus.c
> +++ b/hw/s390x/s390-pci-bus.c
> @@ -971,12 +971,58 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr)
> }
> }
>
> +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev)
> +{
> + uint32_t idx;
> + int rc;
> +
> + rc = s390_pci_probe_interp(pbdev);
> + if (rc) {
> + return rc;
> + }
> +
> + rc = s390_pci_update_passthrough_fh(pbdev);
> + if (rc) {
> + return rc;
> + }
> +
> + /*
> + * The host device is already in an enabled state, but we always present
> + * the initial device state to the guest as disabled (ZPCI_FS_DISABLED).
> + * Therefore, mask off the enable bit from the passthrough handle until
> + * the guest issues a CLP SET PCI FN later to enable the device.
> + */
> + pbdev->fh &= ~FH_MASK_ENABLE;
> +
> + /* Next, see if the idx is already in-use */
> + idx = pbdev->fh & FH_MASK_INDEX;
> + if (pbdev->idx != idx) {
> + if (s390_pci_find_dev_by_idx(s, idx)) {
> + return -EINVAL;
> + }
> + /*
> + * Update the idx entry with the passed through idx
> + * If the relinquished idx is lower than next_idx, use it
> + * to replace next_idx
> + */
> + g_hash_table_remove(s->zpci_table, &pbdev->idx);
> + if (idx < s->next_idx) {
> + s->next_idx = idx;
> + }
> + pbdev->idx = idx;
> + g_hash_table_insert(s->zpci_table, &pbdev->idx, pbdev);
> + }
> +
> + return 0;
> +}
> +
> static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> Error **errp)
> {
> S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev);
> PCIDevice *pdev = NULL;
> S390PCIBusDevice *pbdev = NULL;
> + int rc;
>
> if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
> PCIBridge *pb = PCI_BRIDGE(dev);
> @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> set_pbdev_info(pbdev);
>
> if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
> - pbdev->fh |= FH_SHM_VFIO;
> + /*
> + * By default, interpretation is always requested; if the available
> + * facilities indicate it is not available, fallback to the
> + * intercept model.
> + */
> + if (pbdev->interp && !s390_has_feat(S390_FEAT_ZPCI_INTERP)) {
> + DPRINTF("zPCI interpretation facilities missing.\n");
> + pbdev->interp = false;
> + }
Wrong indentation in the above three lines.
> + if (pbdev->interp) {
> + rc = s390_pci_interp_plug(s, pbdev);
> + if (rc) {
> + error_setg(errp, "zpci interp plug failed: %d", rc);
The error message is a little bit scarce for something that might be
presented to the user - maybe write at least "interpretation" instead of
"interp" ?
> + return;
> + }
> + }
> pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev);
> /* Fill in CLP information passed via the vfio region */
> s390_pci_get_clp_info(pbdev);
> + if (!pbdev->interp) {
> + /* Do vfio passthrough but intercept for I/O */
> + pbdev->fh |= FH_SHM_VFIO;
> + }
> } else {
> pbdev->fh |= FH_SHM_EMUL;
> + /* Always intercept emulated devices */
> + pbdev->interp = false;
> }
Thomas
On 1/17/22 9:51 AM, Thomas Huth wrote:
> On 14/01/2022 21.38, Matthew Rosato wrote:
...
>> static void s390_pcihost_plug(HotplugHandler *hotplug_dev,
>> DeviceState *dev,
>> Error **errp)
>> {
>> S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev);
>> PCIDevice *pdev = NULL;
>> S390PCIBusDevice *pbdev = NULL;
>> + int rc;
>> if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
>> PCIBridge *pb = PCI_BRIDGE(dev);
>> @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler
>> *hotplug_dev, DeviceState *dev,
>> set_pbdev_info(pbdev);
>> if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
>> - pbdev->fh |= FH_SHM_VFIO;
>> + /*
>> + * By default, interpretation is always requested; if the
>> available
>> + * facilities indicate it is not available, fallback to the
>> + * intercept model.
>> + */
>> + if (pbdev->interp &&
>> !s390_has_feat(S390_FEAT_ZPCI_INTERP)) {
>> + DPRINTF("zPCI interpretation facilities
>> missing.\n");
>> + pbdev->interp = false;
>> + }
>
> Wrong indentation in the above three lines.
Thanks
>
>> + if (pbdev->interp) {
>> + rc = s390_pci_interp_plug(s, pbdev);
>> + if (rc) {
>> + error_setg(errp, "zpci interp plug failed: %d", rc);
>
> The error message is a little bit scarce for something that might be
> presented to the user - maybe write at least "interpretation" instead of
> "interp" ?
>
Good point, I'll re-word to something like "Plug failed for zPCI device
in interpretation mode: %d"
>> + return;
>> + }
>> + }
>> pbdev->iommu->dma_limit = s390_pci_start_dma_count(s,
>> pbdev);
>> /* Fill in CLP information passed via the vfio region */
>> s390_pci_get_clp_info(pbdev);
>> + if (!pbdev->interp) {
>> + /* Do vfio passthrough but intercept for I/O */
>> + pbdev->fh |= FH_SHM_VFIO;
>> + }
>> } else {
>> pbdev->fh |= FH_SHM_EMUL;
>> + /* Always intercept emulated devices */
>> + pbdev->interp = false;
>> }
>
> Thomas
>
On 14/01/2022 21.38, Matthew Rosato wrote:
> Use the associated vfio feature ioctl to enable interpretation for devices
> when requested. As part of this process, we must use the host function
> handle rather than a QEMU-generated one -- this is provided as part of the
> ioctl payload.
>
> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
> hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++-
> hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++-
> hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++
> include/hw/s390x/s390-pci-bus.h | 1 +
> include/hw/s390x/s390-pci-vfio.h | 15 +++++++
> 5 files changed, 199 insertions(+), 2 deletions(-)
>
> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
> index 01b58ebc70..a39ccfee05 100644
> --- a/hw/s390x/s390-pci-bus.c
> +++ b/hw/s390x/s390-pci-bus.c
...
> @@ -1360,6 +1427,7 @@ static Property s390_pci_device_properties[] = {
> DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED),
> DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid),
> DEFINE_PROP_STRING("target", S390PCIBusDevice, target),
> + DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true),
> DEFINE_PROP_END_OF_LIST(),
> };
Since this is something that the user can see, would it maybe make sense to
provide a full word instead of an abbreviation here? I.e. "interpret" or
"interpretation" instead of "interp" ?
Thomas
On 1/17/22 10:38 AM, Thomas Huth wrote:
> On 14/01/2022 21.38, Matthew Rosato wrote:
>> Use the associated vfio feature ioctl to enable interpretation for
>> devices
>> when requested. As part of this process, we must use the host function
>> handle rather than a QEMU-generated one -- this is provided as part of
>> the
>> ioctl payload.
>>
>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
>> ---
>> hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++-
>> hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++-
>> hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++
>> include/hw/s390x/s390-pci-bus.h | 1 +
>> include/hw/s390x/s390-pci-vfio.h | 15 +++++++
>> 5 files changed, 199 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
>> index 01b58ebc70..a39ccfee05 100644
>> --- a/hw/s390x/s390-pci-bus.c
>> +++ b/hw/s390x/s390-pci-bus.c
> ...
>> @@ -1360,6 +1427,7 @@ static Property s390_pci_device_properties[] = {
>> DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED),
>> DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid),
>> DEFINE_PROP_STRING("target", S390PCIBusDevice, target),
>> + DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true),
>> DEFINE_PROP_END_OF_LIST(),
>> };
>
> Since this is something that the user can see, would it maybe make sense
> to provide a full word instead of an abbreviation here? I.e. "interpret"
> or "interpretation" instead of "interp" ?
I'll go with "interpret" unless someone else has a strong opinion on it.
On 1/14/22 21:38, Matthew Rosato wrote:
> Use the associated vfio feature ioctl to enable interpretation for devices
> when requested. As part of this process, we must use the host function
> handle rather than a QEMU-generated one -- this is provided as part of the
> ioctl payload.
>
> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
> hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++-
> hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++-
> hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++
> include/hw/s390x/s390-pci-bus.h | 1 +
> include/hw/s390x/s390-pci-vfio.h | 15 +++++++
> 5 files changed, 199 insertions(+), 2 deletions(-)
>
> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
> index 01b58ebc70..a39ccfee05 100644
> --- a/hw/s390x/s390-pci-bus.c
> +++ b/hw/s390x/s390-pci-bus.c
> @@ -971,12 +971,58 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr)
> }
> }
>
...snip...
> diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h
> index ff708aef50..42533e38f7 100644
> --- a/include/hw/s390x/s390-pci-vfio.h
> +++ b/include/hw/s390x/s390-pci-vfio.h
> @@ -20,6 +20,9 @@ bool s390_pci_update_dma_avail(int fd, unsigned int *avail);
> S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
> S390PCIBusDevice *pbdev);
> void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt);
> +int s390_pci_probe_interp(S390PCIBusDevice *pbdev);
> +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable);
> +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev);
> void s390_pci_get_clp_info(S390PCIBusDevice *pbdev);
> #else
> static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail)
> @@ -33,6 +36,18 @@ static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
> }
> static inline void s390_pci_end_dma_count(S390pciState *s,
> S390PCIDMACount *cnt) { }
> +int s390_pci_probe_interp(S390PCIBusDevice *pbdev)
static inline ?
> +{
> + return -EINVAL;
> +}
> +static inline int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable)
> +{
> + return -EINVAL;
> +}
> +static inline int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev)
> +{
> + return -EINVAL;
> +}
> static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { }
> #endif
>
>
--
Pierre Morel
IBM Lab Boeblingen
On 1/14/22 21:38, Matthew Rosato wrote:
> Use the associated vfio feature ioctl to enable interpretation for devices
> when requested. As part of this process, we must use the host function
> handle rather than a QEMU-generated one -- this is provided as part of the
> ioctl payload.
I wonder if we should not explain here that having interpretation as a
default and silently fall back to interception allows backward
compatibility while allowing performence be chosing by default.
(You can say it better as I do :) )
>
> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
> hw/s390x/s390-pci-bus.c | 70 +++++++++++++++++++++++++++++++-
> hw/s390x/s390-pci-inst.c | 63 +++++++++++++++++++++++++++-
> hw/s390x/s390-pci-vfio.c | 52 ++++++++++++++++++++++++
> include/hw/s390x/s390-pci-bus.h | 1 +
> include/hw/s390x/s390-pci-vfio.h | 15 +++++++
> 5 files changed, 199 insertions(+), 2 deletions(-)
>
> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
> index 01b58ebc70..a39ccfee05 100644
> --- a/hw/s390x/s390-pci-bus.c
> +++ b/hw/s390x/s390-pci-bus.c
> @@ -971,12 +971,58 @@ static void s390_pci_update_subordinate(PCIDevice *dev, uint32_t nr)
> }
> }
>
> +static int s390_pci_interp_plug(S390pciState *s, S390PCIBusDevice *pbdev)
> +{
> + uint32_t idx;
> + int rc;
> +
> + rc = s390_pci_probe_interp(pbdev);
> + if (rc) {
> + return rc;
> + }
> +
> + rc = s390_pci_update_passthrough_fh(pbdev);
> + if (rc) {
> + return rc;
> + }
> +
> + /*
> + * The host device is already in an enabled state, but we always present
> + * the initial device state to the guest as disabled (ZPCI_FS_DISABLED).
> + * Therefore, mask off the enable bit from the passthrough handle until
> + * the guest issues a CLP SET PCI FN later to enable the device.
> + */
> + pbdev->fh &= ~FH_MASK_ENABLE;
> +
> + /* Next, see if the idx is already in-use */
> + idx = pbdev->fh & FH_MASK_INDEX;
> + if (pbdev->idx != idx) {
> + if (s390_pci_find_dev_by_idx(s, idx)) {
> + return -EINVAL;
> + }
> + /*
> + * Update the idx entry with the passed through idx
> + * If the relinquished idx is lower than next_idx, use it
> + * to replace next_idx
> + */
> + g_hash_table_remove(s->zpci_table, &pbdev->idx);
> + if (idx < s->next_idx) {
> + s->next_idx = idx;
> + }
> + pbdev->idx = idx;
> + g_hash_table_insert(s->zpci_table, &pbdev->idx, pbdev);
> + }
> +
> + return 0;
> +}
> +
> static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> Error **errp)
> {
> S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev);
> PCIDevice *pdev = NULL;
> S390PCIBusDevice *pbdev = NULL;
> + int rc;
>
> if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
> PCIBridge *pb = PCI_BRIDGE(dev);
> @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> set_pbdev_info(pbdev);
>
> if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
> - pbdev->fh |= FH_SHM_VFIO;
> + /*
> + * By default, interpretation is always requested; if the available
> + * facilities indicate it is not available, fallback to the
> + * intercept model.
s/intercept/interception/ ?
> + */
> + if (pbdev->interp && !s390_has_feat(S390_FEAT_ZPCI_INTERP)) {
> + DPRINTF("zPCI interpretation facilities missing.\n");
> + pbdev->interp = false;
> + }
> + if (pbdev->interp) {
> + rc = s390_pci_interp_plug(s, pbdev);
> + if (rc) {
> + error_setg(errp, "zpci interp plug failed: %d", rc);
> + return;
> + }
> + }
Can't we rearrange that as
if (pbdev->interp) {
if (s390_has_feat) {
} else {
}
}
> pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev);
> /* Fill in CLP information passed via the vfio region */
> s390_pci_get_clp_info(pbdev);
> + if (!pbdev->interp) {
> + /* Do vfio passthrough but intercept for I/O */
> + pbdev->fh |= FH_SHM_VFIO;
> + }
> } else {
> pbdev->fh |= FH_SHM_EMUL;
> + /* Always intercept emulated devices */
> + pbdev->interp = false;
> }
>
> if (s390_pci_msix_init(pbdev)) {
> @@ -1360,6 +1427,7 @@ static Property s390_pci_device_properties[] = {
> DEFINE_PROP_UINT16("uid", S390PCIBusDevice, uid, UID_UNDEFINED),
> DEFINE_PROP_S390_PCI_FID("fid", S390PCIBusDevice, fid),
> DEFINE_PROP_STRING("target", S390PCIBusDevice, target),
> + DEFINE_PROP_BOOL("interp", S390PCIBusDevice, interp, true),
> DEFINE_PROP_END_OF_LIST(),
> };
>
> diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
> index 6d400d4147..e9a0dc12e4 100644
> --- a/hw/s390x/s390-pci-inst.c
> +++ b/hw/s390x/s390-pci-inst.c
> @@ -18,6 +18,7 @@
> #include "sysemu/hw_accel.h"
> #include "hw/s390x/s390-pci-inst.h"
> #include "hw/s390x/s390-pci-bus.h"
> +#include "hw/s390x/s390-pci-vfio.h"
> #include "hw/s390x/tod.h"
>
> #ifndef DEBUG_S390PCI_INST
> @@ -156,6 +157,47 @@ out:
> return rc;
> }
>
> +static int clp_enable_interp(S390PCIBusDevice *pbdev)
> +{
> + int rc;
> +
> + rc = s390_pci_set_interp(pbdev, true);
> + if (rc) {
> + DPRINTF("Failed to enable interpretation\n");
> + return rc;
> + }
> + rc = s390_pci_update_passthrough_fh(pbdev);
> + if (rc) {
> + DPRINTF("Failed to update passthrough fh\n");
> + return rc;
> + }
> + if (!(pbdev->fh & FH_MASK_ENABLE)) {
> + DPRINTF("Passthrough handle is not enabled\n");
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static int clp_disable_interp(S390PCIBusDevice *pbdev)
> +{
> + int rc;
> +
> + rc = s390_pci_set_interp(pbdev, false);
> + if (rc) {
> + DPRINTF("Failed to disable interpretation\n");
> + return rc;
> + }
> +
> + rc = s390_pci_update_passthrough_fh(pbdev);
> + if (rc) {
> + DPRINTF("Failed to update passthrough fh\n");
> + return rc;
> + }
> +
> + return 0;
> +}
> +
> int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra)
> {
> ClpReqHdr *reqh;
> @@ -246,7 +288,19 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra)
> goto out;
> }
>
> - pbdev->fh |= FH_MASK_ENABLE;
> + /*
> + * If interpretation is specified, attempt to enable this now and
> + * update with the host fh
> + */
> + if (pbdev->interp) {
> + if (clp_enable_interp(pbdev)) {
> + stw_p(&ressetpci->hdr.rsp, CLP_RC_SETPCIFN_ERR);
> + goto out;
> + }
> + } else {
> + pbdev->fh |= FH_MASK_ENABLE;
> + }
> +
> pbdev->state = ZPCI_FS_ENABLED;
> stl_p(&ressetpci->fh, pbdev->fh);
> stw_p(&ressetpci->hdr.rsp, CLP_RC_OK);
> @@ -257,6 +311,13 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra)
> goto out;
> }
> device_legacy_reset(DEVICE(pbdev));
> + if (pbdev->interp) {
> + if (clp_disable_interp(pbdev)) {
> + stw_p(&ressetpci->hdr.rsp, CLP_RC_SETPCIFN_ERR);
> + goto out;
> + }
> + }
> + /* Mask off the enabled bit for interpreted devices too */
> pbdev->fh &= ~FH_MASK_ENABLE;
> pbdev->state = ZPCI_FS_DISABLED;
> stl_p(&ressetpci->fh, pbdev->fh);
> diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
> index 6f80a47e29..2cab3a9e89 100644
> --- a/hw/s390x/s390-pci-vfio.c
> +++ b/hw/s390x/s390-pci-vfio.c
> @@ -97,6 +97,58 @@ void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt)
> }
> }
>
> +int s390_pci_probe_interp(S390PCIBusDevice *pbdev)
> +{
> + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev);
> + struct vfio_device_feature feat = {
> + .argsz = sizeof(struct vfio_device_feature),
> + .flags = VFIO_DEVICE_FEATURE_PROBE | VFIO_DEVICE_FEATURE_ZPCI_INTERP
> + };
> +
> + return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, &feat);
> +}
> +
> +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable)
> +{
> + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev);
> + struct vfio_device_zpci_interp *data;
> + int size = sizeof(struct vfio_device_feature) + sizeof(*data);
> + g_autofree struct vfio_device_feature *feat = g_malloc0(size);
> +
> + feat->argsz = size;
> + feat->flags = VFIO_DEVICE_FEATURE_SET + VFIO_DEVICE_FEATURE_ZPCI_INTERP;
> +
> + data = (struct vfio_device_zpci_interp *)&feat->data;
> + if (enable) {
> + data->flags = VFIO_DEVICE_ZPCI_FLAG_INTERP;
> + } else {
> + data->flags = 0;
> + }
> +
> + return ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat);
> +}
> +
> +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev)
> +{
> + VFIOPCIDevice *vdev = VFIO_PCI(pbdev->pdev);
> + struct vfio_device_zpci_interp *data;
> + int size = sizeof(struct vfio_device_feature) + sizeof(*data);
> + g_autofree struct vfio_device_feature *feat = g_malloc0(size);
> + int rc;
> +
> + feat->argsz = size;
> + feat->flags = VFIO_DEVICE_FEATURE_GET + VFIO_DEVICE_FEATURE_ZPCI_INTERP;
> +
> + rc = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_FEATURE, feat);
> + if (rc) {
> + return rc;
> + }
> +
> + data = (struct vfio_device_zpci_interp *)&feat->data;
> + pbdev->fh = data->fh;
> + return 0;
> +}
> +
> static void s390_pci_read_base(S390PCIBusDevice *pbdev,
> struct vfio_device_info *info)
> {
> diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h
> index da3cde2bb4..a9843dfe97 100644
> --- a/include/hw/s390x/s390-pci-bus.h
> +++ b/include/hw/s390x/s390-pci-bus.h
> @@ -350,6 +350,7 @@ struct S390PCIBusDevice {
> IndAddr *indicator;
> bool pci_unplug_request_processed;
> bool unplug_requested;
> + bool interp;
> QTAILQ_ENTRY(S390PCIBusDevice) link;
> };
>
> diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h
> index ff708aef50..42533e38f7 100644
> --- a/include/hw/s390x/s390-pci-vfio.h
> +++ b/include/hw/s390x/s390-pci-vfio.h
> @@ -20,6 +20,9 @@ bool s390_pci_update_dma_avail(int fd, unsigned int *avail);
> S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
> S390PCIBusDevice *pbdev);
> void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt);
> +int s390_pci_probe_interp(S390PCIBusDevice *pbdev);
> +int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable);
> +int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev);
> void s390_pci_get_clp_info(S390PCIBusDevice *pbdev);
> #else
> static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail)
> @@ -33,6 +36,18 @@ static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s,
> }
> static inline void s390_pci_end_dma_count(S390pciState *s,
> S390PCIDMACount *cnt) { }
> +int s390_pci_probe_interp(S390PCIBusDevice *pbdev)
> +{
> + return -EINVAL;
> +}
> +static inline int s390_pci_set_interp(S390PCIBusDevice *pbdev, bool enable)
> +{
> + return -EINVAL;
> +}
> +static inline int s390_pci_update_passthrough_fh(S390PCIBusDevice *pbdev)
> +{
> + return -EINVAL;
> +}
> static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { }
> #endif
>
>
LGTM
With the corrections proposed by Thomas.
Mine... you see what you prefer.
Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
--
Pierre Morel
IBM Lab Boeblingen
On 1/31/22 9:46 AM, Pierre Morel wrote:
>
>
> On 1/14/22 21:38, Matthew Rosato wrote:
>> Use the associated vfio feature ioctl to enable interpretation for
>> devices
>> when requested. As part of this process, we must use the host function
>> handle rather than a QEMU-generated one -- this is provided as part of
>> the
>> ioctl payload.
>
> I wonder if we should not explain here that having interpretation as a
> default and silently fall back to interception allows backward
> compatibility while allowing performence be chosing by default.
> (You can say it better as I do :) )
Good suggestion, I'll think of something to add to the commit message.
>> @@ -1022,12 +1068,33 @@ static void s390_pcihost_plug(HotplugHandler
>> *hotplug_dev, DeviceState *dev,
>> set_pbdev_info(pbdev);
>> if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
>> - pbdev->fh |= FH_SHM_VFIO;
>> + /*
>> + * By default, interpretation is always requested; if the
>> available
>> + * facilities indicate it is not available, fallback to the
>> + * intercept model.
>
> s/intercept/interception/ ?
>
OK
>> + */
>> + if (pbdev->interp &&
>> !s390_has_feat(S390_FEAT_ZPCI_INTERP)) {
>> + DPRINTF("zPCI interpretation facilities
>> missing.\n");
>> + pbdev->interp = false;
>> + }
>> + if (pbdev->interp) {
>> + rc = s390_pci_interp_plug(s, pbdev);
>> + if (rc) {
>> + error_setg(errp, "zpci interp plug failed: %d", rc);
>> + return;
>> + }
>> + }
>
>
> Can't we rearrange that as
> if (pbdev->interp) {
> if (s390_has_feat) {
> } else {
> }
> }
Yep, sure
...
>
> LGTM
> With the corrections proposed by Thomas.
> Mine... you see what you prefer.
>
> Reviewed-by: Pierre Morel <pmorel@linux.ibm.com>
Thanks!
© 2016 - 2026 Red Hat, Inc.