Add a vfio_pci variant driver for the s390-specific Internal Shared
Memory (ISM) devices used for inter-VM communication.
This enables the development of vfio-pci-based user space drivers for
ISM devices.
On s390, kernel primitives such as ioread() and iowrite() are switched
over from function-handle-based PCI load/stores instructions to PCI
memory-I/O (MIO) loads/stores when these are available and not
explicitly disabled. Since these instructions cannot be used with ISM
devices, ensure that classic function-handle-based PCI instructions are
used instead.
The driver is still required even when MIO instructions are disabled, as
the ISM device relies on the PCI store block (PCISTB) instruction to
perform write operations.
Stores are not fragmented, therefore one ioctl corresponds to exactly
one PCISTB instruction. User space must ensure to not write more than
4096 bytes at once to an ISM BAR which is the maximum payload of the
PCISTB instruction.
Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
---
drivers/vfio/pci/Kconfig | 2 +
drivers/vfio/pci/Makefile | 2 +
drivers/vfio/pci/ism/Kconfig | 10 ++
drivers/vfio/pci/ism/Makefile | 3 +
drivers/vfio/pci/ism/main.c | 408 ++++++++++++++++++++++++++++++++++++++++++
5 files changed, 425 insertions(+)
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 1e82b44bda1a0a544e1add7f4b06edecf35aaf81..296bf01e185ecacc388ebc69e92706c99e47c814 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -60,6 +60,8 @@ config VFIO_PCI_DMABUF
source "drivers/vfio/pci/mlx5/Kconfig"
+source "drivers/vfio/pci/ism/Kconfig"
+
source "drivers/vfio/pci/hisilicon/Kconfig"
source "drivers/vfio/pci/pds/Kconfig"
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index e0a0757dd1d2b0bc69b7e4d79441d5cacf4e1cd8..6138f1bf241df04e7419f196b404abdf9b194050 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -11,6 +11,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
+obj-$(CONFIG_ISM_VFIO_PCI) += ism/
+
obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
obj-$(CONFIG_PDS_VFIO_PCI) += pds/
diff --git a/drivers/vfio/pci/ism/Kconfig b/drivers/vfio/pci/ism/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..02f47d25fed2d34c732b67b3a3655b64a7625467
--- /dev/null
+++ b/drivers/vfio/pci/ism/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+config ISM_VFIO_PCI
+ tristate "VFIO support for ISM devices"
+ depends on S390
+ select VFIO_PCI_CORE
+ help
+ This provides user space support for IBM Internal Shared Memory (ISM)
+ Adapter devices using the VFIO framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/ism/Makefile b/drivers/vfio/pci/ism/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..32cc3c66dd11395da85a2b6f05b3d97036ed8a35
--- /dev/null
+++ b/drivers/vfio/pci/ism/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_ISM_VFIO_PCI) += ism-vfio-pci.o
+ism-vfio-pci-y := main.o
diff --git a/drivers/vfio/pci/ism/main.c b/drivers/vfio/pci/ism/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..00bc81f7225f806eac1b99c4520ab5a68137885e
--- /dev/null
+++ b/drivers/vfio/pci/ism/main.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vfio-ISM driver for s390
+ *
+ * Copyright IBM Corp.
+ */
+
+#include "../vfio_pci_priv.h"
+#include "linux/slab.h"
+
+#define ISM_VFIO_PCI_OFFSET_SHIFT 48
+#define ISM_VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> ISM_VFIO_PCI_OFFSET_SHIFT)
+#define ISM_VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << ISM_VFIO_PCI_OFFSET_SHIFT)
+#define ISM_VFIO_PCI_OFFSET_MASK (((u64)(1) << ISM_VFIO_PCI_OFFSET_SHIFT) - 1)
+
+/*
+ * Use __zpci_load() to bypass automatic use of
+ * PCI MIO instructions which are not supported on ISM devices
+ */
+#define ISM_READ(size) \
+ static int ism_read##size(struct zpci_dev *zdev, int bar, \
+ size_t *filled, char __user *buf, \
+ loff_t off) \
+ { \
+ u64 req, tmp; \
+ u##size val; \
+ int ret; \
+ \
+ req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, sizeof(val)); \
+ ret = __zpci_load(&tmp, req, off); \
+ if (ret) \
+ return ret; \
+ val = (u##size)tmp; \
+ if (copy_to_user(buf, &val, sizeof(val))) \
+ return -EFAULT; \
+ *filled = sizeof(val); \
+ return 0; \
+ }
+
+ISM_READ(64);
+ISM_READ(32);
+ISM_READ(16);
+ISM_READ(8);
+
+struct ism_vfio_pci_core_device {
+ struct vfio_pci_core_device core_device;
+ struct kmem_cache *store_block_cache;
+};
+
+static int ism_vfio_pci_open_device(struct vfio_device *core_vdev)
+{
+ struct ism_vfio_pci_core_device *ivpcd;
+ struct vfio_pci_core_device *vdev;
+ int ret;
+
+ ivpcd = container_of(core_vdev, struct ism_vfio_pci_core_device,
+ core_device.vdev);
+ vdev = &ivpcd->core_device;
+
+ ret = vfio_pci_core_enable(vdev);
+ if (ret)
+ return ret;
+
+ vfio_pci_core_finish_enable(vdev);
+ return 0;
+}
+
+/*
+ * ism_vfio_pci_do_io_r()
+ *
+ * On s390, kernel primitives such as ioread() and iowrite() are switched over
+ * from function-handle-based PCI load/stores instructions to PCI memory-I/O (MIO)
+ * loads/stores when these are available and not explicitly disabled. Since these
+ * instructions cannot be used with ISM devices, ensure that classic
+ * function-handle-based PCI instructions are used instead.
+ */
+static ssize_t ism_vfio_pci_do_io_r(struct vfio_pci_core_device *vdev,
+ char __user *buf, loff_t off, size_t count,
+ int bar)
+{
+ struct zpci_dev *zdev = to_zpci(vdev->pdev);
+ ssize_t done = 0;
+ int ret;
+
+ while (count) {
+ size_t filled;
+
+ if (count >= 8 && IS_ALIGNED(off, 8)) {
+ ret = ism_read64(zdev, bar, &filled, buf, off);
+ if (ret)
+ return ret;
+ } else if (count >= 4 && IS_ALIGNED(off, 4)) {
+ ret = ism_read32(zdev, bar, &filled, buf, off);
+ if (ret)
+ return ret;
+ } else if (count >= 2 && IS_ALIGNED(off, 2)) {
+ ret = ism_read16(zdev, bar, &filled, buf, off);
+ if (ret)
+ return ret;
+ } else {
+ ret = ism_read8(zdev, bar, &filled, buf, off);
+ if (ret)
+ return ret;
+ }
+
+ count -= filled;
+ done += filled;
+ off += filled;
+ buf += filled;
+ }
+
+ return done;
+}
+
+/*
+ * ism_vfio_pci_do_io_w()
+ *
+ * Ensure that the PCI store block (PCISTB) instruction is used as required by the
+ * ISM device. The ISM device also uses a 256 TiB BAR 0 for write operations,
+ * which requires a 48bit region address space (ISM_VFIO_PCI_OFFSET_SHIFT).
+ */
+static ssize_t ism_vfio_pci_do_io_w(struct vfio_pci_core_device *vdev,
+ char __user *buf, loff_t off, size_t count,
+ int bar)
+{
+ struct zpci_dev *zdev = to_zpci(vdev->pdev);
+ struct ism_vfio_pci_core_device *ivpcd;
+ ssize_t ret;
+ void *data;
+ u64 req;
+
+ if (count > zdev->maxstbl)
+ return -EINVAL;
+ if (((off % PAGE_SIZE) + count) > PAGE_SIZE)
+ return -EINVAL;
+
+ ivpcd = container_of(vdev, struct ism_vfio_pci_core_device,
+ core_device);
+ data = kmem_cache_alloc(ivpcd->store_block_cache, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ if (copy_from_user(data, buf, count)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, count);
+ ret = __zpci_store_block(data, req, off);
+ if (ret)
+ goto out_free;
+
+ ret = count;
+
+out_free:
+ kmem_cache_free(ivpcd->store_block_cache, data);
+ return ret;
+}
+
+static ssize_t ism_vfio_pci_bar_rw(struct vfio_pci_core_device *vdev,
+ char __user *buf, size_t count, loff_t *ppos,
+ bool iswrite)
+{
+ int bar = ISM_VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & ISM_VFIO_PCI_OFFSET_MASK;
+ resource_size_t end;
+ ssize_t done = 0;
+
+ if (pci_resource_start(vdev->pdev, bar))
+ end = pci_resource_len(vdev->pdev, bar);
+ else
+ return -EINVAL;
+
+ if (pos >= end)
+ return -EINVAL;
+
+ count = min(count, (size_t)(end - pos));
+
+ if (iswrite)
+ done = ism_vfio_pci_do_io_w(vdev, buf, pos, count, bar);
+ else
+ done = ism_vfio_pci_do_io_r(vdev, buf, pos, count, bar);
+
+ if (done >= 0)
+ *ppos += done;
+
+ return done;
+}
+
+static ssize_t ism_vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
+ char __user *buf, size_t count,
+ loff_t *ppos, bool iswrite)
+{
+ loff_t pos = *ppos;
+ size_t done = 0;
+ int ret = 0;
+
+ pos &= ISM_VFIO_PCI_OFFSET_MASK;
+
+ while (count) {
+ /*
+ * zPCI must not use MIO instructions for config space access,
+ * so we can use common code path here.
+ */
+ ret = vfio_pci_config_rw_single(vdev, buf, count, &pos, iswrite);
+ if (ret < 0)
+ return ret;
+
+ count -= ret;
+ done += ret;
+ buf += ret;
+ pos += ret;
+ }
+
+ *ppos += done;
+
+ return done;
+}
+
+static ssize_t ism_vfio_pci_rw(struct vfio_device *core_vdev, char __user *buf,
+ size_t count, loff_t *ppos, bool iswrite)
+{
+ unsigned int index = ISM_VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ struct vfio_pci_core_device *vdev;
+ int ret;
+
+ vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
+ if (!count)
+ return 0;
+
+ switch (index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ ret = ism_vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
+ break;
+
+ case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+ ret = ism_vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+static ssize_t ism_vfio_pci_read(struct vfio_device *core_vdev,
+ char __user *buf, size_t count, loff_t *ppos)
+{
+ return ism_vfio_pci_rw(core_vdev, buf, count, ppos, false);
+}
+
+static ssize_t ism_vfio_pci_write(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ return ism_vfio_pci_rw(core_vdev, (char __user *)buf, count, ppos,
+ true);
+}
+
+static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
+ struct pci_dev *pdev = vdev->pdev;
+
+ switch (info->index) {
+ case VFIO_PCI_CONFIG_REGION_INDEX:
+ info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = pdev->cfg_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ break;
+ case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+ info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
+ info->size = pci_resource_len(pdev, info->index);
+ if (!info->size) {
+ info->flags = 0;
+ break;
+ }
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ break;
+ default:
+ info->offset = 0;
+ info->size = 0;
+ info->flags = 0;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int ism_vfio_pci_init_dev(struct vfio_device *core_vdev)
+{
+ struct zpci_dev *zdev = to_zpci(to_pci_dev(core_vdev->dev));
+ struct ism_vfio_pci_core_device *ivpcd;
+ char cache_name[20];
+ int ret;
+
+ ivpcd = container_of(core_vdev, struct ism_vfio_pci_core_device,
+ core_device.vdev);
+
+ snprintf(cache_name, sizeof(cache_name), "ism_sb_fid_%08x", zdev->fid);
+
+ ivpcd->store_block_cache =
+ kmem_cache_create(cache_name, zdev->maxstbl,
+ (&(struct kmem_cache_args){
+ .align = PAGE_SIZE,
+ .useroffset = 0,
+ .usersize = zdev->maxstbl,
+ }),
+ (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT));
+ if (!ivpcd->store_block_cache)
+ return -ENOMEM;
+
+ ret = vfio_pci_core_init_dev(core_vdev);
+ if (ret)
+ kmem_cache_destroy(ivpcd->store_block_cache);
+
+ return ret;
+}
+
+static void ism_vfio_pci_release_dev(struct vfio_device *core_vdev)
+{
+ struct ism_vfio_pci_core_device *ivpcd = container_of(
+ core_vdev, struct ism_vfio_pci_core_device, core_device.vdev);
+
+ kmem_cache_destroy(ivpcd->store_block_cache);
+ vfio_pci_core_release_dev(core_vdev);
+}
+
+static const struct vfio_device_ops ism_pci_ops = {
+ .name = "ism-vfio-pci",
+ .init = ism_vfio_pci_init_dev,
+ .release = ism_vfio_pci_release_dev,
+ .open_device = ism_vfio_pci_open_device,
+ .close_device = vfio_pci_core_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = ism_vfio_pci_ioctl_get_region_info,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = ism_vfio_pci_read,
+ .write = ism_vfio_pci_write,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+static int ism_vfio_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct ism_vfio_pci_core_device *ivpcd;
+ int ret;
+
+ ivpcd = vfio_alloc_device(ism_vfio_pci_core_device, core_device.vdev,
+ &pdev->dev, &ism_pci_ops);
+ if (IS_ERR(ivpcd))
+ return PTR_ERR(ivpcd);
+
+ dev_set_drvdata(&pdev->dev, &ivpcd->core_device);
+
+ ret = vfio_pci_core_register_device(&ivpcd->core_device);
+ if (ret)
+ vfio_put_device(&ivpcd->core_device.vdev);
+
+ return ret;
+}
+
+static void ism_vfio_pci_remove(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device;
+ struct ism_vfio_pci_core_device *ivpcd;
+
+ core_device = dev_get_drvdata(&pdev->dev);
+ ivpcd = container_of(core_device, struct ism_vfio_pci_core_device,
+ core_device);
+
+ vfio_pci_core_unregister_device(&ivpcd->core_device);
+ vfio_put_device(&ivpcd->core_device.vdev);
+}
+
+static const struct pci_device_id ism_device_table[] = {
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_IBM,
+ PCI_DEVICE_ID_IBM_ISM) },
+ {}
+};
+MODULE_DEVICE_TABLE(pci, ism_device_table);
+
+static struct pci_driver ism_vfio_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = ism_device_table,
+ .probe = ism_vfio_pci_probe,
+ .remove = ism_vfio_pci_remove,
+ .err_handler = &vfio_pci_core_err_handlers,
+ .driver_managed_dma = true,
+};
+
+module_pci_driver(ism_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("vfio-pci variant driver for the IBM Internal Shared Memory (ISM) device");
+MODULE_AUTHOR("IBM Corporation");
--
2.51.0
On Wed, 25 Mar 2026 14:31:24 +0100 Julian Ruess <julianr@linux.ibm.com> wrote: > diff --git a/drivers/vfio/pci/ism/main.c b/drivers/vfio/pci/ism/main.c > new file mode 100644 > index 0000000000000000000000000000000000000000..00bc81f7225f806eac1b99c4520ab5a68137885e > --- /dev/null > +++ b/drivers/vfio/pci/ism/main.c > @@ -0,0 +1,408 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * vfio-ISM driver for s390 > + * > + * Copyright IBM Corp. > + */ > + > +#include "../vfio_pci_priv.h" > +#include "linux/slab.h" Convention says this should be: +#include <linux/slab.h> #include "../vfio_pci_priv.h" -#include "linux/slab.h" Without objection, I'll make the change and push this to next tomorrow. Thanks, Alex
On Thu Apr 2, 2026 at 12:04 AM CEST, Alex Williamson wrote: > On Wed, 25 Mar 2026 14:31:24 +0100 > Julian Ruess <julianr@linux.ibm.com> wrote: >> diff --git a/drivers/vfio/pci/ism/main.c b/drivers/vfio/pci/ism/main.c >> new file mode 100644 >> index 0000000000000000000000000000000000000000..00bc81f7225f806eac1b99c4520ab5a68137885e >> --- /dev/null >> +++ b/drivers/vfio/pci/ism/main.c >> @@ -0,0 +1,408 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> +/* >> + * vfio-ISM driver for s390 >> + * >> + * Copyright IBM Corp. >> + */ >> + >> +#include "../vfio_pci_priv.h" >> +#include "linux/slab.h" > > Convention says this should be: > > +#include <linux/slab.h> > #include "../vfio_pci_priv.h" > -#include "linux/slab.h" > > Without objection, I'll make the change and push this to next tomorrow. > Thanks, > > Alex Thank you very much!! Julian
On 3/25/2026 6:31 AM, Julian Ruess wrote: > Add a vfio_pci variant driver for the s390-specific Internal Shared > Memory (ISM) devices used for inter-VM communication. > > This enables the development of vfio-pci-based user space drivers for > ISM devices. > > On s390, kernel primitives such as ioread() and iowrite() are switched > over from function-handle-based PCI load/stores instructions to PCI > memory-I/O (MIO) loads/stores when these are available and not > explicitly disabled. Since these instructions cannot be used with ISM > devices, ensure that classic function-handle-based PCI instructions are > used instead. > > The driver is still required even when MIO instructions are disabled, as > the ISM device relies on the PCI store block (PCISTB) instruction to > perform write operations. > > Stores are not fragmented, therefore one ioctl corresponds to exactly > one PCISTB instruction. User space must ensure to not write more than > 4096 bytes at once to an ISM BAR which is the maximum payload of the > PCISTB instruction. > > Reviewed-by: Alexandra Winter<wintera@linux.ibm.com> > Reviewed-by: Niklas Schnelle<schnelle@linux.ibm.com> > Signed-off-by: Julian Ruess<julianr@linux.ibm.com> > --- I think we have decided to not support any additional regions with this patch. Feel free to add Reviewed-by: Farhan Ali<alifm@linux.ibm.com>
On Wed, 2026-03-25 at 14:31 +0100, Julian Ruess wrote:
> Add a vfio_pci variant driver for the s390-specific Internal Shared
> Memory (ISM) devices used for inter-VM communication.
>
> This enables the development of vfio-pci-based user space drivers for
> ISM devices.
>
> On s390, kernel primitives such as ioread() and iowrite() are switched
> over from function-handle-based PCI load/stores instructions to PCI
> memory-I/O (MIO) loads/stores when these are available and not
> explicitly disabled. Since these instructions cannot be used with ISM
> devices, ensure that classic function-handle-based PCI instructions are
> used instead.
>
> The driver is still required even when MIO instructions are disabled, as
> the ISM device relies on the PCI store block (PCISTB) instruction to
> perform write operations.
>
> Stores are not fragmented, therefore one ioctl corresponds to exactly
> one PCISTB instruction. User space must ensure to not write more than
> 4096 bytes at once to an ISM BAR which is the maximum payload of the
> PCISTB instruction.
>
> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
> Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
> Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
> ---
> drivers/vfio/pci/Kconfig | 2 +
> drivers/vfio/pci/Makefile | 2 +
> drivers/vfio/pci/ism/Kconfig | 10 ++
> drivers/vfio/pci/ism/Makefile | 3 +
> drivers/vfio/pci/ism/main.c | 408 ++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 425 insertions(+)
>
--- snip ---
> +
> +static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
> + struct vfio_region_info *info,
> + struct vfio_info_cap *caps)
> +{
> + struct vfio_pci_core_device *vdev =
> + container_of(core_vdev, struct vfio_pci_core_device, vdev);
> + struct pci_dev *pdev = vdev->pdev;
> +
> + switch (info->index) {
> + case VFIO_PCI_CONFIG_REGION_INDEX:
> + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> + info->size = pdev->cfg_size;
> + info->flags = VFIO_REGION_INFO_FLAG_READ |
> + VFIO_REGION_INFO_FLAG_WRITE;
> + break;
> + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> + info->size = pci_resource_len(pdev, info->index);
> + if (!info->size) {
> + info->flags = 0;
> + break;
> + }
> + info->flags = VFIO_REGION_INFO_FLAG_READ |
> + VFIO_REGION_INFO_FLAG_WRITE;
> + break;
> + default:
> + info->offset = 0;
> + info->size = 0;
> + info->flags = 0;
> + return -EINVAL;
Thinking more about this, the above default handling actually breaks
additional regions such as the one added by Farhan for the error
events. I think this needs to instead call the generic
vfio_pci_ioctl_get_region_info() for other regions.
> + }
> + return 0;
> +}
--- snip ---
On 3/26/2026 6:03 AM, Niklas Schnelle wrote:
> On Wed, 2026-03-25 at 14:31 +0100, Julian Ruess wrote:
>> Add a vfio_pci variant driver for the s390-specific Internal Shared
>> Memory (ISM) devices used for inter-VM communication.
>>
>> This enables the development of vfio-pci-based user space drivers for
>> ISM devices.
>>
>> On s390, kernel primitives such as ioread() and iowrite() are switched
>> over from function-handle-based PCI load/stores instructions to PCI
>> memory-I/O (MIO) loads/stores when these are available and not
>> explicitly disabled. Since these instructions cannot be used with ISM
>> devices, ensure that classic function-handle-based PCI instructions are
>> used instead.
>>
>> The driver is still required even when MIO instructions are disabled, as
>> the ISM device relies on the PCI store block (PCISTB) instruction to
>> perform write operations.
>>
>> Stores are not fragmented, therefore one ioctl corresponds to exactly
>> one PCISTB instruction. User space must ensure to not write more than
>> 4096 bytes at once to an ISM BAR which is the maximum payload of the
>> PCISTB instruction.
>>
>> Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
>> Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
>> Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
>> ---
>> drivers/vfio/pci/Kconfig | 2 +
>> drivers/vfio/pci/Makefile | 2 +
>> drivers/vfio/pci/ism/Kconfig | 10 ++
>> drivers/vfio/pci/ism/Makefile | 3 +
>> drivers/vfio/pci/ism/main.c | 408 ++++++++++++++++++++++++++++++++++++++++++
>> 5 files changed, 425 insertions(+)
>>
> --- snip ---
>> +
>> +static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
>> + struct vfio_region_info *info,
>> + struct vfio_info_cap *caps)
>> +{
>> + struct vfio_pci_core_device *vdev =
>> + container_of(core_vdev, struct vfio_pci_core_device, vdev);
>> + struct pci_dev *pdev = vdev->pdev;
>> +
>> + switch (info->index) {
>> + case VFIO_PCI_CONFIG_REGION_INDEX:
>> + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
>> + info->size = pdev->cfg_size;
>> + info->flags = VFIO_REGION_INFO_FLAG_READ |
>> + VFIO_REGION_INFO_FLAG_WRITE;
>> + break;
>> + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
>> + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
>> + info->size = pci_resource_len(pdev, info->index);
>> + if (!info->size) {
>> + info->flags = 0;
>> + break;
>> + }
>> + info->flags = VFIO_REGION_INFO_FLAG_READ |
>> + VFIO_REGION_INFO_FLAG_WRITE;
>> + break;
>> + default:
>> + info->offset = 0;
>> + info->size = 0;
>> + info->flags = 0;
>> + return -EINVAL;
> Thinking more about this, the above default handling actually breaks
> additional regions such as the one added by Farhan for the error
> events. I think this needs to instead call the generic
> vfio_pci_ioctl_get_region_info() for other regions.
Note for error events we are using a VFIO device feature and not a
region. At this point we actually don't have additional regions. I do
agree that if we were to add a zpci specific region we would need to
update here and ism_vfio_pci_rw() to allow region read/write ops. This
ISM driver would also need to support all the possible callbacks in
struct vfio_pci_regops.
Given that we don't have any additional regions yet, I am conflicted if
it makes sense to add all the code to support additional regions without
a good way to test it. Maybe its something we can defer when if we add
additional regions? Thanks
Farhan
>> + }
>> + return 0;
>> +}
> --- snip ---
On Thu, 2026-03-26 at 12:05 -0700, Farhan Ali wrote:
> On 3/26/2026 6:03 AM, Niklas Schnelle wrote:
> > On Wed, 2026-03-25 at 14:31 +0100, Julian Ruess wrote:
> > > Add a vfio_pci variant driver for the s390-specific Internal Shared
> > > Memory (ISM) devices used for inter-VM communication.
> > >
> > > This enables the development of vfio-pci-based user space drivers for
> > > ISM devices.
> > >
> > > On s390, kernel primitives such as ioread() and iowrite() are switched
> > > over from function-handle-based PCI load/stores instructions to PCI
> > > memory-I/O (MIO) loads/stores when these are available and not
> > > explicitly disabled. Since these instructions cannot be used with ISM
> > > devices, ensure that classic function-handle-based PCI instructions are
> > > used instead.
> > >
> > > The driver is still required even when MIO instructions are disabled, as
> > > the ISM device relies on the PCI store block (PCISTB) instruction to
> > > perform write operations.
> > >
> > > Stores are not fragmented, therefore one ioctl corresponds to exactly
> > > one PCISTB instruction. User space must ensure to not write more than
> > > 4096 bytes at once to an ISM BAR which is the maximum payload of the
> > > PCISTB instruction.
> > >
> > > Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
> > > Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
> > > Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
> > > ---
> > > drivers/vfio/pci/Kconfig | 2 +
> > > drivers/vfio/pci/Makefile | 2 +
> > > drivers/vfio/pci/ism/Kconfig | 10 ++
> > > drivers/vfio/pci/ism/Makefile | 3 +
> > > drivers/vfio/pci/ism/main.c | 408 ++++++++++++++++++++++++++++++++++++++++++
> > > 5 files changed, 425 insertions(+)
> > >
> > --- snip ---
> > > +
> > > +static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
> > > + struct vfio_region_info *info,
> > > + struct vfio_info_cap *caps)
> > > +{
> > > + struct vfio_pci_core_device *vdev =
> > > + container_of(core_vdev, struct vfio_pci_core_device, vdev);
> > > + struct pci_dev *pdev = vdev->pdev;
> > > +
> > > + switch (info->index) {
> > > + case VFIO_PCI_CONFIG_REGION_INDEX:
> > > + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> > > + info->size = pdev->cfg_size;
> > > + info->flags = VFIO_REGION_INFO_FLAG_READ |
> > > + VFIO_REGION_INFO_FLAG_WRITE;
> > > + break;
> > > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> > > + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> > > + info->size = pci_resource_len(pdev, info->index);
> > > + if (!info->size) {
> > > + info->flags = 0;
> > > + break;
> > > + }
> > > + info->flags = VFIO_REGION_INFO_FLAG_READ |
> > > + VFIO_REGION_INFO_FLAG_WRITE;
> > > + break;
> > > + default:
> > > + info->offset = 0;
> > > + info->size = 0;
> > > + info->flags = 0;
> > > + return -EINVAL;
> > Thinking more about this, the above default handling actually breaks
> > additional regions such as the one added by Farhan for the error
> > events. I think this needs to instead call the generic
> > vfio_pci_ioctl_get_region_info() for other regions.
>
> Note for error events we are using a VFIO device feature and not a
> region. At this point we actually don't have additional regions. I do
> agree that if we were to add a zpci specific region we would need to
> update here and ism_vfio_pci_rw() to allow region read/write ops. This
> ISM driver would also need to support all the possible callbacks in
> struct vfio_pci_regops.
>
> Given that we don't have any additional regions yet, I am conflicted if
> it makes sense to add all the code to support additional regions without
> a good way to test it. Maybe its something we can defer when if we add
> additional regions? Thanks
>
> Farhan
@Alex, we experimented a bit and it turns out with the custom
ISM_VFIO_PCI_OFFSET_SHIFT it becomes quite tricky to reliably call into
common vfio-pci code for handling other regions as that again relies on
the normal VFIO_PCI_OFFSET_SHIFT.
So we came up with two options:
1) vfio-pci-ism will only support this basic set of regions. It seems
to us that new extensions like Farhan's error events should use device
features anyway, so this to us seems like an acceptable limitation and
would essentially mean we take this patch as is.
2) We revisit the original idea of adjusting VFIO_PCI_OFFSET_SHIFT
globally but do so only for CONFIG_64BIT so as not to break 32 bit
platforms. This would simplify vfio-pci-ism and cause less code
duplication but also affects everyone else.
Either option works for us, so I'm hoping for your input.
Thanks,
Niklas
On Fri, 27 Mar 2026 15:53:30 +0100
Niklas Schnelle <schnelle@linux.ibm.com> wrote:
> On Thu, 2026-03-26 at 12:05 -0700, Farhan Ali wrote:
> > On 3/26/2026 6:03 AM, Niklas Schnelle wrote:
> > > On Wed, 2026-03-25 at 14:31 +0100, Julian Ruess wrote:
> > > > Add a vfio_pci variant driver for the s390-specific Internal Shared
> > > > Memory (ISM) devices used for inter-VM communication.
> > > >
> > > > This enables the development of vfio-pci-based user space drivers for
> > > > ISM devices.
> > > >
> > > > On s390, kernel primitives such as ioread() and iowrite() are switched
> > > > over from function-handle-based PCI load/stores instructions to PCI
> > > > memory-I/O (MIO) loads/stores when these are available and not
> > > > explicitly disabled. Since these instructions cannot be used with ISM
> > > > devices, ensure that classic function-handle-based PCI instructions are
> > > > used instead.
> > > >
> > > > The driver is still required even when MIO instructions are disabled, as
> > > > the ISM device relies on the PCI store block (PCISTB) instruction to
> > > > perform write operations.
> > > >
> > > > Stores are not fragmented, therefore one ioctl corresponds to exactly
> > > > one PCISTB instruction. User space must ensure to not write more than
> > > > 4096 bytes at once to an ISM BAR which is the maximum payload of the
> > > > PCISTB instruction.
> > > >
> > > > Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
> > > > Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
> > > > Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
> > > > ---
> > > > drivers/vfio/pci/Kconfig | 2 +
> > > > drivers/vfio/pci/Makefile | 2 +
> > > > drivers/vfio/pci/ism/Kconfig | 10 ++
> > > > drivers/vfio/pci/ism/Makefile | 3 +
> > > > drivers/vfio/pci/ism/main.c | 408 ++++++++++++++++++++++++++++++++++++++++++
> > > > 5 files changed, 425 insertions(+)
> > > >
> > > --- snip ---
> > > > +
> > > > +static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
> > > > + struct vfio_region_info *info,
> > > > + struct vfio_info_cap *caps)
> > > > +{
> > > > + struct vfio_pci_core_device *vdev =
> > > > + container_of(core_vdev, struct vfio_pci_core_device, vdev);
> > > > + struct pci_dev *pdev = vdev->pdev;
> > > > +
> > > > + switch (info->index) {
> > > > + case VFIO_PCI_CONFIG_REGION_INDEX:
> > > > + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> > > > + info->size = pdev->cfg_size;
> > > > + info->flags = VFIO_REGION_INFO_FLAG_READ |
> > > > + VFIO_REGION_INFO_FLAG_WRITE;
> > > > + break;
> > > > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> > > > + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> > > > + info->size = pci_resource_len(pdev, info->index);
> > > > + if (!info->size) {
> > > > + info->flags = 0;
> > > > + break;
> > > > + }
> > > > + info->flags = VFIO_REGION_INFO_FLAG_READ |
> > > > + VFIO_REGION_INFO_FLAG_WRITE;
> > > > + break;
> > > > + default:
> > > > + info->offset = 0;
> > > > + info->size = 0;
> > > > + info->flags = 0;
> > > > + return -EINVAL;
> > > Thinking more about this, the above default handling actually breaks
> > > additional regions such as the one added by Farhan for the error
> > > events. I think this needs to instead call the generic
> > > vfio_pci_ioctl_get_region_info() for other regions.
> >
> > Note for error events we are using a VFIO device feature and not a
> > region. At this point we actually don't have additional regions. I do
> > agree that if we were to add a zpci specific region we would need to
> > update here and ism_vfio_pci_rw() to allow region read/write ops. This
> > ISM driver would also need to support all the possible callbacks in
> > struct vfio_pci_regops.
> >
> > Given that we don't have any additional regions yet, I am conflicted if
> > it makes sense to add all the code to support additional regions without
> > a good way to test it. Maybe its something we can defer when if we add
> > additional regions? Thanks
> >
> > Farhan
>
> @Alex, we experimented a bit and it turns out with the custom
> ISM_VFIO_PCI_OFFSET_SHIFT it becomes quite tricky to reliably call into
> common vfio-pci code for handling other regions as that again relies on
> the normal VFIO_PCI_OFFSET_SHIFT.
>
> So we came up with two options:
>
> 1) vfio-pci-ism will only support this basic set of regions. It seems
> to us that new extensions like Farhan's error events should use device
> features anyway, so this to us seems like an acceptable limitation and
> would essentially mean we take this patch as is.
>
> 2) We revisit the original idea of adjusting VFIO_PCI_OFFSET_SHIFT
> globally but do so only for CONFIG_64BIT so as not to break 32 bit
> platforms. This would simplify vfio-pci-ism and cause less code
> duplication but also affects everyone else.
>
> Either option works for us, so I'm hoping for your input.
There's risk involved with changing the default shift. The fear is
there's userspace drivers that hard code the shift. DPDK was even such
a user at one point, iirc. Maybe it's ok to break such users, maybe
there are actually no such users left and it's all FUD at this point.
Either way, I have a hard time justifying that risk for a single,
obscure S390 device.
Rather than using CONFIG_64BIT as a basis for using a different
default, should we start out with a more narrow scope of CONFIG_S390?
It's likely inevitable that we'll hit this wall in general, but maybe
S390 can be the "crash test dummy" to prove to userspace that they
really need to use the uAPI rather than hard coded values.
Farhan's series is a bit of a straw-man since it doesn't actually
convey the error codes via regions, but I can certainly agree that
using a unique region shift becomes an ongoing burden relative to
vfio-pci-core helpers. I can accept an S390 specific (for now) change
to the region spacing if that's the most sensible path. Thanks,
Alex
On Mon, 2026-03-30 at 09:36 -0600, Alex Williamson wrote:
> On Fri, 27 Mar 2026 15:53:30 +0100
> Niklas Schnelle <schnelle@linux.ibm.com> wrote:
>
> > On Thu, 2026-03-26 at 12:05 -0700, Farhan Ali wrote:
> > > On 3/26/2026 6:03 AM, Niklas Schnelle wrote:
> > > > On Wed, 2026-03-25 at 14:31 +0100, Julian Ruess wrote:
> > > > > Add a vfio_pci variant driver for the s390-specific Internal Shared
> > > > > Memory (ISM) devices used for inter-VM communication.
> > > > >
> > > > > This enables the development of vfio-pci-based user space drivers for
> > > > > ISM devices.
> > > > >
> > > > > On s390, kernel primitives such as ioread() and iowrite() are switched
> > > > > over from function-handle-based PCI load/stores instructions to PCI
> > > > > memory-I/O (MIO) loads/stores when these are available and not
> > > > > explicitly disabled. Since these instructions cannot be used with ISM
> > > > > devices, ensure that classic function-handle-based PCI instructions are
> > > > > used instead.
> > > > >
> > > > > The driver is still required even when MIO instructions are disabled, as
> > > > > the ISM device relies on the PCI store block (PCISTB) instruction to
> > > > > perform write operations.
> > > > >
> > > > > Stores are not fragmented, therefore one ioctl corresponds to exactly
> > > > > one PCISTB instruction. User space must ensure to not write more than
> > > > > 4096 bytes at once to an ISM BAR which is the maximum payload of the
> > > > > PCISTB instruction.
> > > > >
> > > > > Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
> > > > > Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
> > > > > Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
> > > > > ---
> > > > > drivers/vfio/pci/Kconfig | 2 +
> > > > > drivers/vfio/pci/Makefile | 2 +
> > > > > drivers/vfio/pci/ism/Kconfig | 10 ++
> > > > > drivers/vfio/pci/ism/Makefile | 3 +
> > > > > drivers/vfio/pci/ism/main.c | 408 ++++++++++++++++++++++++++++++++++++++++++
> > > > > 5 files changed, 425 insertions(+)
> > > > >
> > > > --- snip ---
> > > > > +
> > > > > +static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
> > > > > + struct vfio_region_info *info,
> > > > > + struct vfio_info_cap *caps)
> > > > > +{
> > > > > + struct vfio_pci_core_device *vdev =
> > > > > + container_of(core_vdev, struct vfio_pci_core_device, vdev);
> > > > > + struct pci_dev *pdev = vdev->pdev;
> > > > > +
> > > > > + switch (info->index) {
> > > > > + case VFIO_PCI_CONFIG_REGION_INDEX:
> > > > > + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> > > > > + info->size = pdev->cfg_size;
> > > > > + info->flags = VFIO_REGION_INFO_FLAG_READ |
> > > > > + VFIO_REGION_INFO_FLAG_WRITE;
> > > > > + break;
> > > > > + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> > > > > + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> > > > > + info->size = pci_resource_len(pdev, info->index);
> > > > > + if (!info->size) {
> > > > > + info->flags = 0;
> > > > > + break;
> > > > > + }
> > > > > + info->flags = VFIO_REGION_INFO_FLAG_READ |
> > > > > + VFIO_REGION_INFO_FLAG_WRITE;
> > > > > + break;
> > > > > + default:
> > > > > + info->offset = 0;
> > > > > + info->size = 0;
> > > > > + info->flags = 0;
> > > > > + return -EINVAL;
> > > > Thinking more about this, the above default handling actually breaks
> > > > additional regions such as the one added by Farhan for the error
> > > > events. I think this needs to instead call the generic
> > > > vfio_pci_ioctl_get_region_info() for other regions.
> > >
> > > Note for error events we are using a VFIO device feature and not a
> > > region. At this point we actually don't have additional regions. I do
> > > agree that if we were to add a zpci specific region we would need to
> > > update here and ism_vfio_pci_rw() to allow region read/write ops. This
> > > ISM driver would also need to support all the possible callbacks in
> > > struct vfio_pci_regops.
> > >
> > > Given that we don't have any additional regions yet, I am conflicted if
> > > it makes sense to add all the code to support additional regions without
> > > a good way to test it. Maybe its something we can defer when if we add
> > > additional regions? Thanks
> > >
> > > Farhan
> >
> > @Alex, we experimented a bit and it turns out with the custom
> > ISM_VFIO_PCI_OFFSET_SHIFT it becomes quite tricky to reliably call into
> > common vfio-pci code for handling other regions as that again relies on
> > the normal VFIO_PCI_OFFSET_SHIFT.
> >
> > So we came up with two options:
> >
> > 1) vfio-pci-ism will only support this basic set of regions. It seems
> > to us that new extensions like Farhan's error events should use device
> > features anyway, so this to us seems like an acceptable limitation and
> > would essentially mean we take this patch as is.
> >
> > 2) We revisit the original idea of adjusting VFIO_PCI_OFFSET_SHIFT
> > globally but do so only for CONFIG_64BIT so as not to break 32 bit
> > platforms. This would simplify vfio-pci-ism and cause less code
> > duplication but also affects everyone else.
> >
> > Either option works for us, so I'm hoping for your input.
>
> There's risk involved with changing the default shift. The fear is
> there's userspace drivers that hard code the shift. DPDK was even such
> a user at one point, iirc. Maybe it's ok to break such users, maybe
> there are actually no such users left and it's all FUD at this point.
> Either way, I have a hard time justifying that risk for a single,
> obscure S390 device.
>
> Rather than using CONFIG_64BIT as a basis for using a different
> default, should we start out with a more narrow scope of CONFIG_S390?
>
> It's likely inevitable that we'll hit this wall in general, but maybe
> S390 can be the "crash test dummy" to prove to userspace that they
> really need to use the uAPI rather than hard coded values.
I fear we'd not be a very good crash test dummy / canary in the coal
mine. We don't have DPDK support, nor a good selection of devices.
Also, apart from us developers almost our entire user base is on slow
moving enterprise distributions running critical but also slowly
updated enterprise workloads. If we want to do this long term maybe a
better approach would be a specific experimental/sanitizing Kconfig
option? As a user of a fast moving rolling distribution on my private
systems I'm all for trying things out in the right environments and
maybe such a distro would be willing to play canary ;)
>
> Farhan's series is a bit of a straw-man since it doesn't actually
> convey the error codes via regions, but I can certainly agree that
> using a unique region shift becomes an ongoing burden relative to
> vfio-pci-core helpers. I can accept an S390 specific (for now) change
> to the region spacing if that's the most sensible path. Thanks,
>
> Alex
Yes, as Farhan noted this was a false alarm. We did actually have a
patch in internal review that did use an extra region but we already
decided that this should instead take the same approach as Farhan's
work.
So overall I think I think we should go with the first option.
We don't know of any extra regions we'll have to support and we can
document that this is not supported for vfio-pci-ism. Then if we do
need it we can still revisit option 2.
Thanks,
Niklas
On Mon, Mar 30, 2026 at 09:36:46AM -0600, Alex Williamson wrote: > There's risk involved with changing the default shift. The fear is > there's userspace drivers that hard code the shift. DPDK was even such > a user at one point, iirc. Maybe it's ok to break such users, maybe > there are actually no such users left and it's all FUD at this point. > Either way, I have a hard time justifying that risk for a single, > obscure S390 device. If we go ahead with that DMABUF series could obscure cases like this be told to just get a DMABUF FD and then mmap it? Avoid this whole issue? Jason
On Mon, 30 Mar 2026 12:56:51 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Mon, Mar 30, 2026 at 09:36:46AM -0600, Alex Williamson wrote: > > There's risk involved with changing the default shift. The fear is > > there's userspace drivers that hard code the shift. DPDK was even such > > a user at one point, iirc. Maybe it's ok to break such users, maybe > > there are actually no such users left and it's all FUD at this point. > > Either way, I have a hard time justifying that risk for a single, > > obscure S390 device. > > If we go ahead with that DMABUF series could obscure cases like this > be told to just get a DMABUF FD and then mmap it? Avoid this whole > issue? The ISM device here doesn't support mmap, so that much is still a problem. However, if we imagine a future where we've fully converted to dma-buf for both DMA and CPU access to BARs, then maybe... I don't know how we'd describe a region overflow in the "legacy" ioctls to maintain some consistency between what's exposed through the region versus the dma-buf. Thanks, Alex
On Mon, Mar 30, 2026 at 12:09:45PM -0600, Alex Williamson wrote: > On Mon, 30 Mar 2026 12:56:51 -0300 > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > On Mon, Mar 30, 2026 at 09:36:46AM -0600, Alex Williamson wrote: > > > There's risk involved with changing the default shift. The fear is > > > there's userspace drivers that hard code the shift. DPDK was even such > > > a user at one point, iirc. Maybe it's ok to break such users, maybe > > > there are actually no such users left and it's all FUD at this point. > > > Either way, I have a hard time justifying that risk for a single, > > > obscure S390 device. > > > > If we go ahead with that DMABUF series could obscure cases like this > > be told to just get a DMABUF FD and then mmap it? Avoid this whole > > issue? > > The ISM device here doesn't support mmap, so that much is still a > problem. However, if we imagine a future where we've fully converted > to dma-buf for both DMA and CPU access to BARs, then maybe... I don't > know how we'd describe a region overflow in the "legacy" ioctls to > maintain some consistency between what's exposed through the region > versus the dma-buf. Thanks, Oh, Ok.. I go back to what I've said before many times, if we fix this we should fix it by making the region offset fully dynamic and managed by a maple tree. We can detect at runtime when the vfio device is created if it is compatible with the legacy indexs and use them by default to maintain compatibility. But cases like this where the region is just too big (and GPUs will get there soon too!) can run a full dynamic mode. Jason
On Mon, 30 Mar 2026 15:16:45 -0300 Jason Gunthorpe <jgg@ziepe.ca> wrote: > On Mon, Mar 30, 2026 at 12:09:45PM -0600, Alex Williamson wrote: > > On Mon, 30 Mar 2026 12:56:51 -0300 > > Jason Gunthorpe <jgg@ziepe.ca> wrote: > > > > > On Mon, Mar 30, 2026 at 09:36:46AM -0600, Alex Williamson wrote: > > > > There's risk involved with changing the default shift. The fear is > > > > there's userspace drivers that hard code the shift. DPDK was even such > > > > a user at one point, iirc. Maybe it's ok to break such users, maybe > > > > there are actually no such users left and it's all FUD at this point. > > > > Either way, I have a hard time justifying that risk for a single, > > > > obscure S390 device. > > > > > > If we go ahead with that DMABUF series could obscure cases like this > > > be told to just get a DMABUF FD and then mmap it? Avoid this whole > > > issue? > > > > The ISM device here doesn't support mmap, so that much is still a > > problem. However, if we imagine a future where we've fully converted > > to dma-buf for both DMA and CPU access to BARs, then maybe... I don't > > know how we'd describe a region overflow in the "legacy" ioctls to > > maintain some consistency between what's exposed through the region > > versus the dma-buf. Thanks, > > Oh, Ok.. I go back to what I've said before many times, if we fix this > we should fix it by making the region offset fully dynamic and managed > by a maple tree. > > We can detect at runtime when the vfio device is created if it is > compatible with the legacy indexs and use them by default to maintain > compatibility. But cases like this where the region is just too big > (and GPUs will get there soon too!) can run a full dynamic mode. Yes, that would be the ideal balance of enabling larger BARs while retaining compatibility for existing devices. Sounds like we'll continue down the variant driver path for this one-off device for now. It remains fairly self contained so long as we don't add more regions for vfio-pci-core to manage. This could be a good project if someone is looking though, especially if it had a build-time config option to set the minimum region size so we can exercise it before we hit 1TB BARs. Thanks, Alex
On Mon, Mar 30, 2026 at 12:39:25PM -0600, Alex Williamson wrote: > Yes, that would be the ideal balance of enabling larger BARs while > retaining compatibility for existing devices. Sounds like we'll > continue down the variant driver path for this one-off device for > now. It remains fairly self contained so long as we don't add more > regions for vfio-pci-core to manage. > > This could be a good project if someone is looking though, especially > if it had a build-time config option to set the minimum region size so > we can exercise it before we hit 1TB BARs. Thanks, Well, it took claude an hour to vibe code a draft from a four sentence prompt.. I see many things that need changing/cleaning in here, but the patch strategy and general idea looks sound. I guess another day of effort would probably get things into something presentable. We are not so far away - maybe the variant driver path is not necessary? https://github.com/jgunthorpe/linux/commits/vfio_maple_tree/ Jason
On Mon, 2026-03-30 at 21:03 -0300, Jason Gunthorpe wrote: > On Mon, Mar 30, 2026 at 12:39:25PM -0600, Alex Williamson wrote: > > Yes, that would be the ideal balance of enabling larger BARs while > > retaining compatibility for existing devices. Sounds like we'll > > continue down the variant driver path for this one-off device for > > now. It remains fairly self contained so long as we don't add more > > regions for vfio-pci-core to manage. > > > > This could be a good project if someone is looking though, especially > > if it had a build-time config option to set the minimum region size so > > we can exercise it before we hit 1TB BARs. Thanks, > > Well, it took claude an hour to vibe code a draft from a four sentence > prompt.. I see many things that need changing/cleaning in here, but > the patch strategy and general idea looks sound. > > I guess another day of effort would probably get things into something > presentable. We are not so far away - maybe the variant driver path is > not necessary? > > https://github.com/jgunthorpe/linux/commits/vfio_maple_tree/ > > Jason I'm in no way against changing the offset handling in general but for what it's worth, we need a variant driver for ISM regardless, though it would be smaller with large offset support. This is because besides the large BAR we also have to use our classic function handle based PCI instructions to access that BAR instead of the newer memory-I/O (MIO) PCI instructions. And sadly the way the ISM device uses the PCI Store Block instruction there is no straight forward way to enable the newer instructions even if we changed the device implementation. This also means that we would want to keep the no-mmap() restriction. Thanks, Niklas
On Tue, 31 Mar 2026 10:29:31 +0200 Niklas Schnelle <schnelle@linux.ibm.com> wrote: > On Mon, 2026-03-30 at 21:03 -0300, Jason Gunthorpe wrote: > > On Mon, Mar 30, 2026 at 12:39:25PM -0600, Alex Williamson wrote: > > > Yes, that would be the ideal balance of enabling larger BARs while > > > retaining compatibility for existing devices. Sounds like we'll > > > continue down the variant driver path for this one-off device for > > > now. It remains fairly self contained so long as we don't add more > > > regions for vfio-pci-core to manage. > > > > > > This could be a good project if someone is looking though, especially > > > if it had a build-time config option to set the minimum region size so > > > we can exercise it before we hit 1TB BARs. Thanks, > > > > Well, it took claude an hour to vibe code a draft from a four sentence > > prompt.. I see many things that need changing/cleaning in here, but > > the patch strategy and general idea looks sound. > > > > I guess another day of effort would probably get things into something > > presentable. We are not so far away - maybe the variant driver path is > > not necessary? > > > > https://github.com/jgunthorpe/linux/commits/vfio_maple_tree/ > > > > Jason > > I'm in no way against changing the offset handling in general but for > what it's worth, we need a variant driver for ISM regardless, though it > would be smaller with large offset support. This is because besides the > large BAR we also have to use our classic function handle based PCI > instructions to access that BAR instead of the newer memory-I/O (MIO) > PCI instructions. And sadly the way the ISM device uses the PCI Store > Block instruction there is no straight forward way to enable the newer > instructions even if we changed the device implementation. This also > means that we would want to keep the no-mmap() restriction. It feels a bit late in the cycle for an untested maple tree conversion anyway, though I am interested in it for v7.2. So I think we're agreed in the variant driver here to handle the excessively large BAR with no currently foreseeable need for additional regions, and maybe it gets simplified when that conversion occurs later. Thanks, Alex
© 2016 - 2026 Red Hat, Inc.