[PATCH v3 2/3] vfio/ism: Implement vfio_pci driver for ISM devices

Julian Ruess posted 3 patches 1 month ago
There is a newer version of this series
[PATCH v3 2/3] vfio/ism: Implement vfio_pci driver for ISM devices
Posted by Julian Ruess 1 month ago
Add a vfio_pci variant driver for the s390-specific Internal Shared
Memory (ISM) devices used for inter-VM communication.

This enables the development of vfio-pci-based user space drivers for
ISM devices.

On s390, kernel primitives such as ioread() and iowrite() are switched
over from function-handle-based PCI load/store instructions to PCI
memory-I/O (MIO) loads/stores when these are available and not
explicitly disabled. Since these instructions cannot be used with ISM
devices, ensure that classic function handle-based PCI instructions are
used instead.

The driver is still required even when MIO instructions are disabled, as
the ISM device relies on the PCI store block (PCISTB) instruction to
perform write operations.

Stores are not fragmented, therefore one ioctl corresponds to exactly
one PCISTB instruction. User space must ensure to not write more than
4096 bytes at once to an ISM BAR which is the maximum payload of the
PCISTB instruction.

Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
---
 drivers/vfio/pci/Kconfig      |   2 +
 drivers/vfio/pci/Makefile     |   2 +
 drivers/vfio/pci/ism/Kconfig  |  10 ++
 drivers/vfio/pci/ism/Makefile |   3 +
 drivers/vfio/pci/ism/main.c   | 343 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 360 insertions(+)

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 1e82b44bda1a0a544e1add7f4b06edecf35aaf81..296bf01e185ecacc388ebc69e92706c99e47c814 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -60,6 +60,8 @@ config VFIO_PCI_DMABUF
 
 source "drivers/vfio/pci/mlx5/Kconfig"
 
+source "drivers/vfio/pci/ism/Kconfig"
+
 source "drivers/vfio/pci/hisilicon/Kconfig"
 
 source "drivers/vfio/pci/pds/Kconfig"
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index e0a0757dd1d2b0bc69b7e4d79441d5cacf4e1cd8..6138f1bf241df04e7419f196b404abdf9b194050 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -11,6 +11,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
 
 obj-$(CONFIG_MLX5_VFIO_PCI)           += mlx5/
 
+obj-$(CONFIG_ISM_VFIO_PCI)           += ism/
+
 obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
 
 obj-$(CONFIG_PDS_VFIO_PCI) += pds/
diff --git a/drivers/vfio/pci/ism/Kconfig b/drivers/vfio/pci/ism/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..02f47d25fed2d34c732b67b3a3655b64a7625467
--- /dev/null
+++ b/drivers/vfio/pci/ism/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+config ISM_VFIO_PCI
+	tristate "VFIO support for ISM devices"
+	depends on S390
+	select VFIO_PCI_CORE
+	help
+	  This provides user space support for IBM Internal Shared Memory (ISM)
+	  Adapter devices using the VFIO framework.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/ism/Makefile b/drivers/vfio/pci/ism/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..32cc3c66dd11395da85a2b6f05b3d97036ed8a35
--- /dev/null
+++ b/drivers/vfio/pci/ism/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_ISM_VFIO_PCI) += ism-vfio-pci.o
+ism-vfio-pci-y := main.o
diff --git a/drivers/vfio/pci/ism/main.c b/drivers/vfio/pci/ism/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..54a378140aa3300bd1ff9f6d9c626ef56f6be067
--- /dev/null
+++ b/drivers/vfio/pci/ism/main.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * vfio-ISM driver for s390
+ *
+ * Copyright IBM Corp.
+ */
+
+#include "../vfio_pci_priv.h"
+#include <linux/slab.h>
+
+#define ISM_VFIO_PCI_OFFSET_SHIFT   48
+#define ISM_VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> ISM_VFIO_PCI_OFFSET_SHIFT)
+#define ISM_VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << ISM_VFIO_PCI_OFFSET_SHIFT)
+#define ISM_VFIO_PCI_OFFSET_MASK (((u64)(1) << ISM_VFIO_PCI_OFFSET_SHIFT) - 1)
+
+/* Bounce-buffer cache for PCISTB payloads; sized to the device's maxstbl */
+static struct kmem_cache *store_block_cache;
+
+/* Per-device state; currently just wraps the vfio-pci core device */
+struct ism_vfio_pci_core_device {
+	struct vfio_pci_core_device core_device;
+};
+
+/*
+ * ism_pci_open_device() - VFIO open_device callback.
+ *
+ * Enables the PCI function through the vfio-pci core; no ISM-specific
+ * setup is needed beyond the common enable/finish-enable sequence.
+ */
+static int ism_pci_open_device(struct vfio_device *core_vdev)
+{
+	struct ism_vfio_pci_core_device *ivdev;
+	struct vfio_pci_core_device *vdev;
+	int ret;
+
+	ivdev = container_of(core_vdev, struct ism_vfio_pci_core_device,
+			     core_device.vdev);
+	vdev = &ivdev->core_device;
+
+	ret = vfio_pci_core_enable(vdev);
+	if (ret)
+		return ret;
+
+	vfio_pci_core_finish_enable(vdev);
+	return 0;
+}
+
+/*
+ * ism_vfio_pci_do_io_r() - read from an ISM BAR on behalf of user space.
+ *
+ * On s390, kernel primitives such as ioread() and iowrite() are switched over
+ * from function handle based PCI load/stores instructions to PCI memory-I/O (MIO)
+ * loads/stores when these are available and not explicitly disabled. Since these
+ * instructions cannot be used with ISM devices, ensure that classic function
+ * handle-based PCI instructions are used instead.
+ *
+ * Returns the number of bytes read or a negative errno.
+ */
+static ssize_t ism_vfio_pci_do_io_r(struct vfio_pci_core_device *vdev,
+				    char __user *buf, loff_t off, size_t count,
+				    int bar)
+{
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+	ssize_t ret, done = 0;
+	u64 req, length, tmp;
+
+	while (count) {
+		/* Use the widest naturally aligned access size possible */
+		if (count >= 8 && IS_ALIGNED(off, 8))
+			length = 8;
+		else if (count >= 4 && IS_ALIGNED(off, 4))
+			length = 4;
+		else if (count >= 2 && IS_ALIGNED(off, 2))
+			length = 2;
+		else
+			length = 1;
+		req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, length);
+		/*
+		 * Use __zpci_load() to bypass automatic use of PCI MIO instructions
+		 * which are not supported on ISM devices
+		 */
+		ret = __zpci_load(&tmp, req, off);
+		if (ret)
+			return ret;
+		/*
+		 * __zpci_load() returns the value in the low-order bytes of
+		 * tmp. s390 is big-endian, so for 1/2/4-byte reads those
+		 * bytes sit at the *end* of tmp; copy from there rather than
+		 * from &tmp, which would hand user space the zeroed
+		 * high-order bytes.
+		 */
+		if (copy_to_user(buf, (u8 *)&tmp + (sizeof(tmp) - length),
+				 length))
+			return -EFAULT;
+		count -= length;
+		done += length;
+		off += length;
+		buf += length;
+	}
+	return done;
+}
+
+/*
+ * ism_vfio_pci_do_io_w()
+ *
+ * Ensure that the PCI store block (PCISTB) instruction is used as required by the
+ * ISM device. The ISM device also uses a 256 TiB BAR 0 for write operations,
+ * which requires a 48bit region address space (ISM_VFIO_PCI_OFFSET_SHIFT).
+ *
+ * The whole write is issued as exactly one PCISTB, so @count must neither
+ * exceed the device's maximum store-block length (zdev->maxstbl) nor cross
+ * a 4K page boundary. Returns the number of bytes written or a negative
+ * errno.
+ */
+static ssize_t ism_vfio_pci_do_io_w(struct vfio_pci_core_device *vdev,
+				    char __user *buf, loff_t off, size_t count,
+				    int bar)
+{
+	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+	ssize_t ret;
+	void *data;
+	u64 req;
+
+	if (count > zdev->maxstbl)
+		return -EINVAL;
+	/* A single PCISTB must not cross a 4K page boundary */
+	if (((off % PAGE_SIZE) + count) > PAGE_SIZE)
+		return -EINVAL;
+
+	/* Zeroed bounce buffer (maxstbl bytes) holding the PCISTB payload */
+	data = kmem_cache_zalloc(store_block_cache, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	if (copy_from_user(data, buf, count)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, count);
+	/* __zpci_store_block() issues the classic (non-MIO) PCISTB */
+	ret = __zpci_store_block(data, req, off);
+	if (ret)
+		goto out_free;
+
+	ret = count;
+
+out_free:
+	kmem_cache_free(store_block_cache, data);
+	return ret;
+}
+
+/*
+ * ism_vfio_pci_bar_rw() - dispatch a user space BAR access.
+ *
+ * Decodes the BAR index and the offset within the BAR from *ppos, clamps
+ * the access to the BAR's length and forwards it to the read or write
+ * helper. On success *ppos is advanced by the number of bytes transferred.
+ */
+static ssize_t ism_vfio_pci_bar_rw(struct vfio_pci_core_device *vdev,
+				   char __user *buf, size_t count, loff_t *ppos,
+				   bool iswrite)
+{
+	int bar = ISM_VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	loff_t pos = *ppos & ISM_VFIO_PCI_OFFSET_MASK;
+	resource_size_t end;
+	ssize_t done = 0;
+
+	/* A zero resource start is treated as "BAR not implemented" */
+	if (pci_resource_start(vdev->pdev, bar))
+		end = pci_resource_len(vdev->pdev, bar);
+	else
+		return -EINVAL;
+
+	if (pos >= end)
+		return -EINVAL;
+
+	count = min(count, (size_t)(end - pos));
+
+	if (iswrite)
+		done = ism_vfio_pci_do_io_w(vdev, buf, pos, count, bar);
+	else
+		done = ism_vfio_pci_do_io_r(vdev, buf, pos, count, bar);
+
+	if (done >= 0)
+		*ppos += done;
+
+	return done;
+}
+
+/*
+ * ism_vfio_pci_config_rw() - PCI config space access.
+ *
+ * Repeats vfio_pci_config_rw_single() until the whole request has been
+ * consumed, then advances *ppos by the total number of bytes transferred.
+ */
+static ssize_t ism_vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
+				      char __user *buf, size_t count,
+				      loff_t *ppos, bool iswrite)
+{
+	loff_t pos = *ppos;
+	size_t done = 0;
+	int ret = 0;
+
+	pos &= ISM_VFIO_PCI_OFFSET_MASK;
+
+	while (count) {
+		/*
+		 * zPCI must not use MIO instructions for config space access,
+		 * so we can use common code path here.
+		 */
+		ret = vfio_pci_config_rw_single(vdev, buf, count, &pos, iswrite);
+		if (ret < 0)
+			return ret;
+
+		count -= ret;
+		done += ret;
+		buf += ret;
+		pos += ret;
+	}
+
+	*ppos += done;
+
+	return done;
+}
+
+/*
+ * ism_vfio_pci_rw() - common read/write entry point.
+ *
+ * Routes the access to config space or BAR handling based on the region
+ * index encoded in the upper bits of *ppos.
+ */
+static ssize_t ism_vfio_pci_rw(struct vfio_device *core_vdev, char __user *buf,
+			       size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int index = ISM_VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	struct vfio_pci_core_device *vdev;
+	int ret;
+
+	vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
+	if (!count)
+		return 0;
+
+	switch (index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		ret = ism_vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
+		break;
+
+	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+		ret = ism_vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+/* VFIO read callback: forward to the common read/write path */
+static ssize_t ism_vfio_pci_read(struct vfio_device *core_vdev,
+				 char __user *buf, size_t count, loff_t *ppos)
+{
+	return ism_vfio_pci_rw(core_vdev, buf, count, ppos, false);
+}
+
+/* VFIO write callback: drop const and forward to the common path */
+static ssize_t ism_vfio_pci_write(struct vfio_device *core_vdev,
+				  const char __user *buf, size_t count,
+				  loff_t *ppos)
+{
+	return ism_vfio_pci_rw(core_vdev, (char __user *)buf, count, ppos,
+			       true);
+}
+
+/*
+ * ism_vfio_pci_ioctl_get_region_info() - describe regions to user space.
+ *
+ * Region offsets encode the region index in the top 16 bits
+ * (ISM_VFIO_PCI_INDEX_TO_OFFSET), leaving a 48-bit offset space as needed
+ * for the large ISM BAR 0. Unimplemented BARs report size 0 with no access
+ * flags; no region advertises mmap support.
+ */
+static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+					      struct vfio_region_info *info,
+					      struct vfio_info_cap *caps)
+{
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+	struct pci_dev *pdev = vdev->pdev;
+
+	switch (info->index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
+		info->size = pdev->cfg_size;
+		info->flags = VFIO_REGION_INFO_FLAG_READ |
+			      VFIO_REGION_INFO_FLAG_WRITE;
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+		info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
+		info->size = pci_resource_len(pdev, info->index);
+		if (!info->size) {
+			info->flags = 0;
+			break;
+		}
+		info->flags = VFIO_REGION_INFO_FLAG_READ |
+			      VFIO_REGION_INFO_FLAG_WRITE;
+		break;
+	default:
+		/* Other indices (e.g. ROM, VGA) are reported as absent */
+		info->offset = 0;
+		info->size = 0;
+		info->flags = 0;
+	}
+	return 0;
+}
+
+/*
+ * Mostly the stock vfio-pci core callbacks; read/write and region info are
+ * overridden so accesses use the classic (non-MIO) PCI instructions that
+ * ISM devices require.
+ */
+static const struct vfio_device_ops ism_pci_ops = {
+	.name = "ism-vfio-pci",
+	.init = vfio_pci_core_init_dev,
+	.release = vfio_pci_core_release_dev,
+	.open_device = ism_pci_open_device,
+	.close_device = vfio_pci_core_close_device,
+	.ioctl = vfio_pci_core_ioctl,
+	.get_region_info_caps = ism_vfio_pci_ioctl_get_region_info,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read = ism_vfio_pci_read,
+	.write = ism_vfio_pci_write,
+	.request = vfio_pci_core_request,
+	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
+	.bind_iommufd = vfio_iommufd_physical_bind,
+	.unbind_iommufd = vfio_iommufd_physical_unbind,
+	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+/*
+ * ism_vfio_pci_probe() - bind an ISM device to the variant driver.
+ *
+ * Allocates the vfio device, creates the PCISTB bounce-buffer cache sized
+ * to the device's maximum store-block length, and registers with the
+ * vfio-pci core. All resources are released on any failure.
+ */
+static int ism_vfio_pci_probe(struct pci_dev *pdev,
+			      const struct pci_device_id *id)
+{
+	struct ism_vfio_pci_core_device *ivpcd;
+	struct zpci_dev *zdev = to_zpci(pdev);
+	int ret;
+
+	ivpcd = vfio_alloc_device(ism_vfio_pci_core_device, core_device.vdev,
+				  &pdev->dev, &ism_pci_ops);
+	if (IS_ERR(ivpcd))
+		return PTR_ERR(ivpcd);
+
+	/*
+	 * NOTE(review): store_block_cache is a single module-global, so
+	 * probing a second ISM device overwrites and leaks the first
+	 * device's cache - consider per-device state; confirm intended
+	 * single-device lifetime.
+	 */
+	store_block_cache = kmem_cache_create("store_block_cache",
+					      zdev->maxstbl, 0, 0, NULL);
+	if (!store_block_cache) {
+		ret = -ENOMEM;
+		goto err_put_device;
+	}
+
+	dev_set_drvdata(&pdev->dev, &ivpcd->core_device);
+	ret = vfio_pci_core_register_device(&ivpcd->core_device);
+	if (ret)
+		goto err_destroy_cache;
+
+	return 0;
+
+err_destroy_cache:
+	kmem_cache_destroy(store_block_cache);
+err_put_device:
+	vfio_put_device(&ivpcd->core_device.vdev);
+	return ret;
+}
+
+/* Unregister the device, drop its reference and free the payload cache */
+static void ism_vfio_pci_remove(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_device;
+	struct ism_vfio_pci_core_device *ivpcd;
+
+	core_device = dev_get_drvdata(&pdev->dev);
+	ivpcd = container_of(core_device, struct ism_vfio_pci_core_device,
+			     core_device);
+
+	vfio_pci_core_unregister_device(&ivpcd->core_device);
+	vfio_put_device(&ivpcd->core_device.vdev);
+
+	/*
+	 * NOTE(review): store_block_cache is module-global; destroying it
+	 * here while another ISM device is still bound looks unsafe -
+	 * confirm the single-device assumption or move the cache into
+	 * per-device state.
+	 */
+	kmem_cache_destroy(store_block_cache);
+}
+
+/*
+ * PCI_DRIVER_OVERRIDE_DEVICE_VFIO: match IBM ISM devices, but only bind
+ * when user space explicitly selects this driver via driver_override.
+ */
+static const struct pci_device_id ism_device_table[] = {
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_IBM,
+					  PCI_DEVICE_ID_IBM_ISM) },
+	{}
+};
+MODULE_DEVICE_TABLE(pci, ism_device_table);
+
+static struct pci_driver ism_vfio_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = ism_device_table,
+	.probe = ism_vfio_pci_probe,
+	.remove = ism_vfio_pci_remove,
+	.err_handler = &vfio_pci_core_err_handlers,
+	.driver_managed_dma = true,
+};
+
+module_pci_driver(ism_vfio_pci_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("vfio-pci variant driver for the IBM Internal Shared Memory (ISM) device");
+MODULE_AUTHOR("IBM Corporation");

-- 
2.51.0
Re: [PATCH v3 2/3] vfio/ism: Implement vfio_pci driver for ISM devices
Posted by Alex Williamson 1 month ago
On Thu, 05 Mar 2026 13:20:14 +0100
Julian Ruess <julianr@linux.ibm.com> wrote:

> Add a vfio_pci variant driver for the s390-specific Internal Shared
> Memory (ISM) devices used for inter-VM communication.
> 
> This enables the development of vfio-pci-based user space drivers for
> ISM devices.
> 
> On s390, kernel primitives such as ioread() and iowrite() are switched
> over from function handle based PCI load/stores instructions to PCI
> memory-I/O (MIO) loads/stores when these are available and not
> explicitly disabled. Since these instructions cannot be used with ISM
> devices, ensure that classic function handle-based PCI instructions are
> used instead.
> 
> The driver is still required even when MIO instructions are disabled, as
> the ISM device relies on the PCI store block (PCISTB) instruction to
> perform write operations.
> 
> Stores are not fragmented, therefore one ioctl corresponds to exactly
> one PCISTB instruction. User space must ensure to not write more than
> 4096 bytes at once to an ISM BAR which is the maximum payload of the
> PCISTB instruction.
> 
> Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
> Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
> ---
>  drivers/vfio/pci/Kconfig      |   2 +
>  drivers/vfio/pci/Makefile     |   2 +
>  drivers/vfio/pci/ism/Kconfig  |  10 ++
>  drivers/vfio/pci/ism/Makefile |   3 +
>  drivers/vfio/pci/ism/main.c   | 343 ++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 360 insertions(+)
> 
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 1e82b44bda1a0a544e1add7f4b06edecf35aaf81..296bf01e185ecacc388ebc69e92706c99e47c814 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -60,6 +60,8 @@ config VFIO_PCI_DMABUF
>  
>  source "drivers/vfio/pci/mlx5/Kconfig"
>  
> +source "drivers/vfio/pci/ism/Kconfig"
> +
>  source "drivers/vfio/pci/hisilicon/Kconfig"
>  
>  source "drivers/vfio/pci/pds/Kconfig"
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index e0a0757dd1d2b0bc69b7e4d79441d5cacf4e1cd8..6138f1bf241df04e7419f196b404abdf9b194050 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -11,6 +11,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
>  
>  obj-$(CONFIG_MLX5_VFIO_PCI)           += mlx5/
>  
> +obj-$(CONFIG_ISM_VFIO_PCI)           += ism/
> +
>  obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
>  
>  obj-$(CONFIG_PDS_VFIO_PCI) += pds/
> diff --git a/drivers/vfio/pci/ism/Kconfig b/drivers/vfio/pci/ism/Kconfig
> new file mode 100644
> index 0000000000000000000000000000000000000000..02f47d25fed2d34c732b67b3a3655b64a7625467
> --- /dev/null
> +++ b/drivers/vfio/pci/ism/Kconfig
> @@ -0,0 +1,10 @@
> +# SPDX-License-Identifier: GPL-2.0
> +config ISM_VFIO_PCI
> +	tristate "VFIO support for ISM devices"
> +	depends on S390
> +	select VFIO_PCI_CORE
> +	help
> +	  This provides user space support for IBM Internal Shared Memory (ISM)
> +	  Adapter devices using the VFIO framework.
> +
> +	  If you don't know what to do here, say N.
> diff --git a/drivers/vfio/pci/ism/Makefile b/drivers/vfio/pci/ism/Makefile
> new file mode 100644
> index 0000000000000000000000000000000000000000..32cc3c66dd11395da85a2b6f05b3d97036ed8a35
> --- /dev/null
> +++ b/drivers/vfio/pci/ism/Makefile
> @@ -0,0 +1,3 @@
> +# SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_ISM_VFIO_PCI) += ism-vfio-pci.o
> +ism-vfio-pci-y := main.o
> diff --git a/drivers/vfio/pci/ism/main.c b/drivers/vfio/pci/ism/main.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..54a378140aa3300bd1ff9f6d9c626ef56f6be067
> --- /dev/null
> +++ b/drivers/vfio/pci/ism/main.c
> @@ -0,0 +1,343 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * vfio-ISM driver for s390
> + *
> + * Copyright IBM Corp.
> + */
> +
> +#include "../vfio_pci_priv.h"
> +#include "linux/slab.h"
> +
> +#define ISM_VFIO_PCI_OFFSET_SHIFT   48
> +#define ISM_VFIO_PCI_OFFSET_TO_INDEX(off) (off >> ISM_VFIO_PCI_OFFSET_SHIFT)
> +#define ISM_VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << ISM_VFIO_PCI_OFFSET_SHIFT)
> +#define ISM_VFIO_PCI_OFFSET_MASK (((u64)(1) << ISM_VFIO_PCI_OFFSET_SHIFT) - 1)
> +
> +struct kmem_cache *store_block_cache;
> +
> +struct ism_vfio_pci_core_device {
> +	struct vfio_pci_core_device core_device;
> +};
> +
> +static int ism_pci_open_device(struct vfio_device *core_vdev)
> +{
> +	struct ism_vfio_pci_core_device *ivdev;
> +	struct vfio_pci_core_device *vdev;
> +	int ret;
> +
> +	ivdev = container_of(core_vdev, struct ism_vfio_pci_core_device,
> +			     core_device.vdev);
> +	vdev = &ivdev->core_device;
> +
> +	ret = vfio_pci_core_enable(vdev);
> +	if (ret)
> +		return ret;
> +
> +	vfio_pci_core_finish_enable(vdev);
> +	return 0;
> +}
> +
> +/*
> + * ism_vfio_pci_do_io_r()
> + *
> + * On s390, kernel primitives such as ioread() and iowrite() are switched over
> + * from function handle based PCI load/stores instructions to PCI memory-I/O (MIO)
> + * loads/stores when these are available and not explicitly disabled. Since these
> + * instructions cannot be used with ISM devices, ensure that classic function
> + * handle-based PCI instructions are used instead.
> + */
> +static ssize_t ism_vfio_pci_do_io_r(struct vfio_pci_core_device *vdev,
> +				    char __user *buf, loff_t off, size_t count,
> +				    int bar)
> +{
> +	struct zpci_dev *zdev = to_zpci(vdev->pdev);
> +	ssize_t ret, done = 0;
> +	u64 req, length, tmp;
> +
> +	while (count) {
> +		if (count >= 8 && IS_ALIGNED(off, 8))
> +			length = 8;
> +		else if (count >= 4 && IS_ALIGNED(off, 4))
> +			length = 4;
> +		else if (count >= 2 && IS_ALIGNED(off, 2))
> +			length = 2;
> +		else
> +			length = 1;
> +		req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, length);
> +		/*
> +		 * Use __zpci_load() to bypass automatic use of PCI MIO instructions
> +		 * which are not supported on ISM devices
> +		 */
> +		ret = __zpci_load(&tmp, req, off);
> +		if (ret)
> +			return ret;
> +		if (copy_to_user(buf, &tmp, length))

Is there an endian issue here for 1/2/4 byte reads?  zpci_read_single()
uses a u64 target for zpci_load(), but then casts the result into the
destination buffer to place the low-order bytes.  AIUI, copy_to_user()
would start from the high-order bytes of the u64.

> +			return -EFAULT;
> +		count -= length;
> +		done += length;
> +		off += length;
> +		buf += length;
> +	}
> +	return done;
> +}
> +
> +/*
> + * ism_vfio_pci_do_io_w()
> + *
> + * Ensure that the PCI store block (PCISTB) instruction is used as required by the
> + * ISM device. The ISM device also uses a 256 TiB BAR 0 for write operations,
> + * which requires a 48bit region address space (ISM_VFIO_PCI_OFFSET_SHIFT).
> + */
> +static ssize_t ism_vfio_pci_do_io_w(struct vfio_pci_core_device *vdev,
> +				    char __user *buf, loff_t off, size_t count,
> +				    int bar)
> +{
> +	struct zpci_dev *zdev = to_zpci(vdev->pdev);
> +	ssize_t ret;
> +	void *data;
> +	u64 req;
> +
> +	if (count > zdev->maxstbl)
> +		return -EINVAL;
> +	if (((off % PAGE_SIZE) + count) > PAGE_SIZE)
> +		return -EINVAL;
> +
> +	data = kmem_cache_zalloc(store_block_cache, GFP_KERNEL);
> +	if (!data)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(data, buf, count)) {
> +		ret = -EFAULT;
> +		goto out_free;
> +	}
> +
> +	req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, count);
> +	ret = __zpci_store_block(data, req, off);
> +	if (ret)
> +		goto out_free;
> +
> +	ret = count;
> +
> +out_free:
> +	kmem_cache_free(store_block_cache, data);
> +	return ret;
> +}
> +
> +static ssize_t ism_vfio_pci_bar_rw(struct vfio_pci_core_device *vdev,
> +				   char __user *buf, size_t count, loff_t *ppos,
> +				   bool iswrite)
> +{
> +	int bar = ISM_VFIO_PCI_OFFSET_TO_INDEX(*ppos);
> +	loff_t pos = *ppos & ISM_VFIO_PCI_OFFSET_MASK;
> +	resource_size_t end;
> +	ssize_t done = 0;
> +
> +	if (pci_resource_start(vdev->pdev, bar))
> +		end = pci_resource_len(vdev->pdev, bar);
> +	else
> +		return -EINVAL;
> +
> +	if (pos >= end)
> +		return -EINVAL;
> +
> +	count = min(count, (size_t)(end - pos));
> +
> +	if (iswrite)
> +		done = ism_vfio_pci_do_io_w(vdev, buf, pos, count, bar);
> +	else
> +		done = ism_vfio_pci_do_io_r(vdev, buf, pos, count, bar);
> +
> +	if (done >= 0)
> +		*ppos += done;
> +
> +	return done;
> +}
> +
> +static ssize_t ism_vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
> +				      char __user *buf, size_t count,
> +				      loff_t *ppos, bool iswrite)
> +{
> +	loff_t pos = *ppos;
> +	size_t done = 0;
> +	int ret = 0;
> +
> +	pos &= ISM_VFIO_PCI_OFFSET_MASK;
> +
> +	while (count) {
> +		/*
> +		 * zPCI must not use MIO instructions for config space access,
> +		 * so we can use common code path here.
> +		 */
> +		ret = vfio_pci_config_rw_single(vdev, buf, count, &pos, iswrite);
> +		if (ret < 0)
> +			return ret;
> +
> +		count -= ret;
> +		done += ret;
> +		buf += ret;
> +		pos += ret;
> +	}
> +
> +	*ppos += done;
> +
> +	return done;
> +}
> +
> +static ssize_t ism_vfio_pci_rw(struct vfio_device *core_vdev, char __user *buf,
> +			       size_t count, loff_t *ppos, bool iswrite)
> +{
> +	unsigned int index = ISM_VFIO_PCI_OFFSET_TO_INDEX(*ppos);
> +	struct vfio_pci_core_device *vdev;
> +	int ret;
> +
> +	vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev);
> +
> +	if (!count)
> +		return 0;
> +
> +	switch (index) {
> +	case VFIO_PCI_CONFIG_REGION_INDEX:
> +		ret = ism_vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
> +		break;
> +
> +	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> +		ret = ism_vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
> +		break;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +
> +static ssize_t ism_vfio_pci_read(struct vfio_device *core_vdev,
> +				 char __user *buf, size_t count, loff_t *ppos)
> +{
> +	return ism_vfio_pci_rw(core_vdev, buf, count, ppos, false);
> +}
> +
> +static ssize_t ism_vfio_pci_write(struct vfio_device *core_vdev,
> +				  const char __user *buf, size_t count,
> +				  loff_t *ppos)
> +{
> +	return ism_vfio_pci_rw(core_vdev, (char __user *)buf, count, ppos,
> +			       true);
> +}
> +
> +static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
> +					      struct vfio_region_info *info,
> +					      struct vfio_info_cap *caps)
> +{
> +	struct vfio_pci_core_device *vdev =
> +		container_of(core_vdev, struct vfio_pci_core_device, vdev);
> +	struct pci_dev *pdev = vdev->pdev;
> +
> +	switch (info->index) {
> +	case VFIO_PCI_CONFIG_REGION_INDEX:
> +		info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> +		info->size = pdev->cfg_size;
> +		info->flags = VFIO_REGION_INFO_FLAG_READ |
> +			      VFIO_REGION_INFO_FLAG_WRITE;
> +		break;
> +	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> +		info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index);
> +		info->size = pci_resource_len(pdev, info->index);
> +		if (!info->size) {
> +			info->flags = 0;
> +			break;
> +		}
> +		info->flags = VFIO_REGION_INFO_FLAG_READ |
> +			      VFIO_REGION_INFO_FLAG_WRITE;
> +		break;
> +	default:
> +		info->offset = 0;
> +		info->size = 0;
> +		info->flags = 0;
> +	}
> +	return 0;
> +}
> +
> +static const struct vfio_device_ops ism_pci_ops = {
> +	.name = "ism-vfio-pci",
> +	.init = vfio_pci_core_init_dev,
> +	.release = vfio_pci_core_release_dev,
> +	.open_device = ism_pci_open_device,
> +	.close_device = vfio_pci_core_close_device,
> +	.ioctl = vfio_pci_core_ioctl,
> +	.get_region_info_caps = ism_vfio_pci_ioctl_get_region_info,
> +	.device_feature = vfio_pci_core_ioctl_feature,
> +	.read = ism_vfio_pci_read,
> +	.write = ism_vfio_pci_write,
> +	.request = vfio_pci_core_request,
> +	.match = vfio_pci_core_match,
> +	.match_token_uuid = vfio_pci_core_match_token_uuid,
> +	.bind_iommufd = vfio_iommufd_physical_bind,
> +	.unbind_iommufd = vfio_iommufd_physical_unbind,
> +	.attach_ioas = vfio_iommufd_physical_attach_ioas,
> +	.detach_ioas = vfio_iommufd_physical_detach_ioas,
> +};
> +
> +static int ism_vfio_pci_probe(struct pci_dev *pdev,
> +			      const struct pci_device_id *id)
> +{
> +	struct ism_vfio_pci_core_device *ivpcd;
> +	struct zpci_dev *zdev = to_zpci(pdev);
> +	int ret;
> +
> +	ivpcd = vfio_alloc_device(ism_vfio_pci_core_device, core_device.vdev,
> +				  &pdev->dev, &ism_pci_ops);
> +	if (IS_ERR(ivpcd))
> +		return PTR_ERR(ivpcd);
> +
> +	store_block_cache = kmem_cache_create("store_block_cache",
> +					      zdev->maxstbl, 0, 0, NULL);
> +	if (!store_block_cache)
> +		return -ENOMEM;

ivpcd is leaked here, we need a vfio_put_device().  Thanks,

Alex

> +
> +	dev_set_drvdata(&pdev->dev, &ivpcd->core_device);
> +	ret = vfio_pci_core_register_device(&ivpcd->core_device);
> +	if (ret) {
> +		kmem_cache_destroy(store_block_cache);
> +		vfio_put_device(&ivpcd->core_device.vdev);
> +	}
> +
> +	return ret;
> +}
> +
> +static void ism_vfio_pci_remove(struct pci_dev *pdev)
> +{
> +	struct vfio_pci_core_device *core_device;
> +	struct ism_vfio_pci_core_device *ivpcd;
> +
> +	core_device = dev_get_drvdata(&pdev->dev);
> +	ivpcd = container_of(core_device, struct ism_vfio_pci_core_device,
> +			     core_device);
> +
> +	vfio_pci_core_unregister_device(&ivpcd->core_device);
> +	vfio_put_device(&ivpcd->core_device.vdev);
> +
> +	kmem_cache_destroy(store_block_cache);
> +}
> +
> +static const struct pci_device_id ism_device_table[] = {
> +	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_IBM,
> +					  PCI_DEVICE_ID_IBM_ISM) },
> +	{}
> +};
> +MODULE_DEVICE_TABLE(pci, ism_device_table);
> +
> +static struct pci_driver ism_vfio_pci_driver = {
> +	.name = KBUILD_MODNAME,
> +	.id_table = ism_device_table,
> +	.probe = ism_vfio_pci_probe,
> +	.remove = ism_vfio_pci_remove,
> +	.err_handler = &vfio_pci_core_err_handlers,
> +	.driver_managed_dma = true,
> +};
> +
> +module_pci_driver(ism_vfio_pci_driver);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("vfio-pci variant driver for the IBM Internal Shared Memory (ISM) device");
> +MODULE_AUTHOR("IBM Corporation");
>
Re: [PATCH v3 2/3] vfio/ism: Implement vfio_pci driver for ISM devices
Posted by Niklas Schnelle 4 weeks, 1 day ago
On Tue, 2026-03-10 at 13:20 -0600, Alex Williamson wrote:
> On Thu, 05 Mar 2026 13:20:14 +0100
> Julian Ruess <julianr@linux.ibm.com> wrote:
> 
> > Add a vfio_pci variant driver for the s390-specific Internal Shared
> > Memory (ISM) devices used for inter-VM communication.
> > 
> > This enables the development of vfio-pci-based user space drivers for
> > ISM devices.
> > 
> > On s390, kernel primitives such as ioread() and iowrite() are switched
> > over from function handle based PCI load/stores instructions to PCI
> > memory-I/O (MIO) loads/stores when these are available and not
> > explicitly disabled. Since these instructions cannot be used with ISM
> > devices, ensure that classic function handle-based PCI instructions are
> > used instead.
> > 
> > The driver is still required even when MIO instructions are disabled, as
> > the ISM device relies on the PCI store block (PCISTB) instruction to
> > perform write operations.
> > 
> > Stores are not fragmented, therefore one ioctl corresponds to exactly
> > one PCISTB instruction. User space must ensure to not write more than
> > 4096 bytes at once to an ISM BAR which is the maximum payload of the
> > PCISTB instruction.
> > 
> > Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
> > Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
> > ---
> > 
--- snip ---
> > 
> > +static ssize_t ism_vfio_pci_do_io_r(struct vfio_pci_core_device *vdev,
> > +				    char __user *buf, loff_t off, size_t count,
> > +				    int bar)
> > +{
> > +	struct zpci_dev *zdev = to_zpci(vdev->pdev);
> > +	ssize_t ret, done = 0;
> > +	u64 req, length, tmp;
> > +
> > +	while (count) {
> > +		if (count >= 8 && IS_ALIGNED(off, 8))
> > +			length = 8;
> > +		else if (count >= 4 && IS_ALIGNED(off, 4))
> > +			length = 4;
> > +		else if (count >= 2 && IS_ALIGNED(off, 2))
> > +			length = 2;
> > +		else
> > +			length = 1;
> > +		req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, length);
> > +		/*
> > +		 * Use __zpci_load() to bypass automatic use of PCI MIO instructions
> > +		 * which are not supported on ISM devices
> > +		 */
> > +		ret = __zpci_load(&tmp, req, off);
> > +		if (ret)
> > +			return ret;
> > +		if (copy_to_user(buf, &tmp, length))
> 
> Is there an endian issue here for 1/2/4 byte reads?  zpci_read_single()
> uses a u64 target for zpci_load(), but then casts the result into the
> destination buffer to place the low-order bytes.  AIUI, copy_to_user()
> would start from the high-order bytes of the u64.

Great find Alex, thank you! One would think that we s390 people would
be the ones extra careful with endianness. I did a bit of digging into how
this slipped through our review process and it seems we actually had it
right up to our internal v5 using a macro generated ism_read##size()
and using a temporary u##size. Then we messed up pulling the
copy_to_user() out of the individual ism_read() macros which then got
eliminated, still without noticing the endianness issue. We actually also
don't need the sizes < 8 in the current user-space driver so didn't
notice in testing either. And as a side note LLM review also didn't
catch it. So unless vfio-pci requires reads < 8 bytes to work we'd opt
to just support the 8 byte case.

> 
> > +			return -EFAULT;
> > +		count -= length;
> > +		done += length;
> > +		off += length;
> > +		buf += length;
> > +	}
> > +	return done;
> > +}
--- snip ---
> > +
> > +static int ism_vfio_pci_probe(struct pci_dev *pdev,
> > +			      const struct pci_device_id *id)
> > +{
> > +	struct ism_vfio_pci_core_device *ivpcd;
> > +	struct zpci_dev *zdev = to_zpci(pdev);
> > +	int ret;
> > +
> > +	ivpcd = vfio_alloc_device(ism_vfio_pci_core_device, core_device.vdev,
> > +				  &pdev->dev, &ism_pci_ops);
> > +	if (IS_ERR(ivpcd))
> > +		return PTR_ERR(ivpcd);
> > +
> > +	store_block_cache = kmem_cache_create("store_block_cache",
> > +					      zdev->maxstbl, 0, 0, NULL);
> > +	if (!store_block_cache)
> > +		return -ENOMEM;
> 
> ivpcd is leaked here, we need a vfio_put_device().  Thanks,
> 
> Alex

Good find again. For completeness we also noticed another issue here in
internal review. The store_block_cache gets replaced and the old one
leaked when multiple devices are used with this driver. So we'll likely
go to a per-device kvmem_cache in v4.

> 
> > +
> > +	dev_set_drvdata(&pdev->dev, &ivpcd->core_device);
> > +	ret = vfio_pci_core_register_device(&ivpcd->core_device);
> > +	if (ret) {
> > +		kmem_cache_destroy(store_block_cache);
> > +		vfio_put_device(&ivpcd->core_device.vdev);
> > +	}
> > +
> > +	return ret;
> > +}
> > 
Re: [PATCH v3 2/3] vfio/ism: Implement vfio_pci driver for ISM devices
Posted by Farhan Ali 1 month ago
On 3/5/2026 4:20 AM, Julian Ruess wrote:
> Add a vfio_pci variant driver for the s390-specific Internal Shared
> Memory (ISM) devices used for inter-VM communication.
>
> This enables the development of vfio-pci-based user space drivers for
> ISM devices.
>
> On s390, kernel primitives such as ioread() and iowrite() are switched
> over from function handle based PCI load/stores instructions to PCI
> memory-I/O (MIO) loads/stores when these are available and not
> explicitly disabled. Since these instructions cannot be used with ISM
> devices, ensure that classic function handle-based PCI instructions are
> used instead.
>
> The driver is still required even when MIO instructions are disabled, as
> the ISM device relies on the PCI store block (PCISTB) instruction to
> perform write operations.
>
> Stores are not fragmented, therefore one ioctl corresponds to exactly
> one PCISTB instruction. User space must ensure to not write more than
> 4096 bytes at once to an ISM BAR which is the maximum payload of the
> PCISTB instruction.
>
> Reviewed-by: Niklas Schnelle<schnelle@linux.ibm.com>
> Signed-off-by: Julian Ruess<julianr@linux.ibm.com>
> ---
>   drivers/vfio/pci/Kconfig      |   2 +
>   drivers/vfio/pci/Makefile     |   2 +
>   drivers/vfio/pci/ism/Kconfig  |  10 ++
>   drivers/vfio/pci/ism/Makefile |   3 +
>   drivers/vfio/pci/ism/main.c   | 343 ++++++++++++++++++++++++++++++++++++++++++
>   5 files changed, 360 insertions(+)

Reviewed-by: Farhan Ali<alifm@linux.ibm.com>
Re: [PATCH v3 2/3] vfio/ism: Implement vfio_pci driver for ISM devices
Posted by Niklas Schnelle 1 month ago
On Thu, 2026-03-05 at 13:20 +0100, Julian Ruess wrote:
> Add a vfio_pci variant driver for the s390-specific Internal Shared
> Memory (ISM) devices used for inter-VM communication.
> 
> This enables the development of vfio-pci-based user space drivers for
> ISM devices.
> 
> On s390, kernel primitives such as ioread() and iowrite() are switched
> over from function handle based PCI load/stores instructions to PCI
> memory-I/O (MIO) loads/stores when these are available and not
> explicitly disabled. Since these instructions cannot be used with ISM
> devices, ensure that classic function handle-based PCI instructions are
> used instead.
> 
> The driver is still required even when MIO instructions are disabled, as
> the ISM device relies on the PCI store block (PCISTB) instruction to
> perform write operations.
> 
> Stores are not fragmented, therefore one ioctl corresponds to exactly
> one PCISTB instruction. User space must ensure to not write more than
> 4096 bytes at once to an ISM BAR which is the maximum payload of the
> PCISTB instruction.
> 
> Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
> Signed-off-by: Julian Ruess <julianr@linux.ibm.com>
> ---

Just to make it explicit, my R-b stands based on having talked through
the changes since v2 with Julian off list.

Thanks,
Niklas