[RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl

Wathsala Vithanage posted 1 patch 9 months, 3 weeks ago
drivers/vfio/pci/vfio_pci_core.c | 163 +++++++++++++++++++++++++++++++
include/uapi/linux/vfio.h        |  68 +++++++++++++
2 files changed, 231 insertions(+)
[RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
Posted by Wathsala Vithanage 9 months, 3 weeks ago
Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
direct cache injection. As described in the relevant patch set [1],
direct cache injection in supported hardware allows optimal platform
resource utilization for specific requests on the PCIe bus. This feature
is currently available only for kernel device drivers. However,
user space applications, especially those whose performance is sensitive
to the latency of inbound writes as seen by a CPU core, may benefit from
using this information (E.g., DPDK cache stashing RFC [2] or an HPC
application running in a VM).

This patch enables configuring of TPH from the user space via
VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
drivers and VMMs to enable/disable the TPH feature on PCIe devices and
set steering tags in MSI-X or steering-tag table entries using
VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.

[1] lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
[2] inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com

Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 163 +++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        |  68 +++++++++++++
 2 files changed, 231 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 586e49efb81b..d6dd0495b08b 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -29,6 +29,7 @@
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
 #include <linux/iommufd.h>
+#include <linux/pci-tph.h>
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
@@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
 	return 0;
 }
 
+static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
+				      void __user *arg, size_t argsz,
+				      struct vfio_pci_tph_info **info)
+{
+	size_t minsz;
+
+	if (tph->count > VFIO_TPH_INFO_MAX)
+		return -EINVAL;
+	if (!tph->count)
+		return 0;
+
+	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
+	if (minsz < argsz)
+		return -EINVAL;
+
+	*info = memdup_user(arg, minsz);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+
+	return minsz;
+}
+
+static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device *vdev,
+				      struct vfio_pci_tph *tph,
+				      void __user *arg, size_t argsz)
+{
+	int i, mtype, err = 0;
+	u32 cpu_uid;
+	struct vfio_pci_tph_info *info = NULL;
+	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz, &info);
+
+	if (data_size <= 0)
+		return data_size;
+
+	for (i = 0; i < tph->count; i++) {
+		if (!(info[i].cpu_id < nr_cpu_ids && cpu_present(info[i].cpu_id))) {
+			info[i].err = -EINVAL;
+			continue;
+		}
+		cpu_uid = topology_core_id(info[i].cpu_id);
+		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
+			VFIO_TPH_MEM_TYPE_SHIFT;
+
+		/* processing hints are always ignored */
+		info[i].ph_ignore = 1;
+
+		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
+						  &info[i].st);
+		if (info[i].err)
+			continue;
+
+		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
+			info[i].err = pcie_tph_set_st_entry(vdev->pdev,
+							    info[i].index,
+							    info[i].st);
+		}
+	}
+
+	if (copy_to_user(arg, info, data_size))
+		err = -EFAULT;
+
+	kfree(info);
+	return err;
+}
+
+
+static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device *vdev,
+				       struct vfio_pci_tph *arg)
+{
+	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
+
+	switch (mode) {
+	case VFIO_TPH_ST_NS_MODE:
+		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_NS_MODE);
+
+	case VFIO_TPH_ST_IV_MODE:
+		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_IV_MODE);
+
+	case VFIO_TPH_ST_DS_MODE:
+		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_DS_MODE);
+
+	default:
+		return -EINVAL;
+	}
+
+}
+
+static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device *vdev)
+{
+	pcie_disable_tph(vdev->pdev);
+	return 0;
+}
+
+static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user *arg,
+					size_t argsz, u32 flags,
+					struct vfio_pci_tph *tph)
+{
+	u32 op;
+	int err = vfio_check_feature(flags, argsz,
+				 VFIO_DEVICE_FEATURE_SET |
+				 VFIO_DEVICE_FEATURE_GET,
+				 sizeof(struct vfio_pci_tph));
+	if (err != 1)
+		return err;
+
+	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
+		return -EFAULT;
+
+	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
+
+	switch (op) {
+	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
+	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
+	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
+		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -EINVAL;
+
+	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
+		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -EINVAL;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 flags,
+				     struct vfio_pci_tph __user *arg,
+				     size_t argsz)
+{
+	u32 op;
+	struct vfio_pci_tph tph;
+	void __user *uinfo;
+	size_t infosz;
+	struct vfio_pci_core_device *vdev =
+		container_of(device, struct vfio_pci_core_device, vdev);
+	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, &tph);
+
+	if (err)
+		return err;
+
+	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
+
+	switch (op) {
+	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
+		return vfio_pci_feature_tph_enable(vdev, &tph);
+
+	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
+		return vfio_pci_feature_tph_disable(vdev);
+
+	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
+	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
+		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, info);
+		infosz = argsz - sizeof(struct vfio_pci_tph);
+		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, infosz);
+
+	default:
+		return -EINVAL;
+	}
+}
+
 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 				void __user *arg, size_t argsz)
 {
@@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
 		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
+	case VFIO_DEVICE_FEATURE_PCI_TPH:
+		return vfio_pci_core_feature_tph(device, flags,
+						 arg, argsz);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index c8dbf8219c4f..608d57dfe279 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+/*
+ * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set steering tags
+ * on the device. Data provided when setting this feature is a __u32 with the
+ * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH in
+ * no-steering-tag, interrupt-vector, or device-specific mode when feature flags
+ * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE are set
+ * respectively.
+ * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
+ * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at an index in
+ * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used and device
+ * capabilities. The caller can set multiple steering tags by passing an array
+ * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
+ * MSI-X/ST-table index. The caller can also set the intended memory type and
+ * the processing hint by setting VFIO_TPH_MEM_TYPE_x and VFIO_TPH_HINT_x flags,
+ * respectively. The return value for each vfio_pci_tph_info object is stored in
+ * err, with the steering-tag set on the device and the ph_ignore status bit
+ * resulting from the steering-tag lookup operation. If err < 0, the values
+ * stored in the st and ph_ignore fields should be considered invalid.
+ *
+ * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the caller.
+ * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
+ * The return values per vfio_pci_tph_info object are stored in the st,
+ * ph_ignore, and err fields.
+ */
+struct vfio_pci_tph_info {
+	/* in */
+	__u32 cpu_id;
+	__u32 cache_level;
+	__u8  flags;
+#define VFIO_TPH_MEM_TYPE_MASK		0x1
+#define VFIO_TPH_MEM_TYPE_SHIFT		0
+#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request volatile memory ST */
+#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request persistent memory ST */
+
+#define VFIO_TPH_HINT_MASK		0x3
+#define VFIO_TPH_HINT_SHIFT		1
+#define VFIO_TPH_HINT_BIDIR		0
+#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
+#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
+#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)
+	__u16 index;			/* MSI-X/ST-table index to set ST */
+	/* out */
+	__u16 st;			/* Steering-Tag */
+	__u8  ph_ignore;		/* Processing hint was ignored by */
+	__s32 err;			/* Error on getting/setting Steering-Tag*/
+};
+
+struct vfio_pci_tph {
+	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
+	__u32 flags;
+#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
+#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
+#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/* Enable TPH on device */
+#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH on device */
+#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get steering-tags */
+#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set steering-rags */
+
+#define	VFIO_TPH_ST_MODE_MASK	(0x3 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
+#define	VFIO_TPH_ST_NS_MODE	(0 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
+#define	VFIO_TPH_ST_IV_MODE	(1 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
+#define	VFIO_TPH_ST_DS_MODE	(2 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
+	__u32 count;				/* Number of entries in info[] */
+	struct vfio_pci_tph_info info[];
+#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed in info[] */
+};
+
+#define VFIO_DEVICE_FEATURE_PCI_TPH 11
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.43.0

[RFC PATCH v2 0/1] VFIO ioctl for TLP Processing Hints
Posted by Wathsala Vithanage 6 months, 2 weeks ago
Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
direct cache injection. As described in the relevant patch set [1],
direct cache injection in supported hardware allows optimal platform
resource utilization for specific requests on the PCIe bus. This feature
is currently available only for kernel device drivers. However, user
space applications, especially those whose performance is sensitive to
the latency of inbound writes as seen by a CPU core, may benefit from
using this information (E.g., DPDK cache stashing RFC [2] or an HPC
application running in a VM). For such applications to enable and
configure TPH on PCIe devices, this RFC proposes a new VFIO ioctl.

[1] lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
[2] inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com

V1->V2:
 * Rewrite as a VFIO IOCTL
 * Add missing padding for structs
 * Increase maximum steering tags per IOCTL to 2048

Wathsala Vithanage (1):
  vfio/pci: add PCIe TPH device ioctl

 drivers/vfio/pci/vfio_pci_core.c | 153 +++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        |  83 +++++++++++++++++
 2 files changed, 236 insertions(+)

-- 
2.43.0
[RFC PATCH v2 1/1] vfio/pci: add PCIe TPH device ioctl
Posted by Wathsala Vithanage 6 months, 2 weeks ago
This patch introduces VFIO_DEVICE_PCI_TPH IOCTL to enable configuring of
TPH from the user space. It provides an interface to user space drivers
and VMMs to enable/disable TPH capability on PCIe devices and set
steering tags in MSI-X or steering-tag table entries or read steering
tags from the kernel to use them in device-specific mode.

Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 153 +++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h        |  83 +++++++++++++++++
 2 files changed, 236 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 6328c3a05bcd..efdc2f09aa75 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -29,6 +29,7 @@
 #include <linux/nospec.h>
 #include <linux/sched/mm.h>
 #include <linux/iommufd.h>
+#include <linux/pci-tph.h>
 #if IS_ENABLED(CONFIG_EEH)
 #include <asm/eeh.h>
 #endif
@@ -1444,6 +1445,156 @@ static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
 				  ioeventfd.fd);
 }
 
+static struct vfio_pci_tph_entry *vfio_pci_tph_get_ents(struct vfio_pci_tph *tph,
+							void __user *tph_ents,
+							size_t *ents_size)
+{
+	unsigned long minsz;
+	size_t size;
+
+	if (!ents_size)
+		return ERR_PTR(-EINVAL);
+
+	minsz = offsetofend(struct vfio_pci_tph, count);
+
+	size = tph->count * sizeof(struct vfio_pci_tph_entry);
+
+	if (tph->argsz - minsz < size)
+		return ERR_PTR(-EINVAL);
+
+	*ents_size = size;
+
+	return memdup_user(tph_ents, size);
+}
+
+static int vfio_pci_tph_set_st(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_tph_entry *ents, int count)
+{
+	int i, err = 0;
+
+	for (i = 0; i < count && !err; i++)
+		err = pcie_tph_set_st_entry(vdev->pdev, ents[i].index,
+					    ents[i].st);
+
+	return err;
+}
+
+static int vfio_pci_tph_get_st(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_tph_entry *ents, int count)
+{
+	int i, mtype, err = 0;
+	u32 cpu_uid;
+
+	for (i = 0; i < count && !err; i++) {
+		if (ents[i].cpu_id >= nr_cpu_ids || !cpu_present(ents[i].cpu_id)) {
+			err = -EINVAL;
+			break;
+		}
+
+		cpu_uid = topology_core_id(ents[i].cpu_id);
+		mtype = (ents[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
+			VFIO_TPH_MEM_TYPE_SHIFT;
+
+		/*
+		 * ph_ignore is always set.
+		 * TPH implementation of the PCI subsystem forces processing
+		 * hint to bi-directional by setting PH bits to 0 when
+		 * acquiring Steering Tags from the platform firmware. It also
+		 * discards the ph_ignore bit returned by firmware.
+		 */
+		ents[i].ph_ignore = 1;
+
+		err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
+					  &ents[i].st);
+	}
+
+	return err;
+}
+
+static int vfio_pci_tph_st_op(struct vfio_pci_core_device *vdev,
+			      struct vfio_pci_tph *tph, void __user *tph_ents)
+{
+	int err = 0;
+	struct vfio_pci_tph_entry *ents;
+	size_t ents_size;
+
+	if (!tph->count || tph->count > VFIO_TPH_INFO_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	ents = vfio_pci_tph_get_ents(tph, tph_ents, &ents_size);
+	if (IS_ERR(ents)) {
+		err = PTR_ERR(ents);
+		goto out;
+	}
+
+	err = vfio_pci_tph_get_st(vdev, ents, tph->count);
+	if (err)
+		goto out_free_ents;
+
+	/*
+	 * Set Steering tags. TPH will be disabled on the device by the PCI
+	 * subsystem if there is an error.
+	 */
+	if (tph->flags & VFIO_DEVICE_TPH_SET_ST) {
+		err = vfio_pci_tph_set_st(vdev, ents, tph->count);
+		if (err)
+			goto out_free_ents;
+	}
+
+	if (copy_to_user(tph_ents, ents, ents_size))
+		err = -EFAULT;
+
+out_free_ents:
+	kfree(ents);
+out:
+	return err;
+}
+
+static int vfio_pci_tph_enable(struct vfio_pci_core_device *vdev,
+			       struct vfio_pci_tph *arg)
+{
+	return pcie_enable_tph(vdev->pdev, arg->flags & VFIO_TPH_ST_MODE_MASK);
+}
+
+static int vfio_pci_tph_disable(struct vfio_pci_core_device *vdev)
+{
+	pcie_disable_tph(vdev->pdev);
+	return 0;
+}
+
+static int vfio_pci_ioctl_tph(struct vfio_pci_core_device *vdev,
+			      void __user *uarg)
+{
+	u32 op;
+	struct vfio_pci_tph tph;
+	size_t minsz = offsetofend(struct vfio_pci_tph, count);
+
+	if (copy_from_user(&tph, uarg, minsz))
+		return -EFAULT;
+
+	if (tph.argsz < minsz)
+		return -EINVAL;
+
+	op = tph.flags & VFIO_DEVICE_TPH_OP_MASK;
+
+	switch (op) {
+	case VFIO_DEVICE_TPH_ENABLE:
+		return vfio_pci_tph_enable(vdev, &tph);
+
+	case VFIO_DEVICE_TPH_DISABLE:
+		return vfio_pci_tph_disable(vdev);
+
+	case VFIO_DEVICE_TPH_GET_ST:
+	case VFIO_DEVICE_TPH_SET_ST:
+		return vfio_pci_tph_st_op(vdev, &tph, (u8 *)(uarg) + minsz);
+
+	default:
+		return -EINVAL;
+	}
+}
+
 long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 			 unsigned long arg)
 {
@@ -1468,6 +1619,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 		return vfio_pci_ioctl_reset(vdev, uarg);
 	case VFIO_DEVICE_SET_IRQS:
 		return vfio_pci_ioctl_set_irqs(vdev, uarg);
+	case VFIO_DEVICE_PCI_TPH:
+		return vfio_pci_ioctl_tph(vdev, uarg);
 	default:
 		return -ENOTTY;
 	}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5764f315137f..1a56abdf9deb 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -873,6 +873,88 @@ struct vfio_device_ioeventfd {
 
 #define VFIO_DEVICE_IOEVENTFD		_IO(VFIO_TYPE, VFIO_BASE + 16)
 
+/**
+ * VFIO_DEVICE_PCI_TPH	- _IO(VFIO_TYPE, VFIO_BASE + 22)
+ *
+ * This command is used to control PCIe TLP Processing Hints (TPH)
+ * capability in a PCIe device.
+ * It supports following operations on a PCIe device with respect to TPH
+ * capability.
+ *
+ * - Enabling/disabling TPH capability in a PCIe device.
+ *
+ *   Setting VFIO_DEVICE_TPH_ENABLE flag enables TPH in no-steering-tag,
+ *   interrupt-vector, or device-specific mode defined in the PCIe specficiation
+ *   when feature flags TPH_ST_NS_MODE, TPH_ST_IV_MODE, and TPH_ST_DS_MODE are
+ *   set respectively. TPH_ST_xx_MODE macros are defined in
+ *   uapi/linux/pci_regs.h.
+ *
+ *   VFIO_DEVICE_TPH_DISABLE disables PCIe TPH on the device.
+ *
+ * - Writing STs to MSI-X or ST table in a PCIe device.
+ *
+ *   VFIO_DEVICE_TPH_SET_ST flag set steering tags on a device at an index in
+ *   MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used and
+ *   returns the programmed steering tag values. The caller can set one or more
+ *   steering tags by passing an array of vfio_pci_tph_entry objects containing
+ *   cpu_id, cache_level, and MSI-X/ST-table index. The caller can also set the
+ *   intended memory type and the processing hint by setting VFIO_TPH_MEM_TYPE_x
+ *   and VFIO_TPH_HINT_x flags, respectively.
+ *
+ * - Reading Steering Tags (ST) from the host platform.
+ *
+ *   VFIO_DEVICE_TPH_GET_ST flags returns steering tags to the caller. Caller
+ *   can request one or more steering tags by passing an array of
+ *   vfio_pci_tph_entry objects. Steering Tag for each request is returned via
+ *   the st field in vfio_pci_tph_entry.
+ */
+struct vfio_pci_tph_entry {
+	/* in */
+	__u32 cpu_id;			/* CPU logical ID */
+	__u32 cache_level;		/* Cache level. L1 D= 0, L2D = 2, ...*/
+	__u8  flags;
+#define VFIO_TPH_MEM_TYPE_MASK		0x1
+#define VFIO_TPH_MEM_TYPE_SHIFT		0
+#define VFIO_TPH_MEM_TYPE_VMEM		0   /* Request volatile memory ST */
+#define VFIO_TPH_MEM_TYPE_PMEM		1   /* Request persistent memory ST */
+
+#define VFIO_TPH_HINT_MASK		0x3
+#define VFIO_TPH_HINT_SHIFT		1
+#define VFIO_TPH_HINT_BIDIR		0
+#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
+#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
+#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)
+	__u8  pad0;
+	__u16 index;			/* MSI-X/ST-table index to set ST */
+	/* out */
+	__u16 st;			/* Steering-Tag */
+	__u8  ph_ignore;		/* Platform ignored the Processing */
+	__u8  pad1;
+};
+
+struct vfio_pci_tph {
+	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
+	__u32 flags;
+#define VFIO_TPH_ST_MODE_MASK		0x7
+
+#define VFIO_DEVICE_TPH_OP_SHIFT	3
+#define VFIO_DEVICE_TPH_OP_MASK		(0x7 << VFIO_DEVICE_TPH_OP_SHIFT)
+/* Enable TPH on device */
+#define VFIO_DEVICE_TPH_ENABLE		0
+/* Disable TPH on device */
+#define VFIO_DEVICE_TPH_DISABLE		(1 << VFIO_DEVICE_TPH_OP_SHIFT)
+/* Get steering-tags */
+#define VFIO_DEVICE_TPH_GET_ST		(2 << VFIO_DEVICE_TPH_OP_SHIFT)
+/* Set steering-tags */
+#define VFIO_DEVICE_TPH_SET_ST		(4 << VFIO_DEVICE_TPH_OP_SHIFT)
+	__u32 count;			/* Number of entries in ents[] */
+	struct vfio_pci_tph_entry ents[];
+#define VFIO_TPH_INFO_MAX	2048	/* Max entries in ents[] */
+};
+
+#define VFIO_DEVICE_PCI_TPH	_IO(VFIO_TYPE, VFIO_BASE + 22)
+
+
 /**
  * VFIO_DEVICE_FEATURE - _IOWR(VFIO_TYPE, VFIO_BASE + 17,
  *			       struct vfio_device_feature)
@@ -1468,6 +1550,7 @@ struct vfio_device_feature_bus_master {
 };
 #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
 
+
 /* -------- API for Type1 VFIO IOMMU -------- */
 
 /**
-- 
2.43.0
Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
Posted by Jason Gunthorpe 9 months, 2 weeks ago
On Fri, Feb 21, 2025 at 10:46:33PM +0000, Wathsala Vithanage wrote:
> Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> direct cache injection. As described in the relevant patch set [1],
> direct cache injection in supported hardware allows optimal platform
> resource utilization for specific requests on the PCIe bus. This feature
> is currently available only for kernel device drivers. However,
> user space applications, especially those whose performance is sensitive
> to the latency of inbound writes as seen by a CPU core, may benefit from
> using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> application running in a VM).
> 
> This patch enables configuring of TPH from the user space via
> VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> set steering tags in MSI-X or steering-tag table entries using
> VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
> VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.

What level of protection do we expect to have here? Is it OK for
userspace to make up any old tag value or is there some security
concern with that?

Jason
Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
Posted by Philipp Stanner 9 months, 2 weeks ago
On Fri, 2025-02-21 at 22:46 +0000, Wathsala Vithanage wrote:
> Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature
> for
> direct cache injection. As described in the relevant patch set [1],
> direct cache injection in supported hardware allows optimal platform
> resource utilization for specific requests on the PCIe bus. This
> feature
> is currently available only for kernel device drivers. However,
> user space applications, especially those whose performance is
> sensitive
> to the latency of inbound writes as seen by a CPU core, may benefit
> from
> using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> application running in a VM).
> 
> This patch enables configuring of TPH from the user space via
> VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> drivers and VMMs to enable/disable the TPH feature on PCIe devices
> and
> set steering tags in MSI-X or steering-tag table entries using
> VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel
> using
> VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.
> 
> [1] 
> lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
> [2] 
> inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com
> 
> Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 163
> +++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h        |  68 +++++++++++++
>  2 files changed, 231 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c
> b/drivers/vfio/pci/vfio_pci_core.c
> index 586e49efb81b..d6dd0495b08b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -29,6 +29,7 @@
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
>  #include <linux/iommufd.h>
> +#include <linux/pci-tph.h>
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct
> vfio_device *device, u32 flags,
>  	return 0;
>  }
>  
> +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t
> argsz,
> +				      struct vfio_pci_tph_info
> **info)
> +{
> +	size_t minsz;
> +
> +	if (tph->count > VFIO_TPH_INFO_MAX)
> +		return -EINVAL;
> +	if (!tph->count)
> +		return 0;
> +
> +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> +	if (minsz < argsz)
> +		return -EINVAL;
> +
> +	*info = memdup_user(arg, minsz);

You can use memdup_array_user() instead of the lines above. It does the
multiplication plus overflow check for you and will make your code more
compact.

> +	if (IS_ERR(info))
> +		return PTR_ERR(info);
> +
> +	return minsz;

see below…

> +}
> +
> +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device
> *vdev,
> +				      struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t
> argsz)
> +{
> +	int i, mtype, err = 0;
> +	u32 cpu_uid;
> +	struct vfio_pci_tph_info *info = NULL;
> +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz,
> &info);
> +
> +	if (data_size <= 0)
> +		return data_size;

So it seems you return here in case of an error. However, that would
result in a length of 0 being an error?

I would try to avoid to return 0 for an error whenever possible. That
breaks convention.

How about you return the result value of memdup_array_user() in
…uinfo_dup()?

The only thing I can't tell is whether tph->count == 0 should be
treated as an error. Maybe map it to -EINVAL?


Regards,
P.

> +
> +	for (i = 0; i < tph->count; i++) {
> +		if (!(info[i].cpu_id < nr_cpu_ids &&
> cpu_present(info[i].cpu_id))) {
> +			info[i].err = -EINVAL;
> +			continue;
> +		}
> +		cpu_uid = topology_core_id(info[i].cpu_id);
> +		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> +			VFIO_TPH_MEM_TYPE_SHIFT;
> +
> +		/* processing hints are always ignored */
> +		info[i].ph_ignore = 1;
> +
> +		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype,
> cpu_uid,
> +						  &info[i].st);
> +		if (info[i].err)
> +			continue;
> +
> +		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> +			info[i].err = pcie_tph_set_st_entry(vdev-
> >pdev,
> +							   
> info[i].index,
> +							   
> info[i].st);
> +		}
> +	}
> +
> +	if (copy_to_user(arg, info, data_size))
> +		err = -EFAULT;
> +
> +	kfree(info);
> +	return err;
> +}
> +
> +
> +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device
> *vdev,
> +				       struct vfio_pci_tph *arg)
> +{
> +	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> +
> +	switch (mode) {
> +	case VFIO_TPH_ST_NS_MODE:
> +		return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_NS_MODE);
> +
> +	case VFIO_TPH_ST_IV_MODE:
> +		return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_IV_MODE);
> +
> +	case VFIO_TPH_ST_DS_MODE:
> +		return pcie_enable_tph(vdev->pdev,
> PCI_TPH_ST_DS_MODE);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +}
> +
> +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device
> *vdev)
> +{
> +	pcie_disable_tph(vdev->pdev);
> +	return 0;
> +}
> +
> +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user
> *arg,
> +					size_t argsz, u32 flags,
> +					struct vfio_pci_tph *tph)
> +{
> +	u32 op;
> +	int err = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET |
> +				 VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(struct vfio_pci_tph));
> +	if (err != 1)
> +		return err;
> +
> +	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> +		return -EFAULT;
> +
> +	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -
> EINVAL;
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -
> EINVAL;
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32
> flags,
> +				     struct vfio_pci_tph __user
> *arg,
> +				     size_t argsz)
> +{
> +	u32 op;
> +	struct vfio_pci_tph tph;
> +	void __user *uinfo;
> +	size_t infosz;
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device,
> vdev);
> +	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags,
> &tph);
> +
> +	if (err)
> +		return err;
> +
> +	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +		return vfio_pci_feature_tph_enable(vdev, &tph);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +		return vfio_pci_feature_tph_disable(vdev);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph,
> info);
> +		infosz = argsz - sizeof(struct vfio_pci_tph);
> +		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo,
> infosz);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32
> flags,
>  				void __user *arg, size_t argsz)
>  {
> @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct
> vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(device, flags, arg,
> argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(device, flags,
> arg, argsz);
> +	case VFIO_DEVICE_FEATURE_PCI_TPH:
> +		return vfio_pci_core_feature_tph(device, flags,
> +						 arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c8dbf8219c4f..608d57dfe279 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/*
> + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set
> steering tags
> + * on the device. Data provided when setting this feature is a __u32
> with the
> + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH
> in
> + * no-steering-tag, interrupt-vector, or device-specific mode when
> feature flags
> + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE
> are set
> + * respectively.
> + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at
> an index in
> + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used
> and device
> + * capabilities. The caller can set multiple steering tags by
> passing an array
> + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
> + * MSI-X/ST-table index. The caller can also set the intended memory
> type and
> + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and
> VFIO_TPH_HINT_x flags,
> + * respectively. The return value for each vfio_pci_tph_info object
> is stored in
> + * err, with the steering-tag set on the device and the ph_ignore
> status bit
> + * resulting from the steering-tag lookup operation. If err < 0, the
> values
> + * stored in the st and ph_ignore fields should be considered
> invalid.
> + *
> + * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the
> caller.
> + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the
> caller.
> + * The return values per vfio_pci_tph_info object are stored in the
> st,
> + * ph_ignore, and err fields.
> + */
> +struct vfio_pci_tph_info {
> +	/* in */
> +	__u32 cpu_id;
> +	__u32 cache_level;
> +	__u8  flags;
> +#define VFIO_TPH_MEM_TYPE_MASK		0x1
> +#define VFIO_TPH_MEM_TYPE_SHIFT		0
> +#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request volatile
> memory ST */
> +#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request
> persistent memory ST */
> +
> +#define VFIO_TPH_HINT_MASK		0x3
> +#define VFIO_TPH_HINT_SHIFT		1
> +#define VFIO_TPH_HINT_BIDIR		0
> +#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)
> +	__u16 index;			/* MSI-X/ST-table index to
> set ST */
> +	/* out */
> +	__u16 st;			/* Steering-Tag */
> +	__u8  ph_ignore;		/* Processing hint was
> ignored by */
> +	__s32 err;			/* Error on getting/setting
> Steering-Tag*/
> +};
> +
> +struct vfio_pci_tph {
> +	__u32 argsz;			/* Size of vfio_pci_tph and
> info[] */
> +	__u32 flags;
> +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
> +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
> +#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/* Enable
> TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable
> TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get
> steering-tags */
> +#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set
> steering-rags */
> +
> +#define	VFIO_TPH_ST_MODE_MASK	(0x3 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_NS_MODE	(0 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_IV_MODE	(1 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_DS_MODE	(2 <<
> VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +	__u32 count;				/* Number of entries
> in info[] */
> +	struct vfio_pci_tph_info info[];
> +#define VFIO_TPH_INFO_MAX	64		/* Max entries
> allowed in info[] */
> +};
> +
> +#define VFIO_DEVICE_FEATURE_PCI_TPH 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**
Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl
Posted by Alex Williamson 9 months, 2 weeks ago
On Fri, 21 Feb 2025 22:46:33 +0000
Wathsala Vithanage <wathsala.vithanage@arm.com> wrote:

> Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> direct cache injection. As described in the relevant patch set [1],
> direct cache injection in supported hardware allows optimal platform
> resource utilization for specific requests on the PCIe bus. This feature
> is currently available only for kernel device drivers. However,
> user space applications, especially those whose performance is sensitive
> to the latency of inbound writes as seen by a CPU core, may benefit from
> using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> application running in a VM).
> 
> This patch enables configuring of TPH from the user space via
> VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> set steering tags in MSI-X or steering-tag table entries using
> VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
> VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.

Unless I'm missing it, the RFC in [2] doesn't make use of this
proposal.  Is there published code anywhere that does use this
interface?

> [1] lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@amd.com
> [2] inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@arm.com
> 
> Signed-off-by: Wathsala Vithanage <wathsala.vithanage@arm.com>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 163 +++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h        |  68 +++++++++++++
>  2 files changed, 231 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 586e49efb81b..d6dd0495b08b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -29,6 +29,7 @@
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
>  #include <linux/iommufd.h>
> +#include <linux/pci-tph.h>
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
>  	return 0;
>  }
>  
> +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t argsz,
> +				      struct vfio_pci_tph_info **info)
> +{
> +	size_t minsz;
> +
> +	if (tph->count > VFIO_TPH_INFO_MAX)
> +		return -EINVAL;
> +	if (!tph->count)
> +		return 0;
> +
> +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> +	if (minsz < argsz)
> +		return -EINVAL;
> +
> +	*info = memdup_user(arg, minsz);
> +	if (IS_ERR(info))
> +		return PTR_ERR(info);
> +
> +	return minsz;
> +}
> +
> +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device *vdev,
> +				      struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t argsz)
> +{
> +	int i, mtype, err = 0;
> +	u32 cpu_uid;
> +	struct vfio_pci_tph_info *info = NULL;
> +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz, &info);
> +
> +	if (data_size <= 0)
> +		return data_size;
> +
> +	for (i = 0; i < tph->count; i++) {
> +		if (!(info[i].cpu_id < nr_cpu_ids && cpu_present(info[i].cpu_id))) {

Not intuitive logic, imo.  This could easily be:

		if (info[i].cpu_id >= nr_cpu_ids || !cpu_present(info[i].cpu_id))

> +			info[i].err = -EINVAL;
> +			continue;
> +		}
> +		cpu_uid = topology_core_id(info[i].cpu_id);
> +		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> +			VFIO_TPH_MEM_TYPE_SHIFT;
> +
> +		/* processing hints are always ignored */
> +		info[i].ph_ignore = 1;
> +
> +		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
> +						  &info[i].st);
> +		if (info[i].err)
> +			continue;
> +
> +		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> +			info[i].err = pcie_tph_set_st_entry(vdev->pdev,
> +							    info[i].index,
> +							    info[i].st);
> +		}
> +	}
> +
> +	if (copy_to_user(arg, info, data_size))
> +		err = -EFAULT;
> +
> +	kfree(info);
> +	return err;
> +}
> +
> +
> +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device *vdev,
> +				       struct vfio_pci_tph *arg)
> +{
> +	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> +
> +	switch (mode) {
> +	case VFIO_TPH_ST_NS_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_NS_MODE);
> +
> +	case VFIO_TPH_ST_IV_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_IV_MODE);
> +
> +	case VFIO_TPH_ST_DS_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_DS_MODE);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +}
> +
> +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device *vdev)
> +{
> +	pcie_disable_tph(vdev->pdev);
> +	return 0;
> +}
> +
> +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user *arg,
> +					size_t argsz, u32 flags,
> +					struct vfio_pci_tph *tph)
> +{
> +	u32 op;
> +	int err = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET |
> +				 VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(struct vfio_pci_tph));
> +	if (err != 1)
> +		return err;

We don't seem to account for a host booted with pci=notph.

> +
> +	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> +		return -EFAULT;
> +
> +	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -EINVAL;
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -EINVAL;

This is a convoluted mangling of an ioctl into a vfio feature.

> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 flags,
> +				     struct vfio_pci_tph __user *arg,
> +				     size_t argsz)
> +{
> +	u32 op;
> +	struct vfio_pci_tph tph;
> +	void __user *uinfo;
> +	size_t infosz;
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, &tph);
> +
> +	if (err)
> +		return err;
> +
> +	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +		return vfio_pci_feature_tph_enable(vdev, &tph);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +		return vfio_pci_feature_tph_disable(vdev);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, info);
> +		infosz = argsz - sizeof(struct vfio_pci_tph);
> +		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, infosz);

This is effectively encoding a regular ioctl as a feature.  See below.

> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  				void __user *arg, size_t argsz)
>  {
> @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_PCI_TPH:
> +		return vfio_pci_core_feature_tph(device, flags,
> +						 arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c8dbf8219c4f..608d57dfe279 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/*
> + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set steering tags
> + * on the device. Data provided when setting this feature is a __u32 with the
> + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH in
> + * no-steering-tag, interrupt-vector, or device-specific mode when feature flags
> + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE are set
> + * respectively.
> + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at an index in
> + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used and device
> + * capabilities. The caller can set multiple steering tags by passing an array
> + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
> + * MSI-X/ST-table index. The caller can also set the intended memory type and
> + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and VFIO_TPH_HINT_x flags,
> + * respectively. The return value for each vfio_pci_tph_info object is stored in
> + * err, with the steering-tag set on the device and the ph_ignore status bit
> + * resulting from the steering-tag lookup operation. If err < 0, the values
> + * stored in the st and ph_ignore fields should be considered invalid.
> + *

Sorry, I don't understand ph_ignore as described here.  It's only ever
set to 1.  I guess we assume the incoming state is zero.  But what does
it mean?

> + * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the caller.
> + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
> + * The return values per vfio_pci_tph_info object are stored in the st,
> + * ph_ignore, and err fields.

Why do we need to provide an interface to return the steering tags set
by the user?  Seems like this could be a write-only, SET, interface.

> + */
> +struct vfio_pci_tph_info {

This seems more of an _entry than an _info.  Note that vfio has various
INFO ioctls which make this nomenclature confusing.

> +	/* in */
> +	__u32 cpu_id;

The logical ID?

> +	__u32 cache_level;

Zero based?  1-based?  How many cache levels are there?  What's valid
here?

> +	__u8  flags;
> +#define VFIO_TPH_MEM_TYPE_MASK		0x1
> +#define VFIO_TPH_MEM_TYPE_SHIFT		0
> +#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request volatile memory ST */
> +#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request persistent memory ST */

Is there a relation to the cache_level here?  Spec references here and
below, assuming those are relevant to these values.

> +
> +#define VFIO_TPH_HINT_MASK		0x3
> +#define VFIO_TPH_HINT_SHIFT		1
> +#define VFIO_TPH_HINT_BIDIR		0
> +#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)

There needs to be a __u8 padding in here somewhere or flags extended to
__u16.

> +	__u16 index;			/* MSI-X/ST-table index to set ST */
> +	/* out */
> +	__u16 st;			/* Steering-Tag */

Sorry if I'm just not familiar with TPH, but why do we need to return
the ST?  Doesn't hardware make use of the ST index and the physical
value gets applied automatically in HW?

> +	__u8  ph_ignore;		/* Processing hint was ignored by */

Padding/alignment, same as above.  "ignored by..."  By what?  Is that
an error?

> +	__s32 err;			/* Error on getting/setting Steering-Tag*/
> +};

Generally we'd expect a system call either works or fails.  Why do we
need per entry error report?  Can't we validate and prepare the entire
operation before setting any of it into the device?  Ultimately we're
just writing hints to config space or MSI-X table space, so the write
operation itself is not likely to be the point of failure.

> +
> +struct vfio_pci_tph {
> +	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
> +	__u32 flags;
> +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
> +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
> +#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/* Enable TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get steering-tags */
> +#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set steering-rags */

s/rags/tags/

vfio device features already have GET and SET as part of their base
structure, why are they duplicated here?  It really seems like there
are two separate features here, one that allows enabling with a given
mode or disable, and another that allows writing specific steering
tags.  Both seem like they could be SET-only features.  It's also not
clear to me that there's a significant advantage to providing an array
of steering tags.  Isn't updating STs an infrequent operation and
likely bound to at most 2K tags in the case of MSI-X?

> +
> +#define	VFIO_TPH_ST_MODE_MASK	(0x3 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_NS_MODE	(0 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_IV_MODE	(1 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_DS_MODE	(2 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +	__u32 count;				/* Number of entries in info[] */
> +	struct vfio_pci_tph_info info[];
> +#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed in info[] */

This seems to match the limit if the table is located in extended
config space, but it's not particularly relevant if the table is
located in MSI-X space.  Why is this a good limit?

Also, try to keep these all in 80 column.  Thanks,

Alex

> +};
> +
> +#define VFIO_DEVICE_FEATURE_PCI_TPH 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**