[RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE

Aneesh Kumar K.V (Arm) posted 38 patches 2 months, 1 week ago
[RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Aneesh Kumar K.V (Arm) 2 months, 1 week ago
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
 drivers/iommu/iommufd/device.c          | 54 +++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  4 ++
 drivers/iommu/iommufd/main.c            |  3 ++
 drivers/vfio/pci/vfio_pci_core.c        |  9 ++++-
 include/linux/iommufd.h                 |  1 +
 include/uapi/linux/iommufd.h            |  1 +
 6 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 65fbd098f9e9..3bd6972836a1 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1660,3 +1660,57 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
 	iommufd_put_object(ucmd->ictx, &idev->obj);
 	return rc;
 }
+
+static int iommufd_device_option_exclusive_ranges(struct iommu_option *cmd,
+					  struct iommufd_device *idev)
+{
+	if (cmd->op == IOMMU_OPTION_OP_GET) {
+		cmd->val64 = idev->flags & IOMMUFD_DEVICE_EXCLUSIVE_RANGE;
+		return 0;
+	}
+
+	if (cmd->op == IOMMU_OPTION_OP_SET) {
+		if (cmd->val64 == 0) {
+			idev->flags &= ~IOMMUFD_DEVICE_EXCLUSIVE_RANGE;
+			return 0;
+		} else if (cmd->val64 & IOMMUFD_DEVICE_EXCLUSIVE_RANGE) {
+			idev->flags |= IOMMUFD_DEVICE_EXCLUSIVE_RANGE;
+			return 0;
+		}
+		return -EINVAL;
+	}
+	return -EOPNOTSUPP;
+}
+bool iommufd_device_need_exclusive_range(struct iommufd_device *idev)
+{
+	return !!(idev->flags & IOMMUFD_DEVICE_EXCLUSIVE_RANGE);
+}
+
+int iommufd_device_option(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_option *cmd = ucmd->cmd;
+	struct iommufd_device *idev;
+	int rc = 0;
+
+	if (cmd->__reserved)
+		return -EOPNOTSUPP;
+
+	idev = iommufd_get_device(ucmd, cmd->object_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+
+	mutex_lock(&idev->igroup->lock);
+
+
+	switch (cmd->option_id) {
+	case IOMMU_OPTION_EXCLUSIVE_RANGES:
+		rc = iommufd_device_option_exclusive_ranges(cmd, idev);
+		break;
+	default:
+		rc = -EOPNOTSUPP;
+	}
+
+	mutex_unlock(&idev->igroup->lock);
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 0da2a81eedfa..fce68714c80f 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -346,6 +346,7 @@ int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
+int iommufd_device_option(struct iommufd_ucmd *ucmd);
 int iommufd_option_rlimit_mode(struct iommu_option *cmd,
 			       struct iommufd_ctx *ictx);
 
@@ -489,10 +490,13 @@ struct iommufd_device {
 	/* always the physical device */
 	struct device *dev;
 	bool enforce_cache_coherency;
+	unsigned long flags;
 	struct iommufd_vdevice *vdev;
 	bool destroying;
 };
 
+#define IOMMUFD_DEVICE_EXCLUSIVE_RANGE		BIT(0)
+
 static inline struct iommufd_device *
 iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id)
 {
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 15af7ced0501..89830da8b418 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -376,6 +376,9 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
 	case IOMMU_OPTION_HUGE_PAGES:
 		rc = iommufd_ioas_option(ucmd);
 		break;
+	case IOMMU_OPTION_EXCLUSIVE_RANGES:
+		rc = iommufd_device_option(ucmd);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 6328c3a05bcd..bee3cf3226e9 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1753,8 +1753,15 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
 	 * we need to request the region and the barmap tracks that.
 	 */
 	if (!vdev->barmap[index]) {
-		ret = pci_request_selected_regions(pdev,
+
+		if (core_vdev->iommufd_device &&
+		    iommufd_device_need_exclusive_range(core_vdev->iommufd_device))
+			ret = pci_request_selected_regions_exclusive(pdev,
+							1 << index, "vfio-pci");
+		else
+			ret = pci_request_selected_regions(pdev,
 						   1 << index, "vfio-pci");
+
 		if (ret)
 			return ret;
 
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 6e7efe83bc5d..55ae02581f9b 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -70,6 +70,7 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid);
 
 struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev);
 u32 iommufd_device_to_id(struct iommufd_device *idev);
+bool iommufd_device_need_exclusive_range(struct iommufd_device *idev);
 
 struct iommufd_access_ops {
 	u8 needs_pin_pages : 1;
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index c218c89e0e2e..548d4b5afcd4 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -310,6 +310,7 @@ struct iommu_ioas_unmap {
 enum iommufd_option {
 	IOMMU_OPTION_RLIMIT_MODE = 0,
 	IOMMU_OPTION_HUGE_PAGES = 1,
+	IOMMU_OPTION_EXCLUSIVE_RANGES = 2,
 };
 
 /**
-- 
2.43.0
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by dan.j.williams@intel.com 2 months ago
Aneesh Kumar K.V (Arm) wrote:
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
>  drivers/iommu/iommufd/device.c          | 54 +++++++++++++++++++++++++
>  drivers/iommu/iommufd/iommufd_private.h |  4 ++
>  drivers/iommu/iommufd/main.c            |  3 ++
>  drivers/vfio/pci/vfio_pci_core.c        |  9 ++++-
>  include/linux/iommufd.h                 |  1 +
>  include/uapi/linux/iommufd.h            |  1 +
>  6 files changed, 71 insertions(+), 1 deletion(-)

I think we simply make the rule be something like this:

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 700addee8f62..d84158aacabf 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -138,6 +138,7 @@ config PCI_IDE_STREAM_MAX
 
 config PCI_TSM
 	bool "PCI TSM: Device security protocol support"
+	depends on IO_STRICT_DEVMEM
 	select PCI_IDE
 	select PCI_DOE
 	help

...i.e. the base expectation is there is only ever one owner of
potentially private MMIO space.
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Xu Yilun 2 months, 1 week ago
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 6328c3a05bcd..bee3cf3226e9 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1753,8 +1753,15 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
>  	 * we need to request the region and the barmap tracks that.
>  	 */
>  	if (!vdev->barmap[index]) {
> -		ret = pci_request_selected_regions(pdev,
> +
> +		if (core_vdev->iommufd_device &&
> +		    iommufd_device_need_exclusive_range(core_vdev->iommufd_device))
> +			ret = pci_request_selected_regions_exclusive(pdev,
> +							1 << index, "vfio-pci");
> +		else
> +			ret = pci_request_selected_regions(pdev,
>  						   1 << index, "vfio-pci");
> +
>  		if (ret)
>  			return ret;

I did't get the idea.

The purpose of my original patch [1] is not to make VFIO choose between
pci_request_regions_exclusive() or pci_request_regions(). It is mainly
to prevent userspace mmap/read/write against a vfio_cdev FD. For
example:

  If pci_request_selected_regions() is succesfully executed on mmap(),
  later TSM Bind would fail on its pci_request_regions_exclusive(). It
  means userspace should not mmap otherwise you can't do private
  assignment. Vice versa, if you've done TSM Bind, you cannot mmap
  anymore.

The _exclusive is just a bonus that further prevents "/dev/mem and the
sysfs MMIO access"

[1]: https://lore.kernel.org/all/20250529053513.1592088-20-yilun.xu@linux.intel.com/

Thanks,
Yilun
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Jason Gunthorpe 2 months, 1 week ago
On Mon, Jul 28, 2025 at 07:21:43PM +0530, Aneesh Kumar K.V (Arm) wrote:
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>

Why would we need this?

I can sort of understand why Intel would need it due to their issues
with MCE, but ARM shouldn't care either way, should it?

But also why is it an iommufd option? That doesn't seem right..

Jason
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Aneesh Kumar K.V 2 months, 1 week ago
Jason Gunthorpe <jgg@ziepe.ca> writes:

> On Mon, Jul 28, 2025 at 07:21:43PM +0530, Aneesh Kumar K.V (Arm) wrote:
>> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
>
> Why would we need this?
>
> I can sort of understand why Intel would need it due to their issues
> with MCE, but ARM shouldn't care either way, should it?
>
> But also why is it an iommufd option? That doesn't seem right..
>
> Jason

This is based on our previous discussion https://lore.kernel.org/all/20250606120919.GH19710@nvidia.com

IIUC, we intend to request the resource in exclusive mode for secure
guests—regardless of whether the platform is Intel or ARM. Could you
help clarify the MCE issue observed on Intel platforms in this context?

-aneesh
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Jason Gunthorpe 2 months, 1 week ago
On Tue, Jul 29, 2025 at 01:58:54PM +0530, Aneesh Kumar K.V wrote:
> Jason Gunthorpe <jgg@ziepe.ca> writes:
> 
> > On Mon, Jul 28, 2025 at 07:21:43PM +0530, Aneesh Kumar K.V (Arm) wrote:
> >> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> >
> > Why would we need this?
> >
> > I can sort of understand why Intel would need it due to their issues
> > with MCE, but ARM shouldn't care either way, should it?
> >
> > But also why is it an iommufd option? That doesn't seem right..
> >
> > Jason
> 
> This is based on our previous discussion https://lore.kernel.org/all/20250606120919.GH19710@nvidia.com

I suggested a global option, this is a per-device option, and that
especially seems wrong for iommufd. If it is per-device that is vfio,
if it is global then vfio can pick it up during the early phases of
opening the device.

> IIUC, we intend to request the resource in exclusive mode for secure
> guests—regardless of whether the platform is Intel or ARM. Could you
> help clarify the MCE issue observed on Intel platforms in this context?

As I understand it Intel MCEs if the non-secure side ever reads from
secure'd address space. So there is alot of emphasis there to ensure
there are no CPU mappings.

Jason
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Xu Yilun 2 months, 1 week ago
On Tue, Jul 29, 2025 at 11:29:17AM -0300, Jason Gunthorpe wrote:
> On Tue, Jul 29, 2025 at 01:58:54PM +0530, Aneesh Kumar K.V wrote:
> > Jason Gunthorpe <jgg@ziepe.ca> writes:
> > 
> > > On Mon, Jul 28, 2025 at 07:21:43PM +0530, Aneesh Kumar K.V (Arm) wrote:
> > >> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> > >
> > > Why would we need this?
> > >
> > > I can sort of understand why Intel would need it due to their issues
> > > with MCE, but ARM shouldn't care either way, should it?
> > >
> > > But also why is it an iommufd option? That doesn't seem right..
> > >
> > > Jason
> > 
> > This is based on our previous discussion https://lore.kernel.org/all/20250606120919.GH19710@nvidia.com
> 
> I suggested a global option, this is a per-device option, and that
> especially seems wrong for iommufd. If it is per-device that is vfio,

I think this should be per-device. The original purpose of this
pci_region_request_*() is to prevent further mmap/read/write against
a vfio_cdev FD which would be used for private assignment. You shouldn't
prevent all other devices from working with userspace APPs (e.g. DPDK)
if there is one private assignment in system.

> if it is global then vfio can pick it up during the early phases of
> opening the device.
> 
> > IIUC, we intend to request the resource in exclusive mode for secure
> > guests—regardless of whether the platform is Intel or ARM. Could you
> > help clarify the MCE issue observed on Intel platforms in this context?
> 
> As I understand it Intel MCEs if the non-secure side ever reads from
> secure'd address space. So there is alot of emphasis there to ensure

Yeah, Intel TDX doesn't have a lower access control table for CC. So if
host reads, the TLP sends and MCE happens.

Thanks,
Yilun

> there are no CPU mappings.
> 
> Jason
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Jason Gunthorpe 2 months ago
On Wed, Jul 30, 2025 at 02:55:02PM +0800, Xu Yilun wrote:
> On Tue, Jul 29, 2025 at 11:29:17AM -0300, Jason Gunthorpe wrote:
> > On Tue, Jul 29, 2025 at 01:58:54PM +0530, Aneesh Kumar K.V wrote:
> > > Jason Gunthorpe <jgg@ziepe.ca> writes:
> > > 
> > > > On Mon, Jul 28, 2025 at 07:21:43PM +0530, Aneesh Kumar K.V (Arm) wrote:
> > > >> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> > > >
> > > > Why would we need this?
> > > >
> > > > I can sort of understand why Intel would need it due to their issues
> > > > with MCE, but ARM shouldn't care either way, should it?
> > > >
> > > > But also why is it an iommufd option? That doesn't seem right..
> > > >
> > > > Jason
> > > 
> > > This is based on our previous discussion https://lore.kernel.org/all/20250606120919.GH19710@nvidia.com
> > 
> > I suggested a global option, this is a per-device option, and that
> > especially seems wrong for iommufd. If it is per-device that is vfio,
> 
> I think this should be per-device.

IMHO there is no use case for that, it should arguably be global to
the whole kernel.

> The original purpose of this pci_region_request_*() is to prevent
> further mmap/read/write against a vfio_cdev FD which would be used

No way, the VFIO internal mmap should be controled by VFIO not by
request region. If you want to block that it should be blocked by
iommufd telling VFIO that the device is bound which revokes the
mmaps/dmabufs/etc and prevents opening new ones.

The only thing request region should do is prevent /sys/../resource,
/dev/mem users and so on, which is why it can and should be
global. Arguably VFIO should always block those things but
historically hasn't..

There should only be one request region call in VFIO, it should
ideally happen when the VFIO driver probes the device.

Jason
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Xu Yilun 2 months ago
On Thu, Jul 31, 2025 at 09:22:17AM -0300, Jason Gunthorpe wrote:
> On Wed, Jul 30, 2025 at 02:55:02PM +0800, Xu Yilun wrote:
> > On Tue, Jul 29, 2025 at 11:29:17AM -0300, Jason Gunthorpe wrote:
> > > On Tue, Jul 29, 2025 at 01:58:54PM +0530, Aneesh Kumar K.V wrote:
> > > > Jason Gunthorpe <jgg@ziepe.ca> writes:
> > > > 
> > > > > On Mon, Jul 28, 2025 at 07:21:43PM +0530, Aneesh Kumar K.V (Arm) wrote:
> > > > >> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> > > > >
> > > > > Why would we need this?
> > > > >
> > > > > I can sort of understand why Intel would need it due to their issues
> > > > > with MCE, but ARM shouldn't care either way, should it?
> > > > >
> > > > > But also why is it an iommufd option? That doesn't seem right..
> > > > >
> > > > > Jason
> > > > 
> > > > This is based on our previous discussion https://lore.kernel.org/all/20250606120919.GH19710@nvidia.com
> > > 
> > > I suggested a global option, this is a per-device option, and that
> > > especially seems wrong for iommufd. If it is per-device that is vfio,
> > 
> > I think this should be per-device.
> 
> IMHO there is no use case for that, it should arguably be global to
> the whole kernel.

I think there are 2 topics here:

1. Prevent VFIO mmap
2. Prevent /sys/../resource, /dev/mem users

I assume you are refering to the 2nd, then I agree.

> 
> > The original purpose of this pci_region_request_*() is to prevent
> > further mmap/read/write against a vfio_cdev FD which would be used
> 
> No way, the VFIO internal mmap should be controled by VFIO not by

I assume your point is never to use more than one request region in the
same driver to achieve some mutual exclusion. I'm good to it. We could
switch to some bound flag.

> request region. If you want to block that it should be blocked by
> iommufd telling VFIO that the device is bound which revokes the
> mmaps/dmabufs/etc and prevents opening new ones.

Agree.

> 
> The only thing request region should do is prevent /sys/../resource,
> /dev/mem users and so on, which is why it can and should be
> global. Arguably VFIO should always block those things but
> historically hasn't..

Agree. So seems no need a global option?

Thanks,
Yilun

> 
> There should only be one request region call in VFIO, it should
> ideally happen when the VFIO driver probes the device.
> 
> Jason
Re: [RFC PATCH v1 06/38] iommufd: Add and option to request for bar mapping with IORESOURCE_EXCLUSIVE
Posted by Jason Gunthorpe 2 months ago
On Tue, Aug 05, 2025 at 10:26:06AM +0800, Xu Yilun wrote:
> > IMHO there is no use case for that, it should arguably be global to
> > the whole kernel.
> 
> I think there are 2 topics here:
> 
> 1. Prevent VFIO mmap
> 2. Prevent /sys/../resource, /dev/mem users
> 
> I assume you are refering to the 2nd, then I agree.

Yes, the region stuff is only about #2.

> > 
> > > The original purpose of this pci_region_request_*() is to prevent
> > > further mmap/read/write against a vfio_cdev FD which would be used
> > 
> > No way, the VFIO internal mmap should be controled by VFIO not by
> 
> I assume your point is never to use more than one request region in the
> same driver to achieve some mutual exclusion. I'm good to it. We could
> switch to some bound flag.

Yes and yes

> > The only thing request region should do is prevent /sys/../resource,
> > /dev/mem users and so on, which is why it can and should be
> > global. Arguably VFIO should always block those things but
> > historically hasn't..
> 
> Agree. So seems no need a global option?

I'd ask Alex if he is OK with a global behavior change to make vfio
exclusive, after any required fixing to make vfio only request the
regions once at driver probe time..

Jason