For systems that require MSI pages to be mapped into the IOMMU translation
the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default
recommended IOVA window to place these mappings. However, there is nothing
special about this address. And to support the RMR trick in VMM for nested
translation, the VMM needs to know what sw_msi window the kernel is using.
As there is no particular reason to force VMM to adopt the kernel default,
provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use
to directly specify the sw_msi window that it wants to use, which replaces
and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having
to build an API to discover the default IOMMU_RESV_SW_MSI.
Since iommufd now has its own sw_msi function, this is easy to implement.
To keep things simple, the parameters are global to the entire iommufd FD,
and will directly replace the IOMMU_RESV_SW_MSI values. The VMM must set
the values before creating any hwpt's to have any effect.
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/iommu/iommufd/iommufd_private.h | 4 +++
include/uapi/linux/iommufd.h | 18 ++++++++++++-
drivers/iommu/iommufd/device.c | 4 +++
drivers/iommu/iommufd/io_pagetable.c | 4 ++-
drivers/iommu/iommufd/ioas.c | 34 +++++++++++++++++++++++++
drivers/iommu/iommufd/main.c | 6 +++++
6 files changed, 68 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 3e83bbb5912c..9f071609f00b 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -45,6 +45,9 @@ struct iommufd_ctx {
struct mutex sw_msi_lock;
struct list_head sw_msi_list;
unsigned int sw_msi_id;
+ /* User-programmed SW_MSI region, to override igroup->sw_msi_start */
+ phys_addr_t sw_msi_start;
+ size_t sw_msi_size;
u8 account_mode;
/* Compatibility with VFIO no iommu */
@@ -281,6 +284,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
+int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx);
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
int iommufd_check_iova_range(struct io_pagetable *iopt,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 34810f6ae2b5..c864a201e502 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -294,7 +294,9 @@ struct iommu_ioas_unmap {
/**
* enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
- * ioctl(IOMMU_OPTION_HUGE_PAGES)
+ * ioctl(IOMMU_OPTION_HUGE_PAGES) and
+ * ioctl(IOMMU_OPTION_SW_MSI_START) and
+ * ioctl(IOMMU_OPTION_SW_MSI_SIZE)
* @IOMMU_OPTION_RLIMIT_MODE:
* Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
* to invoke this. Value 0 (default) is user based accounting, 1 uses process
@@ -304,10 +306,24 @@ struct iommu_ioas_unmap {
* iommu mappings. Value 0 disables combining, everything is mapped to
* PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
* option, the object_id must be the IOAS ID.
+ * @IOMMU_OPTION_SW_MSI_START:
+ * Change the base address of the IOMMU mapping region for MSI doorbell(s).
+ * It must be set this before attaching a device to an IOAS/HWPT, otherwise
+ * this option will be not effective on that IOAS/HWPT. User can choose to
+ * let kernel pick a base address, by simply ignoring this option or setting
+ * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0
+ * @IOMMU_OPTION_SW_MSI_SIZE:
+ * Change the size of the IOMMU mapping region for MSI doorbell(s). It must
+ * be set this before attaching a device to an IOAS/HWPT, otherwise it won't
+ * be effective on that IOAS/HWPT. The value is in MB, and the minimum value
+ * is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address
+ * value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id must be 0
*/
enum iommufd_option {
IOMMU_OPTION_RLIMIT_MODE = 0,
IOMMU_OPTION_HUGE_PAGES = 1,
+ IOMMU_OPTION_SW_MSI_START = 2,
+ IOMMU_OPTION_SW_MSI_SIZE = 3,
};
/**
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index f75b3c23cd41..093a3bd798db 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -445,10 +445,14 @@ static int
iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
struct iommufd_hwpt_paging *hwpt_paging)
{
+ struct iommufd_ctx *ictx = idev->ictx;
int rc;
lockdep_assert_held(&idev->igroup->lock);
+ /* Override it with a user-programmed SW_MSI region */
+ if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
+ idev->igroup->sw_msi_start = ictx->sw_msi_start;
rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
idev->dev,
&idev->igroup->sw_msi_start);
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 8a790e597e12..5d7f5ca1eecf 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
num_hw_msi++;
if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
- *sw_msi_start = resv->start;
+ /* Bypass the driver-defined SW_MSI region, if preset */
+ if (*sw_msi_start == PHYS_ADDR_MAX)
+ *sw_msi_start = resv->start;
num_sw_msi++;
}
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 1542c5fd10a8..3f4e25b660f9 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -620,6 +620,40 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
return -EOPNOTSUPP;
}
+int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx)
+{
+ if (cmd->object_id)
+ return -EOPNOTSUPP;
+
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_SW_MSI_START:
+ cmd->val64 = (u64)ictx->sw_msi_start;
+ break;
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ cmd->val64 = (u64)ictx->sw_msi_size;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+ }
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_SW_MSI_START:
+ ictx->sw_msi_start = (phys_addr_t)cmd->val64;
+ break;
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ ictx->sw_msi_size = (size_t)cmd->val64;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
struct iommufd_ioas *ioas)
{
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 7cc9497b7193..026297265c71 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -229,6 +229,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
init_waitqueue_head(&ictx->destroy_wait);
mutex_init(&ictx->sw_msi_lock);
INIT_LIST_HEAD(&ictx->sw_msi_list);
+ ictx->sw_msi_start = PHYS_ADDR_MAX;
+ ictx->sw_msi_size = 0;
filp->private_data = ictx;
return 0;
}
@@ -287,6 +289,10 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
case IOMMU_OPTION_RLIMIT_MODE:
rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
break;
+ case IOMMU_OPTION_SW_MSI_START:
+ case IOMMU_OPTION_SW_MSI_SIZE:
+ rc = iommufd_option_sw_msi(cmd, ucmd->ictx);
+ break;
case IOMMU_OPTION_HUGE_PAGES:
rc = iommufd_ioas_option(ucmd);
break;
--
2.43.0
Hi,
On 1/11/25 4:32 AM, Nicolin Chen wrote:
> For systems that require MSI pages to be mapped into the IOMMU translation
> the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default
> recommended IOVA window to place these mappings. However, there is nothing
> special about this address. And to support the RMR trick in VMM for nested
well at least it shall not overlap VMM's RAM. So it was not random either.
> translation, the VMM needs to know what sw_msi window the kernel is using.
> As there is no particular reason to force VMM to adopt the kernel default,
> provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use
> to directly specify the sw_msi window that it wants to use, which replaces
> and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having
> to build an API to discover the default IOMMU_RESV_SW_MSI.
IIUC the MSI window will then be different when using legacy VFIO
assignment and iommufd backend.
MSI reserved regions are exposed in
/sys/kernel/iommu_groups/<n>/reserved_regions
0x0000000008000000 0x00000000080fffff msi
Is that configurability reflected accordingly?
How do you make sure it does not collide with other resv regions? I
don't see any check here.
>
> Since iommufd now has its own sw_msi function, this is easy to implement.
>
> To keep things simple, the parameters are global to the entire iommufd FD,
> and will directly replace the IOMMU_RESV_SW_MSI values. The VMM must set
> the values before creating any hwpt's to have any effect.
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
> drivers/iommu/iommufd/iommufd_private.h | 4 +++
> include/uapi/linux/iommufd.h | 18 ++++++++++++-
> drivers/iommu/iommufd/device.c | 4 +++
> drivers/iommu/iommufd/io_pagetable.c | 4 ++-
> drivers/iommu/iommufd/ioas.c | 34 +++++++++++++++++++++++++
> drivers/iommu/iommufd/main.c | 6 +++++
> 6 files changed, 68 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
> index 3e83bbb5912c..9f071609f00b 100644
> --- a/drivers/iommu/iommufd/iommufd_private.h
> +++ b/drivers/iommu/iommufd/iommufd_private.h
> @@ -45,6 +45,9 @@ struct iommufd_ctx {
> struct mutex sw_msi_lock;
> struct list_head sw_msi_list;
> unsigned int sw_msi_id;
> + /* User-programmed SW_MSI region, to override igroup->sw_msi_start */
> + phys_addr_t sw_msi_start;
> + size_t sw_msi_size;
>
> u8 account_mode;
> /* Compatibility with VFIO no iommu */
> @@ -281,6 +284,7 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
> int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
> int iommufd_option_rlimit_mode(struct iommu_option *cmd,
> struct iommufd_ctx *ictx);
> +int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx);
>
> int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
> int iommufd_check_iova_range(struct io_pagetable *iopt,
> diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
> index 34810f6ae2b5..c864a201e502 100644
> --- a/include/uapi/linux/iommufd.h
> +++ b/include/uapi/linux/iommufd.h
> @@ -294,7 +294,9 @@ struct iommu_ioas_unmap {
>
> /**
> * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
> - * ioctl(IOMMU_OPTION_HUGE_PAGES)
> + * ioctl(IOMMU_OPTION_HUGE_PAGES) and
> + * ioctl(IOMMU_OPTION_SW_MSI_START) and
> + * ioctl(IOMMU_OPTION_SW_MSI_SIZE)
> * @IOMMU_OPTION_RLIMIT_MODE:
> * Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
> * to invoke this. Value 0 (default) is user based accounting, 1 uses process
> @@ -304,10 +306,24 @@ struct iommu_ioas_unmap {
> * iommu mappings. Value 0 disables combining, everything is mapped to
> * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
> * option, the object_id must be the IOAS ID.
> + * @IOMMU_OPTION_SW_MSI_START:
> + * Change the base address of the IOMMU mapping region for MSI doorbell(s).
> + * It must be set this before attaching a device to an IOAS/HWPT, otherwise
> + * this option will be not effective on that IOAS/HWPT. User can choose to
> + * let kernel pick a base address, by simply ignoring this option or setting
> + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0
I think we should document it cannot be put at a random place either.
> + * @IOMMU_OPTION_SW_MSI_SIZE:
> + * Change the size of the IOMMU mapping region for MSI doorbell(s). It must
> + * be set this before attaching a device to an IOAS/HWPT, otherwise it won't
> + * be effective on that IOAS/HWPT. The value is in MB, and the minimum value
> + * is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address
> + * value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id must be 0
> */
> enum iommufd_option {
> IOMMU_OPTION_RLIMIT_MODE = 0,
> IOMMU_OPTION_HUGE_PAGES = 1,
> + IOMMU_OPTION_SW_MSI_START = 2,
> + IOMMU_OPTION_SW_MSI_SIZE = 3,
> };
>
> /**
> diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
> index f75b3c23cd41..093a3bd798db 100644
> --- a/drivers/iommu/iommufd/device.c
> +++ b/drivers/iommu/iommufd/device.c
> @@ -445,10 +445,14 @@ static int
> iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
> struct iommufd_hwpt_paging *hwpt_paging)
> {
> + struct iommufd_ctx *ictx = idev->ictx;
> int rc;
>
> lockdep_assert_held(&idev->igroup->lock);
>
> + /* Override it with a user-programmed SW_MSI region */
> + if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
> + idev->igroup->sw_msi_start = ictx->sw_msi_start;
> rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
> idev->dev,
> &idev->igroup->sw_msi_start);
> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
> index 8a790e597e12..5d7f5ca1eecf 100644
> --- a/drivers/iommu/iommufd/io_pagetable.c
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
> if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
> num_hw_msi++;
> if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
> - *sw_msi_start = resv->start;
> + /* Bypass the driver-defined SW_MSI region, if preset */
> + if (*sw_msi_start == PHYS_ADDR_MAX)
> + *sw_msi_start = resv->start;
> num_sw_msi++;
> }
>
> diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
> index 1542c5fd10a8..3f4e25b660f9 100644
> --- a/drivers/iommu/iommufd/ioas.c
> +++ b/drivers/iommu/iommufd/ioas.c
> @@ -620,6 +620,40 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
> return -EOPNOTSUPP;
> }
>
> +int iommufd_option_sw_msi(struct iommu_option *cmd, struct iommufd_ctx *ictx)
> +{
> + if (cmd->object_id)
> + return -EOPNOTSUPP;
> +
> + if (cmd->op == IOMMU_OPTION_OP_GET) {
> + switch (cmd->option_id) {
> + case IOMMU_OPTION_SW_MSI_START:
> + cmd->val64 = (u64)ictx->sw_msi_start;
> + break;
> + case IOMMU_OPTION_SW_MSI_SIZE:
> + cmd->val64 = (u64)ictx->sw_msi_size;
> + break;
> + default:
> + return -EOPNOTSUPP;
> + }
> + return 0;
> + }
> + if (cmd->op == IOMMU_OPTION_OP_SET) {
> + switch (cmd->option_id) {
> + case IOMMU_OPTION_SW_MSI_START:
> + ictx->sw_msi_start = (phys_addr_t)cmd->val64;
> + break;
> + case IOMMU_OPTION_SW_MSI_SIZE:
> + ictx->sw_msi_size = (size_t)cmd->val64;
> + break;
> + default:
> + return -EOPNOTSUPP;
> + }
> + return 0;
> + }
> + return -EOPNOTSUPP;
> +}
> +
> static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
> struct iommufd_ioas *ioas)
> {
> diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
> index 7cc9497b7193..026297265c71 100644
> --- a/drivers/iommu/iommufd/main.c
> +++ b/drivers/iommu/iommufd/main.c
> @@ -229,6 +229,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
> init_waitqueue_head(&ictx->destroy_wait);
> mutex_init(&ictx->sw_msi_lock);
> INIT_LIST_HEAD(&ictx->sw_msi_list);
> + ictx->sw_msi_start = PHYS_ADDR_MAX;
> + ictx->sw_msi_size = 0;
> filp->private_data = ictx;
> return 0;
> }
> @@ -287,6 +289,10 @@ static int iommufd_option(struct iommufd_ucmd *ucmd)
> case IOMMU_OPTION_RLIMIT_MODE:
> rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
> break;
> + case IOMMU_OPTION_SW_MSI_START:
> + case IOMMU_OPTION_SW_MSI_SIZE:
> + rc = iommufd_option_sw_msi(cmd, ucmd->ictx);
> + break;
> case IOMMU_OPTION_HUGE_PAGES:
> rc = iommufd_ioas_option(ucmd);
> break;
Eric
On Wed, Jan 29, 2025 at 02:44:12PM +0100, Eric Auger wrote: > Hi, > > > On 1/11/25 4:32 AM, Nicolin Chen wrote: > > For systems that require MSI pages to be mapped into the IOMMU translation > > the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default > > recommended IOVA window to place these mappings. However, there is nothing > > special about this address. And to support the RMR trick in VMM for nested > well at least it shall not overlap VMM's RAM. So it was not random either. > > translation, the VMM needs to know what sw_msi window the kernel is using. > > As there is no particular reason to force VMM to adopt the kernel default, > > provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use > > to directly specify the sw_msi window that it wants to use, which replaces > > and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having > > to build an API to discover the default IOMMU_RESV_SW_MSI. > IIUC the MSI window will then be different when using legacy VFIO > assignment and iommufd backend. ? They use the same, iommufd can have userspace override it. Then it will ignore the reserved region. > MSI reserved regions are exposed in > /sys/kernel/iommu_groups/<n>/reserved_regions > 0x0000000008000000 0x00000000080fffff msi > Is that configurability reflected accordingly? ? Nothing using iommufd should parse that sysfs file. > How do you make sure it does not collide with other resv regions? I > don't see any check here. Yes this does need to be checked, it does look missing. It still needs to create a reserved region in the ioas when attaching to keep the areas safe and it has to intersect with the incoming reserved regions from the driver. > > + * @IOMMU_OPTION_SW_MSI_START: > > + * Change the base address of the IOMMU mapping region for MSI doorbell(s). > > + * It must be set this before attaching a device to an IOAS/HWPT, otherwise > > + * this option will be not effective on that IOAS/HWPT. User can choose to > > + * let kernel pick a base address, by simply ignoring this option or setting > > + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0 > I think we should document it cannot be put at a random place either. It can be put at any place a map can be placed. That also needs to be checked when creating a domain, it can't be outside the geometry. Jason
On Wed, Jan 29, 2025 at 10:58:00AM -0400, Jason Gunthorpe wrote: > On Wed, Jan 29, 2025 at 02:44:12PM +0100, Eric Auger wrote: > > On 1/11/25 4:32 AM, Nicolin Chen wrote: > > > For systems that require MSI pages to be mapped into the IOMMU translation > > > the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default > > > recommended IOVA window to place these mappings. However, there is nothing > > > special about this address. And to support the RMR trick in VMM for nested > > well at least it shall not overlap VMM's RAM. So it was not random either. > > > translation, the VMM needs to know what sw_msi window the kernel is using. > > > As there is no particular reason to force VMM to adopt the kernel default, > > > provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use > > > to directly specify the sw_msi window that it wants to use, which replaces > > > and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having > > > to build an API to discover the default IOMMU_RESV_SW_MSI. > > IIUC the MSI window will then be different when using legacy VFIO > > assignment and iommufd backend. > > ? They use the same, iommufd can have userspace override it. Then it > will ignore the reserved region. > > > MSI reserved regions are exposed in > > /sys/kernel/iommu_groups/<n>/reserved_regions > > 0x0000000008000000 0x00000000080fffff msi > > > Is that configurability reflected accordingly? > > ? > > Nothing using iommufd should parse that sysfs file. > > > How do you make sure it does not collide with other resv regions? I > > don't see any check here. > > Yes this does need to be checked, it does look missing. It still needs > to create a reserved region in the ioas when attaching to keep the > areas safe and it has to intersect with the incoming reserved > regions from the driver. Yea, I found iopt_reserve_iova() is actually missed entirely... While fixing this, I see a way to turn the OPTIONs back to per- idev, if you still prefer them to be per-idev(?). Then, we can check a given input in the set_option() against the device's reserved region list from the driver, prior to device attaching to any HWPT. Otherwise, we just rely on iopt_enforce_device_reserve_region() during an attach, keeping the option global to simplify VMMs. Thanks Nicolin
On Thu, Feb 06, 2025 at 08:26:05PM -0800, Nicolin Chen wrote: > Yea, I found iopt_reserve_iova() is actually missed entirely... > > While fixing this, I see a way to turn the OPTIONs back to per- > idev, if you still prefer them to be per-idev(?). Then, we can > check a given input in the set_option() against the device's > reserved region list from the driver, prior to device attaching > to any HWPT. I didn't have a strong opinion, if the idev works without complexity then I'd stick with that on the basis of narrower scope is usually better. Jason
On Fri, Feb 07, 2025 at 10:30:20AM -0400, Jason Gunthorpe wrote: > On Thu, Feb 06, 2025 at 08:26:05PM -0800, Nicolin Chen wrote: > > Yea, I found iopt_reserve_iova() is actually missed entirely... > > > > While fixing this, I see a way to turn the OPTIONs back to per- > > idev, if you still prefer them to be per-idev(?). Then, we can > > check a given input in the set_option() against the device's > > reserved region list from the driver, prior to device attaching > > to any HWPT. > > I didn't have a strong opinion, if the idev works without complexity > then I'd stick with that on the basis of narrower scope is usually > better. If you make it per-idev then it is also implicitly per-GIC as well since each idev has exactly one GIC This would make it useful as a way to get each ITS page mapped into a single fixed location.. Really hard to use from the VMM though Jason
On Fri, Feb 07, 2025 at 11:28:01AM -0400, Jason Gunthorpe wrote: > On Fri, Feb 07, 2025 at 10:30:20AM -0400, Jason Gunthorpe wrote: > > On Thu, Feb 06, 2025 at 08:26:05PM -0800, Nicolin Chen wrote: > > > Yea, I found iopt_reserve_iova() is actually missed entirely... > > > > > > While fixing this, I see a way to turn the OPTIONs back to per- > > > idev, if you still prefer them to be per-idev(?). Then, we can > > > check a given input in the set_option() against the device's > > > reserved region list from the driver, prior to device attaching > > > to any HWPT. > > > > I didn't have a strong opinion, if the idev works without complexity > > then I'd stick with that on the basis of narrower scope is usually > > better. We could forward the "SW_MSI_START" location or vITS's IPA into each vSMMU module in the QEMU, then vSMMU module would initiate SET_OPTION when an iommufd-enabled VFIO device attaching to it. That's how I tested it. And this is one vITS in the VM, so we'd only have one fixed location for all devices. > If you make it per-idev then it is also implicitly per-GIC as well > since each idev has exactly one GIC > > This would make it useful as a way to get each ITS page mapped into a > single fixed location.. Hmm, is it for the approach-2 (i.e. vITS solution)? Do you mean a use case: Multiple vITS pages <=> One pITS page? > Really hard to use from the VMM though I could imagine. The caller initiating a SET_OPTION call in VMM will have to know what vITS page for what device. So, this info has to go through the KVM/IRQ module to get processed and then forwarded to the caller (vSMMU module at this moment).. Thanks Nicolin
On Fri, Feb 07, 2025 at 10:59:48AM -0800, Nicolin Chen wrote: > On Fri, Feb 07, 2025 at 11:28:01AM -0400, Jason Gunthorpe wrote: > > On Fri, Feb 07, 2025 at 10:30:20AM -0400, Jason Gunthorpe wrote: > > > On Thu, Feb 06, 2025 at 08:26:05PM -0800, Nicolin Chen wrote: > > > > Yea, I found iopt_reserve_iova() is actually missed entirely... > > > > > > > > While fixing this, I see a way to turn the OPTIONs back to per- > > > > idev, if you still prefer them to be per-idev(?). Then, we can > > > > check a given input in the set_option() against the device's > > > > reserved region list from the driver, prior to device attaching > > > > to any HWPT. > > > > > > I didn't have a strong opinion, if the idev works without complexity > > > then I'd stick with that on the basis of narrower scope is usually > > > better. On reflection I don't think a per-idev is going to work very well.. Part of the design was to keep track of a bitmap of already mapped pages in the single hpwt that unions all of the devices. If it is per-device then that basic thing doesn't work and it becomes much more complicated > I could imagine. The caller initiating a SET_OPTION call in VMM > will have to know what vITS page for what device. So, this info > has to go through the KVM/IRQ module to get processed and then > forwarded to the caller (vSMMU module at this moment).. Ultimately, as we saw in the other conversation, the qemu command line will need to describe the GIC(s) and all their ITS pages directly, somehow. Jason
On 1/29/25 3:58 PM, Jason Gunthorpe wrote: > On Wed, Jan 29, 2025 at 02:44:12PM +0100, Eric Auger wrote: >> Hi, >> >> >> On 1/11/25 4:32 AM, Nicolin Chen wrote: >>> For systems that require MSI pages to be mapped into the IOMMU translation >>> the IOMMU driver provides an IOMMU_RESV_SW_MSI range, which is the default >>> recommended IOVA window to place these mappings. However, there is nothing >>> special about this address. And to support the RMR trick in VMM for nested >> well at least it shall not overlap VMM's RAM. So it was not random either. >>> translation, the VMM needs to know what sw_msi window the kernel is using. >>> As there is no particular reason to force VMM to adopt the kernel default, >>> provide a simple IOMMU_OPTION_SW_MSI_START/SIZE ioctl that the VMM can use >>> to directly specify the sw_msi window that it wants to use, which replaces >>> and disables the default IOMMU_RESV_SW_MSI from the driver to avoid having >>> to build an API to discover the default IOMMU_RESV_SW_MSI. >> IIUC the MSI window will then be different when using legacy VFIO >> assignment and iommufd backend. > ? They use the same, iommufd can have userspace override it. Then it > will ignore the reserved region. In current arm-smmu-v3.c you have region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, prot, IOMMU_RESV_SW_MSI, GFP_KERNEL); in arm_smmu_get_resv_regions() If you overwrite the default region, don't you need to expose the user defined resv region? > >> MSI reserved regions are exposed in >> /sys/kernel/iommu_groups/<n>/reserved_regions >> 0x0000000008000000 0x00000000080fffff msi > >> Is that configurability reflected accordingly? > ? > > Nothing using iommufd should parse that sysfs file. Right but aren't you still supposed to populate the sysfs files properly. This region must be carved out from the IOVA space, right? > >> How do you make sure it does not collide with other resv regions? I >> don't see any check here. > Yes this does need to be checked, it does look missing. It still needs > to create a reserved region in the ioas when attaching to keep the > areas safe and it has to intersect with the incoming reserved > regions from the driver. > >>> + * @IOMMU_OPTION_SW_MSI_START: >>> + * Change the base address of the IOMMU mapping region for MSI doorbell(s). >>> + * It must be set this before attaching a device to an IOAS/HWPT, otherwise >>> + * this option will be not effective on that IOAS/HWPT. User can choose to >>> + * let kernel pick a base address, by simply ignoring this option or setting >>> + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0 >> I think we should document it cannot be put at a random place either. > It can be put at any place a map can be placed. to me It cannot overlap with guest RAM IPA so userspace needs to be cautious about that Eric > > That also needs to be checked when creating a domain, it can't be > outside the geometry. > > Jason >
On Wed, Jan 29, 2025 at 06:23:33PM +0100, Eric Auger wrote: > >> IIUC the MSI window will then be different when using legacy VFIO > >> assignment and iommufd backend. > > ? They use the same, iommufd can have userspace override it. Then it > > will ignore the reserved region. > In current arm-smmu-v3.c you have > region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, > prot, IOMMU_RESV_SW_MSI, > GFP_KERNEL); > > in arm_smmu_get_resv_regions() > If you overwrite the default region, don't you need to expose the user > defined resv region? If it was overriden inside iommufd then the user told the kernel what range to use to override it. I don't need to go back and report back to userspace information that it already gave to the kernel.. > > Nothing using iommufd should parse that sysfs file. > Right but aren't you still supposed to populate the sysfs files > properly. This region must be carved out from the IOVA space, right? The sysfs shouldn't be changed here based on how iommufd decides to use the iova space. The sysfs reflects the information reported from the driver and sw_msi should be understood as the driver's recommendation when you view it from sysfs. The actual reserved regions in effect for an iommufd object are queried directly in iommufd and do not have a sysfs representation. > >>> + * @IOMMU_OPTION_SW_MSI_START: > >>> + * Change the base address of the IOMMU mapping region for MSI doorbell(s). > >>> + * It must be set this before attaching a device to an IOAS/HWPT, otherwise > >>> + * this option will be not effective on that IOAS/HWPT. User can choose to > >>> + * let kernel pick a base address, by simply ignoring this option or setting > >>> + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0 > >> I think we should document it cannot be put at a random place either. > > It can be put at any place a map can be placed. > to me It cannot overlap with guest RAM IPA so userspace needs to be > cautious about that Yes, userspace needs to manage its own VM memory map to avoid overlaps, but from an API perspective it can be placed anywhere that a map can be placed. Jason
On 1/29/25 6:39 PM, Jason Gunthorpe wrote: > On Wed, Jan 29, 2025 at 06:23:33PM +0100, Eric Auger wrote: >>>> IIUC the MSI window will then be different when using legacy VFIO >>>> assignment and iommufd backend. >>> ? They use the same, iommufd can have userspace override it. Then it >>> will ignore the reserved region. >> In current arm-smmu-v3.c you have >> region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, >> prot, IOMMU_RESV_SW_MSI, >> GFP_KERNEL); >> >> in arm_smmu_get_resv_regions() >> If you overwrite the default region, don't you need to expose the user >> defined resv region? > If it was overriden inside iommufd then the user told the kernel what > range to use to override it. I don't need to go back and report back > to userspace information that it already gave to the kernel.. Looks strange to me because info exposed in sysfs is wrong then. What if someone else relies on this info, either at kernel level through the get_resv_regions callback or from user space. > >>> Nothing using iommufd should parse that sysfs file. >> Right but aren't you still supposed to populate the sysfs files >> properly. This region must be carved out from the IOVA space, right? > The sysfs shouldn't be changed here based on how iommufd decides to > use the iova space. The sysfs reflects the information reported from > the driver and sw_msi should be understood as the driver's > recommendation when you view it from sysfs. > > The actual reserved regions in effect for an iommufd object are > queried directly in iommufd and do not have a sysfs representation. > >>>>> + * @IOMMU_OPTION_SW_MSI_START: >>>>> + * Change the base address of the IOMMU mapping region for MSI doorbell(s). >>>>> + * It must be set this before attaching a device to an IOAS/HWPT, otherwise >>>>> + * this option will be not effective on that IOAS/HWPT. User can choose to >>>>> + * let kernel pick a base address, by simply ignoring this option or setting >>>>> + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id must be 0 >>>> I think we should document it cannot be put at a random place either. >>> It can be put at any place a map can be placed. >> to me It cannot overlap with guest RAM IPA so userspace needs to be >> cautious about that > Yes, userspace needs to manage its own VM memory map to avoid > overlaps, but from an API perspective it can be placed anywhere that a > map can be placed. OK Eric > > Jason >
On Wed, Jan 29, 2025 at 06:49:22PM +0100, Eric Auger wrote: > > If it was overriden inside iommufd then the user told the kernel what > > range to use to override it. I don't need to go back and report back > > to userspace information that it already gave to the kernel.. > > Looks strange to me because info exposed in sysfs is wrong then. What if > someone else relies on this info, either at kernel level through the > get_resv_regions callback or from user space. Nothing else should call get_resv_regions() because VFIO is bound to the device and iommufd owns the domain. We expect some exclusivity here :) sysfs for sw_msi should be understood as reporting the driver recommendation, not anything to do with the current MSI operation of the device. Jason
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Saturday, January 11, 2025 11:32 AM
>
> @@ -294,7 +294,9 @@ struct iommu_ioas_unmap {
>
> /**
> * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
> - * ioctl(IOMMU_OPTION_HUGE_PAGES)
> + * ioctl(IOMMU_OPTION_HUGE_PAGES) and
> + * ioctl(IOMMU_OPTION_SW_MSI_START) and
> + * ioctl(IOMMU_OPTION_SW_MSI_SIZE)
> * @IOMMU_OPTION_RLIMIT_MODE:
> * Change how RLIMIT_MEMLOCK accounting works. The caller must have
> privilege
> * to invoke this. Value 0 (default) is user based accounting, 1 uses process
> @@ -304,10 +306,24 @@ struct iommu_ioas_unmap {
> * iommu mappings. Value 0 disables combining, everything is mapped to
> * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
> * option, the object_id must be the IOAS ID.
> + * @IOMMU_OPTION_SW_MSI_START:
> + * Change the base address of the IOMMU mapping region for MSI
> doorbell(s).
> + * It must be set this before attaching a device to an IOAS/HWPT,
remove 'this'
> otherwise
> + * this option will be not effective on that IOAS/HWPT. User can
Do we want to explicitly check this instead of leaving it no effect
silently?
> choose to
> + * let kernel pick a base address, by simply ignoring this option or setting
> + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id
> must be 0
> + * @IOMMU_OPTION_SW_MSI_SIZE:
> + * Change the size of the IOMMU mapping region for MSI doorbell(s). It
> must
> + * be set this before attaching a device to an IOAS/HWPT, otherwise it
> won't
> + * be effective on that IOAS/HWPT. The value is in MB, and the minimum
> value
> + * is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address
> + * value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id
> must be 0
hmm there is no check on the minimal value and enable the effect
of value 0 in this patch.
> iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
> struct iommufd_hwpt_paging
> *hwpt_paging)
> {
> + struct iommufd_ctx *ictx = idev->ictx;
> int rc;
>
> lockdep_assert_held(&idev->igroup->lock);
>
> + /* Override it with a user-programmed SW_MSI region */
> + if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
> + idev->igroup->sw_msi_start = ictx->sw_msi_start;
> rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
> idev->dev,
> &idev->igroup-
> >sw_msi_start);
what about moving above additions into
iopt_table_enforce_dev_resv_regions() which is all about finding
a sw_msi address and can check the user setting internally?
> diff --git a/drivers/iommu/iommufd/io_pagetable.c
> b/drivers/iommu/iommufd/io_pagetable.c
> index 8a790e597e12..5d7f5ca1eecf 100644
> --- a/drivers/iommu/iommufd/io_pagetable.c
> +++ b/drivers/iommu/iommufd/io_pagetable.c
> @@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct
> io_pagetable *iopt,
> if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
> num_hw_msi++;
> if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
> - *sw_msi_start = resv->start;
> + /* Bypass the driver-defined SW_MSI region, if preset
> */
> + if (*sw_msi_start == PHYS_ADDR_MAX)
> + *sw_msi_start = resv->start;
the code is not about bypass. Instead it's to use the driver-defined
region if user doesn't set it.
On Thu, Jan 23, 2025 at 10:07:13AM +0000, Tian, Kevin wrote:
> > From: Nicolin Chen <nicolinc@nvidia.com>
> > Sent: Saturday, January 11, 2025 11:32 AM
> >
> > @@ -294,7 +294,9 @@ struct iommu_ioas_unmap {
> >
> > /**
> > * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
> > - * ioctl(IOMMU_OPTION_HUGE_PAGES)
> > + * ioctl(IOMMU_OPTION_HUGE_PAGES) and
> > + * ioctl(IOMMU_OPTION_SW_MSI_START) and
> > + * ioctl(IOMMU_OPTION_SW_MSI_SIZE)
> > * @IOMMU_OPTION_RLIMIT_MODE:
> > * Change how RLIMIT_MEMLOCK accounting works. The caller must have
> > privilege
> > * to invoke this. Value 0 (default) is user based accounting, 1 uses process
> > @@ -304,10 +306,24 @@ struct iommu_ioas_unmap {
> > * iommu mappings. Value 0 disables combining, everything is mapped to
> > * PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
> > * option, the object_id must be the IOAS ID.
> > + * @IOMMU_OPTION_SW_MSI_START:
> > + * Change the base address of the IOMMU mapping region for MSI
> > doorbell(s).
> > + * It must be set this before attaching a device to an IOAS/HWPT,
>
> remove 'this'
Ack.
> > otherwise
> > + * this option will be not effective on that IOAS/HWPT. User can
>
> Do we want to explicitly check this instead of leaving it no effect
> silently?
So, the idea here is:
If this option is unset, use the default SW_MSI from the driver
If this option is set, use it over the default SW_MSI from the driver
That's what the following statement "User can choose to let.." means.
> > choose to
> > + * let kernel pick a base address, by simply ignoring this option or setting
> > + * a value 0 to IOMMU_OPTION_SW_MSI_SIZE. Global option, object_id
> > must be 0
> > + * @IOMMU_OPTION_SW_MSI_SIZE:
> > + * Change the size of the IOMMU mapping region for MSI doorbell(s). It
> > must
> > + * be set this before attaching a device to an IOAS/HWPT, otherwise it
> > won't
> > + * be effective on that IOAS/HWPT. The value is in MB, and the minimum
> > value
> > + * is 1 MB. A value 0 (default) will invalidate the MSI doorbell base address
> > + * value set to IOMMU_OPTION_SW_MSI_START. Global option, object_id
> > must be 0
>
> hmm there is no check on the minimal value and enable the effect
> of value 0 in this patch.
Well, it's somewhat enforced by __aligned_u64 since it can't be any
value between 0 (disable) and 1 (minimal)?
And the override code checks "ctx->sw_msi_size".
> > iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
> > struct iommufd_hwpt_paging
> > *hwpt_paging)
> > {
> > + struct iommufd_ctx *ictx = idev->ictx;
> > int rc;
> >
> > lockdep_assert_held(&idev->igroup->lock);
> >
> > + /* Override it with a user-programmed SW_MSI region */
> > + if (ictx->sw_msi_size && ictx->sw_msi_start != PHYS_ADDR_MAX)
> > + idev->igroup->sw_msi_start = ictx->sw_msi_start;
> > rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
> > idev->dev,
> > &idev->igroup-
> > >sw_msi_start);
>
> what about moving above additions into
> iopt_table_enforce_dev_resv_regions() which is all about finding
> a sw_msi address and can check the user setting internally?
We could. Probably would be cleaner by doing that in one place.
> > diff --git a/drivers/iommu/iommufd/io_pagetable.c
> > b/drivers/iommu/iommufd/io_pagetable.c
> > index 8a790e597e12..5d7f5ca1eecf 100644
> > --- a/drivers/iommu/iommufd/io_pagetable.c
> > +++ b/drivers/iommu/iommufd/io_pagetable.c
> > @@ -1446,7 +1446,9 @@ int iopt_table_enforce_dev_resv_regions(struct
> > io_pagetable *iopt,
> > if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
> > num_hw_msi++;
> > if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
> > - *sw_msi_start = resv->start;
> > + /* Bypass the driver-defined SW_MSI region, if preset
> > */
> > + if (*sw_msi_start == PHYS_ADDR_MAX)
> > + *sw_msi_start = resv->start;
>
> the code is not about bypass. Instead it's to use the driver-defined
> region if user doesn't set it.
Ack:
/* If being unset, Use the default IOMMU_RESV_SW_MSI */
Thanks
Nicolin
© 2016 - 2026 Red Hat, Inc.