Add a new iommufd_viommu core structure to represent a vIOMMU instance in
the user space, typically backed by a HW-accelerated feature of an IOMMU,
e.g. NVIDIA CMDQ-Virtualization (an ARM SMMUv3 extension) and AMD Hardware
Accelerated Virtualized IOMMU (vIOMMU).
Define a new iommufd_viommu_ops to hold the free op and any future op.
A driver should embed this core structure in its driver viommu structure
and call the new iommufd_viommu_alloc() helper to allocate a core/driver
structure bundle and fill its core viommu->ops:
    struct my_driver_viommu {
        struct iommufd_viommu core;
        ....
    };

    static const struct iommufd_viommu_ops my_driver_viommu_ops = {
        .free = my_driver_viommu_free,
    };

    struct my_driver_viommu *my_viommu =
        iommufd_viommu_alloc(my_driver_viommu, core);
    my_viommu->core.ops = &my_driver_viommu_ops;
Lastly, add viommu_alloc to iommu_ops so iommufd can link to the driver to
allocate a viommu object.
Any future viommu op will be added to the new struct iommufd_viommu_ops.
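For illustration only, a driver could then wire up its viommu_alloc op
roughly as below; the my_driver_* and IOMMU_VIOMMU_TYPE_MY_DRIVER names are
placeholders for this sketch, not definitions added by this patch:

    static struct iommufd_viommu *
    my_driver_viommu_alloc(struct device *dev, unsigned int viommu_type,
                           struct iommu_domain *domain)
    {
        struct my_driver_viommu *my_viommu;

        if (viommu_type != IOMMU_VIOMMU_TYPE_MY_DRIVER)
            return ERR_PTR(-EOPNOTSUPP);

        /* Bundled allocation of the core object and the driver wrapper */
        my_viommu = iommufd_viommu_alloc(my_driver_viommu, core);
        if (!my_viommu)
            return ERR_PTR(-ENOMEM);
        my_viommu->core.ops = &my_driver_viommu_ops;

        /* ... record whatever the driver needs from @dev and @domain ... */
        return &my_viommu->core;
    }

    static const struct iommu_ops my_driver_iommu_ops = {
        ....
        .viommu_alloc = my_driver_viommu_alloc,
    };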
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/viommu.c |  2 ++
 include/linux/iommu.h          | 15 ++++++++++++++
 include/linux/iommufd.h        | 38 ++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+)
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index f77d6972d552..3886b1dd1f13 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -17,3 +17,5 @@
 			return NULL;                                           \
 		return container_of(obj, struct iommufd_##name, obj);         \
 	}
+
+viommu_struct_alloc(viommu);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2e925b5eba53..4d4461146288 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -145,6 +145,8 @@ struct iopf_queue {
 	struct mutex lock;
 };
 
+struct iommufd_viommu;
+
 /* iommu fault flags */
 #define IOMMU_FAULT_READ	0x0
 #define IOMMU_FAULT_WRITE	0x1
@@ -527,6 +529,14 @@ static inline int __iommu_copy_struct_from_user_array(
  * @of_xlate: add OF master IDs to iommu grouping
  * @is_attach_deferred: Check if domain attach should be deferred from iommu
  *                      driver init to device driver init (default no)
+ * @viommu_alloc: Allocate an iommufd_viommu as a user space IOMMU instance,
+ *                associated to a nested parent @domain, for iommu-specific
+ *                hardware acceleration. The @viommu_type must be defined in
+ *                the include/uapi/linux/iommufd.h header.
+ *                It is suggested to call the iommufd_viommu_alloc() helper for
+ *                a bundled allocation of the core and the driver structure,
+ *                and a driver in general should assign an iommufd_viommu_ops
+ *                to the core structure's viommu->ops.
  * @dev_enable/disable_feat: per device entries to enable/disable
  *                           iommu specific features.
  * @page_response: handle page request response
@@ -570,6 +580,11 @@ struct iommu_ops {
 	int (*of_xlate)(struct device *dev, const struct of_phandle_args *args);
 	bool (*is_attach_deferred)(struct device *dev);
 
+	/* User space instance allocation by the iommu driver */
+	struct iommufd_viommu *(*viommu_alloc)(struct device *dev,
+					       unsigned int viommu_type,
+					       struct iommu_domain *domain);
+
 	/* Per device IOMMU features */
 	int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f);
 	int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f);
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index a0cb08a4b653..650acfac307a 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -16,6 +16,7 @@ struct iommufd_device;
 struct page;
 struct iommufd_ctx;
 struct iommufd_access;
+struct iommufd_hwpt_paging;
 struct file;
 struct iommu_group;
@@ -77,6 +78,26 @@ void iommufd_access_detach(struct iommufd_access *access);
 
 void iommufd_ctx_get(struct iommufd_ctx *ictx);
 
+struct iommufd_viommu {
+	struct iommufd_object obj;
+	struct iommufd_ctx *ictx;
+	struct iommu_device *iommu_dev;
+	struct iommufd_hwpt_paging *hwpt;
+
+	const struct iommufd_viommu_ops *ops;
+
+	unsigned int type;
+};
+
+/**
+ * struct iommufd_viommu_ops - viommu specific operations
+ * @free: Free all driver-specific parts of an iommufd_viommu. The memory
+ *        of the entire viommu will be free-ed by iommufd core
+ */
+struct iommufd_viommu_ops {
+	void (*free)(struct iommufd_viommu *viommu);
+};
+
 #if IS_ENABLED(CONFIG_IOMMUFD)
 struct iommufd_ctx *iommufd_ctx_from_file(struct file *file);
 struct iommufd_ctx *iommufd_ctx_from_fd(int fd);
@@ -93,6 +114,8 @@ int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
 int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id);
 int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx);
 int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx);
+
+struct iommufd_viommu *_iommufd_viommu_alloc(size_t size);
 #else /* !CONFIG_IOMMUFD */
 static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
 {
@@ -133,5 +156,20 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
 {
 	return -EOPNOTSUPP;
 }
+
+static inline struct iommufd_viommu *_iommufd_viommu_alloc(size_t size)
+{
+	return NULL;
+}
 #endif /* CONFIG_IOMMUFD */
+
+/*
+ * Helpers for IOMMU driver to allocate driver structures that will be freed by
+ * the iommufd core. A driver is only responsible for its own struct cleanup.
+ */
+#define iommufd_viommu_alloc(drv_struct, member)                               \
+	container_of(_iommufd_viommu_alloc(sizeof(struct drv_struct) +        \
+					   BUILD_BUG_ON_ZERO(offsetof(         \
+						   struct drv_struct, member))), \
+		     struct drv_struct, member)
 #endif
--
2.43.0
On Fri, Apr 12, 2024 at 08:47:01PM -0700, Nicolin Chen wrote:
> Add a new iommufd_viommu core structure to represent a vIOMMU instance in
> the user space, typically backed by a HW-accelerated feature of an IOMMU,
> e.g. NVIDIA CMDQ-Virtualization (an ARM SMMUv3 extension) and AMD Hardware
> Accelerated Virtualized IOMMU (vIOMMU).
I expect this will also be the only way to pass in an associated KVM,
userspace would supply the kvm when creating the viommu.
The tricky bit of this flow is how to manage the S2. It is necessary
that the S2 be linked to the viommu:
1) ARM BTM requires the VMID to be shared with KVM
2) AMD and others need the S2 translation because some of the HW
acceleration is done inside the guest address space
I haven't looked closely at AMD but presumably the VIOMMU create will
have to install the S2 into a DID or something?
So we need the S2 to exist before the VIOMMU is created, but the
drivers are going to need some more fixing before that will fully
work.
Does the nesting domain create need the viommu as well (in place of
the S2 hwpt)? That feels sort of natural.
There is still a lot of fixing before everything can work fully, but
do we need to make some preparations here in the uapi? Like starting
to thread the S2 through it as I described?
Kevin, does Intel foresee any viommu needs on current/future Intel HW?
I assume you are thinking about invalidation queue bypass like
everyone else. I think it is an essential feature for vSVA.
> A driver should embed this core structure in its driver viommu structure
> and call the new iommufd_viommu_alloc() helper to allocate a core/driver
> structure bundle and fill its core viommu->ops:
> struct my_driver_viommu {
> struct iommufd_viommu core;
> ....
> };
>
> static const struct iommufd_viommu_ops my_driver_viommu_ops = {
> .free = my_driver_viommu_free,
> };
>
> struct my_driver_viommu *my_viommu =
> iommufd_viommu_alloc(my_driver_viommu, core);
Why don't we have an ictx here anyhow? The caller has it? Just pass it
down and then it is normal:
my_viommu = iommufd_object_alloc_elm(ictx, my_viommu, IOMMUFD_OBJ_HWPT_VIOMMU, core.obj);
And abort works properly for error handling.
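For instance (just sketching the idea, not the actual helper in the
series), the bundling macro could take the ictx and pass it through:

#define iommufd_viommu_alloc(ictx, drv_struct, member)                      \
	container_of(_iommufd_viommu_alloc(ictx,                            \
					   sizeof(struct drv_struct) +      \
					   BUILD_BUG_ON_ZERO(offsetof(      \
						   struct drv_struct,       \
						   member))),               \
		     struct drv_struct, member)

where _iommufd_viommu_alloc(ictx, size) would be a hypothetical variant
that allocates the object against the ictx, so the ID assignment and the
abort path come for free.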
Jason
On Sun, May 12, 2024 at 11:03:53AM -0300, Jason Gunthorpe wrote:
> On Fri, Apr 12, 2024 at 08:47:01PM -0700, Nicolin Chen wrote:
> > Add a new iommufd_viommu core structure to represent a vIOMMU instance in
> > the user space, typically backed by a HW-accelerated feature of an IOMMU,
> > e.g. NVIDIA CMDQ-Virtualization (an ARM SMMUv3 extension) and AMD Hardware
> > Accelerated Virtualized IOMMU (vIOMMU).
>
> I expect this will also be the only way to pass in an associated KVM,
> userspace would supply the kvm when creating the viommu.
>
> The tricky bit of this flow is how to manage the S2. It is necessary
> that the S2 be linked to the viommu:
>
> 1) ARM BTM requires the VMID to be shared with KVM
> 2) AMD and others need the S2 translation because some of the HW
> acceleration is done inside the guest address space
>
> I haven't looked closely at AMD but presumably the VIOMMU create will
> have to install the S2 into a DID or something?
>
> So we need the S2 to exist before the VIOMMU is created, but the
> drivers are going to need some more fixing before that will fully
> work.
>
> Does the nesting domain create need the viommu as well (in place of
> the S2 hwpt)? That feels sort of natural.
Yes, I had a similar thought initially: each viommu is backed by
a nested IOMMU HW, and a special HW accelerator like VCMDQ could
be treated as an extension on top of that. It might not be as
straightforward as the current design having vintf<->viommu and
vcmdq <-> vqueue though...
In that case, we can then support viommu_cache_invalidate, which
is quite natural for SMMUv3. Yet, I recall Kevin said that VT-d
doesn't want or need that.
> There is still a lot of fixing before everything can work fully, but
> do we need to make some preperations here in the uapi? Like starting
> to thread the S2 through it as I described?
>
> Kevin, does Intel forsee any viommu needs on current/future Intel HW?
> I assume you are thinking about invalidation queue bypass like
> everyone else. I think it is an essential feature for vSVA.
>
> > A driver should embed this core structure in its driver viommu structure
> > and call the new iommufd_viommu_alloc() helper to allocate a core/driver
> > structure bundle and fill its core viommu->ops:
> > struct my_driver_viommu {
> > struct iommufd_viommu core;
> > ....
> > };
> >
> > static const struct iommufd_viommu_ops my_driver_viommu_ops = {
> > .free = my_driver_viommu_free,
> > };
> >
> > struct my_driver_viommu *my_viommu =
> > iommufd_viommu_alloc(my_driver_viommu, core);
>
> Why don't we have an ictx here anyhow? The caller has it? Just pass it
> down and then it is normal:
>
> my_viommu = iommufd_object_alloc_elm(ictx, my_viommu, IOMMUFD_OBJ_HWPT_VIOMMU, core.obj);
Oh, in that case, we probably don't need a level-3 obj allocator
that was previously missing an ictx to allocate an obj->id.
Thanks
Nicolin
On Sun, May 12, 2024 at 08:34:02PM -0700, Nicolin Chen wrote:
> On Sun, May 12, 2024 at 11:03:53AM -0300, Jason Gunthorpe wrote:
> > On Fri, Apr 12, 2024 at 08:47:01PM -0700, Nicolin Chen wrote:
> > > Add a new iommufd_viommu core structure to represent a vIOMMU instance in
> > > the user space, typically backed by a HW-accelerated feature of an IOMMU,
> > > e.g. NVIDIA CMDQ-Virtualization (an ARM SMMUv3 extension) and AMD Hardware
> > > Accelerated Virtualized IOMMU (vIOMMU).
> >
> > I expect this will also be the only way to pass in an associated KVM,
> > userspace would supply the kvm when creating the viommu.
> >
> > The tricky bit of this flow is how to manage the S2. It is necessary
> > that the S2 be linked to the viommu:
> >
> > 1) ARM BTM requires the VMID to be shared with KVM
> > 2) AMD and others need the S2 translation because some of the HW
> >    acceleration is done inside the guest address space
> >
> > I haven't looked closely at AMD but presumably the VIOMMU create will
> > have to install the S2 into a DID or something?
> >
> > So we need the S2 to exist before the VIOMMU is created, but the
> > drivers are going to need some more fixing before that will fully
> > work.
> >
> > Does the nesting domain create need the viommu as well (in place of
> > the S2 hwpt)? That feels sort of natural.
>
> Yes, I had a similar thought initially: each viommu is backed by
> a nested IOMMU HW, and a special HW accelerator like VCMDQ could
> be treated as an extension on top of that. It might not be very
> straightforward like the current design having vintf<->viommu and
> vcmdq <-> vqueue though...

vqueue should be considered a sub object of the viommu and hold a
refcount on the viommu object for its lifetime.

> In that case, we can then support viommu_cache_invalidate, which
> is quite natural for SMMUv3. Yet, I recall Kevin said that VT-d
> doesn't want or need that.

Right, Intel currently doesn't need it, but I feel like everyone will
need this eventually as the fast invalidation path is quite important.

Jason
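To make the sub-object idea concrete, a rough sketch; every name here is
hypothetical, and the users refcount on the core object is assumed rather
than defined by the posted patch:

struct my_driver_vqueue {
	struct iommufd_object obj;
	struct iommufd_viommu *viommu;	/* parent, pinned while the vqueue lives */
	/* ... queue base, length, producer/consumer indexes ... */
};

static void my_driver_vqueue_bind(struct my_driver_vqueue *vqueue,
				  struct iommufd_viommu *viommu)
{
	/* Take a reference so the viommu cannot be destroyed underneath us */
	refcount_inc(&viommu->obj.users);
	vqueue->viommu = viommu;
}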
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Tuesday, May 14, 2024 11:56 PM
>
> On Sun, May 12, 2024 at 08:34:02PM -0700, Nicolin Chen wrote:
> > On Sun, May 12, 2024 at 11:03:53AM -0300, Jason Gunthorpe wrote:
> > > On Fri, Apr 12, 2024 at 08:47:01PM -0700, Nicolin Chen wrote:
> > > > Add a new iommufd_viommu core structure to represent a vIOMMU
> instance in
> > > > the user space, typically backed by a HW-accelerated feature of an
> IOMMU,
> > > > e.g. NVIDIA CMDQ-Virtualization (an ARM SMMUv3 extension) and
> AMD Hardware
> > > > Accelerated Virtualized IOMMU (vIOMMU).
> > >
> > > I expect this will also be the only way to pass in an associated KVM,
> > > userspace would supply the kvm when creating the viommu.
> > >
> > > The tricky bit of this flow is how to manage the S2. It is necessary
> > > that the S2 be linked to the viommu:
> > >
> > > 1) ARM BTM requires the VMID to be shared with KVM
> > > 2) AMD and others need the S2 translation because some of the HW
> > > acceleration is done inside the guest address space
> > >
> > > I haven't looked closely at AMD but presumably the VIOMMU create will
> > > have to install the S2 into a DID or something?
> > >
> > > So we need the S2 to exist before the VIOMMU is created, but the
> > > drivers are going to need some more fixing before that will fully
> > > work.

Can you elaborate on this point? VIOMMU is a dummy container when
it's created and the association to S2 comes relevant only until when
VQUEUE is created inside and linked to a device? then there should be
a window in between allowing the userspace to configure S2.

Not saying against setting S2 up before vIOMMU creation. Just want
to better understand the rationale here.

> > >
> > > Does the nesting domain create need the viommu as well (in place of
> > > the S2 hwpt)? That feels sort of natural.
> >
> > Yes, I had a similar thought initially: each viommu is backed by
> > a nested IOMMU HW, and a special HW accelerator like VCMDQ could
> > be treated as an extension on top of that. It might not be very
> > straightforward like the current design having vintf<->viommu and
> > vcmdq <-> vqueue though...
>
> vqueue should be considered a sub object of the viommu and hold a
> refcount on the viommu object for its lifetime.
>
> > In that case, we can then support viommu_cache_invalidate, which
> > is quite natural for SMMUv3. Yet, I recall Kevin said that VT-d
> > doesn't want or need that.
>
> Right, Intel currently doesn't need it, but I feel like everyone will
> need this eventually as the fast invalidation path is quite important.
>

yes, there is no need but I don't see any harm of preparing for such
extension on VT-d. Logically it's clearer, e.g. if we decide to move
device TLB invalidation to a separate uAPI then vIOMMU is certainly
a clearer object to carry it. and hardware extensions really looks like
optimization on software implementations.

and we do need make a decision now, given if we make vIOMMU as
a generic object for all vendors it may have potential impact on
the user page fault support which Baolu is working on. the so-called
fault object will be contained in vIOMMU, which is software managed
on VT-d/SMMU but passed through on AMD. And probably we don't
need another handle mechanism in the attach path, suppose the
vIOMMU object already contains necessary information to find out
iommufd_object for a reported fault.

Baolu, your thoughts?
On Wed, May 22, 2024 at 08:58:34AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Tuesday, May 14, 2024 11:56 PM
> >
> > On Sun, May 12, 2024 at 08:34:02PM -0700, Nicolin Chen wrote:
> > > On Sun, May 12, 2024 at 11:03:53AM -0300, Jason Gunthorpe wrote:
> > > > On Fri, Apr 12, 2024 at 08:47:01PM -0700, Nicolin Chen wrote:
> > > > > Add a new iommufd_viommu core structure to represent a vIOMMU
> > instance in
> > > > > the user space, typically backed by a HW-accelerated feature of an
> > IOMMU,
> > > > > e.g. NVIDIA CMDQ-Virtualization (an ARM SMMUv3 extension) and
> > AMD Hardware
> > > > > Accelerated Virtualized IOMMU (vIOMMU).
> > > >
> > > > I expect this will also be the only way to pass in an associated KVM,
> > > > userspace would supply the kvm when creating the viommu.
> > > >
> > > > The tricky bit of this flow is how to manage the S2. It is necessary
> > > > that the S2 be linked to the viommu:
> > > >
> > > > 1) ARM BTM requires the VMID to be shared with KVM
> > > > 2) AMD and others need the S2 translation because some of the HW
> > > > acceleration is done inside the guest address space
> > > >
> > > > I haven't looked closely at AMD but presumably the VIOMMU create will
> > > > have to install the S2 into a DID or something?
> > > >
> > > > So we need the S2 to exist before the VIOMMU is created, but the
> > > > drivers are going to need some more fixing before that will fully
> > > > work.
>
> Can you elaborate on this point? VIOMMU is a dummy container when
> it's created and the association to S2 comes relevant only until when
> VQUEUE is created inside and linked to a device?
VIOMMU contains:
 - A nesting parent
 - A KVM
 - Any global per-VM data the driver needs
    * In ARM case this is VMID, sometimes shared with KVM
    * In AMD case this is will allocate memory in the
      "viommu backing storage memory"

Objects can be created on top of a VIOMMU:
 - A nested HWPT (iommu_hwpt_alloc::pt_id can be a viommu)
 - A vqueue (ARM/AMD)
 - Other AMD virtualized objects (EventLog, PPRLog)
It is desirable to keep the VIOMMU linked to only a single nesting
parent that never changes. Given it seems to be a small ask to
allocate the nesting parent before the VIOMMU, providing it at VIOMMU
creation time looks like it will simplify the drivers because they can
rely on it always existing and never changing.
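Sketching that as a structure (the kvm field and the comments below are
speculative additions on top of the struct in the posted patch, not
something the patch defines):

struct iommufd_viommu {
	struct iommufd_object obj;
	struct iommufd_ctx *ictx;
	struct iommu_device *iommu_dev;
	struct iommufd_hwpt_paging *hwpt;	/* fixed nesting parent (S2) */
	struct kvm *kvm;			/* optional, e.g. for ARM BTM */
	const struct iommufd_viommu_ops *ops;
	unsigned int type;
	/* global per-VM driver data lives in the embedding driver structure */
};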
I think this lends itself to a logical layered VMM design..
- If VFIO is being used get an iommufd
- Allocate an IOAS for the entire guest physical
- Determine the vIOMMU driver to use
- Allocate a HWPT for use by all the vIOMMU instances
- Allocate a VIOMMU per vIOMMU instance
On ARM the S2 is not divorced from the VIOMMU, ARM requires a single
VMID, shared with KVM, and localized to a single VM for some of the
bypass features (vBTM, vCMDQ). So to attach a S2 you actually have to
attach the VIOMMU to pick up the correct VMID.
I imagine something like this:
hwpt_alloc(deva, nesting_parent=true) = shared_s2
viommu_alloc(deva, shared_s2) = viommu1
viommu_alloc(devb, shared_s2) = viommu2
hwpt_alloc(deva, viommu1, vste) = deva_vste
hwpt_alloc(devb, viommu2, vste) = devb_vste
attach(deva, deva_vste)
attach(devb, devb_vste)
attach(devc, shared_s2)
The driver will then know it should program three different VMIDs for
the same S2 page table, which matches the ARM expectation for
VMID. That is to say we'd pass in the viommu as the pt_id for the
iommu_hwpt_alloc. The viommu would imply both the S2 page table and
any meta information like VMID the driver needs.
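On the driver side that could end up looking something like this (a sketch
only, with made-up names; the real SMMUv3 code will differ):

static struct iommu_domain *
my_smmu_alloc_nested(struct iommufd_viommu *core, const void *user_vste)
{
	struct my_smmu_viommu *viommu =
		container_of(core, struct my_smmu_viommu, core);
	struct my_smmu_nested_domain *nested;

	nested = kzalloc(sizeof(*nested), GFP_KERNEL);
	if (!nested)
		return ERR_PTR(-ENOMEM);

	/* Both pieces of meta information come from the viommu object */
	nested->s2_parent = viommu->s2_parent;	/* shared S2 page table */
	nested->vmid = viommu->vmid;		/* unique per vSMMU instance */
	/* ... capture the guest STE from user_vste ... */
	return &nested->domain;
}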
Both AMD and the vCMDQ thing need to translate some PFNs through the
S2 and program them elsewhere; this is manually done by SW, and there
are three choices I guess:
- Have the VMM do it and provide void __user * to the driver
- Have the driver do it through the S2 directly and track
S2 invalidations
- Have the driver open an access on the IOAS and use the access unmap
Not sure which is the best..
> > Right, Intel currently doesn't need it, but I feel like everyone will
> > need this eventually as the fast invalidation path is quite important.
>
> yes, there is no need but I don't see any harm of preparing for such
> extension on VT-d. Logically it's clearer, e.g. if we decide to move
> device TLB invalidation to a separate uAPI then vIOMMU is certainly
> a clearer object to carry it. and hardware extensions really looks like
> optimization on software implementations.
>
> and we do need make a decision now, given if we make vIOMMU as
> a generic object for all vendors it may have potential impact on
> the user page fault support which Baolu is working on.
> the so-called
> fault object will be contained in vIOMMU, which is software managed
> on VT-d/SMMU but passed through on AMD.
Hmm, given we currently have no known hardware entanglement between
PRI and VIOMMU, it does seem OK for PRI to just exist separately for
now. If someone needs them linked someday we can add a viommu_id to
the create pri queue command.
> And probably we don't need another handle mechanism in the attach
> path, suppose the vIOMMU object already contains necessary
> information to find out iommufd_object for a reported fault.
The viommu might be useful to have the kernel return the vRID instead
of the dev_id in the fault messages. I'm not sure how valuable this
is..
Jason
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Wednesday, May 22, 2024 9:39 PM
>
> On Wed, May 22, 2024 at 08:58:34AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Tuesday, May 14, 2024 11:56 PM
> > >
> > > > > So we need the S2 to exist before the VIOMMU is created, but the
> > > > > drivers are going to need some more fixing before that will fully
> > > > > work.
> >
> > Can you elaborate on this point? VIOMMU is a dummy container when
> > it's created and the association to S2 comes relevant only until when
> > VQUEUE is created inside and linked to a device?
>
> VIOMMU contains:
> - A nesting parent
> - A KVM
> - Any global per-VM data the driver needs
> * In ARM case this is VMID, sometimes shared with KVM

In which case is it not shared with KVM? I had the impression that
VMID always comes from KVM in this VCMDQ usage. 😊

> * In AMD case this is will allocate memory in the
> "viommu backing storage memory"
>
> Objects can be created on top of a VIOMMU:
> - A nested HWPT (iommu_hwpt_alloc::pt_id can be a viommu)
> - A vqueue (ARM/AMD)
> - Other AMD virtualized objects (EventLog, PPRLog)
>
> It is desirable to keep the VIOMMU linked to only a single nesting
> parent that never changes. Given it seems to be a small ask to
> allocate the nesting parent before the VIOMMU providing it at VIOMMU
> creation time looks like it will simplify the drivers because they can
> rely on it always existing and never changing.

yes, this makes sense.

>
> I think this lends itself to a logical layered VMM design..
>
> - If VFIO is being used get an iommufd
> - Allocate an IOAS for the entire guest physical
> - Determine the vIOMMU driver to use
> - Allocate a HWPT for use by all the vIOMMU instances
> - Allocate a VIOMMU per vIOMMU instance
>
> On ARM the S2 is not divorced from the VIOMMU, ARM requires a single
> VMID, shared with KVM, and localized to a single VM for some of the
> bypass features (vBTM, vCMDQ). So to attach a S2 you actually have to
> attach the VIOMMU to pick up the correct VMID.
>
> I imagine something like this:
> hwpt_alloc(deva, nesting_parent=true) = shared_s2
> viommu_alloc(deva, shared_s2) = viommu1
> viommu_alloc(devb, shared_s2) = viommu2
> hwpt_alloc(deva, viommu1, vste) = deva_vste
> hwpt_alloc(devb, viommu2, vste) = devb_vste
> attach(deva, deva_vste)
> attach(devb, devb_vste)
> attach(devc, shared_s2)

I wonder whether we want to make viommu as the 1st-class citizen
for any nested hwpt if it is desirable to enable it even for VT-d which
lacks of a hw viommu concept at the moment.

>
> The driver will then know it should program three different VMIDs for
> the same S2 page table, which matches the ARM expectation for
> VMID. That is to say we'd pass in the viommu as the pt_id for the
> iommu_hwpt_alloc. The viommu would imply both the S2 page table and
> any meta information like VMID the driver needs.

Can you elaborate the aspect about "three different VMIDs"? They are
all for the same VM hence sharing the same VMID per the earlier
description. This is also echo-ed in patch14:

tegra241_cmdqv_viommu_alloc()
	vintf->vmid = smmu_domain->vmid;

>
> Both AMD and the vCMDQ thing need to translate some PFNs through the
> S2 and program them elsewhere, this is manually done by SW, and there
> are three choices I guess:
> - Have the VMM do it and provide void __user * to the driver

this sounds redundant to what S2 already provides

> - Have the driver do it through the S2 directly and track
> S2 invalidations

this makes more sense to me. Just like the driver already needs to track
S2 invalidations to flush any nested cache related to the affected S2 range.

> - Have the driver open an access on the IOAS and use the access unmap
>

it requires adding more iommufd awareness into the iommu driver. I'm
inclined to do it only at minimal necessity.

> Not sure which is the best..
>
> > > Right, Intel currently doesn't need it, but I feel like everyone will
> > > need this eventually as the fast invalidation path is quite important.
> >
> > yes, there is no need but I don't see any harm of preparing for such
> > extension on VT-d. Logically it's clearer, e.g. if we decide to move
> > device TLB invalidation to a separate uAPI then vIOMMU is certainly
> > a clearer object to carry it. and hardware extensions really looks like
> > optimization on software implementations.
> >
> > and we do need make a decision now, given if we make vIOMMU as
> > a generic object for all vendors it may have potential impact on
> > the user page fault support which Baolu is working on. the so-called
> > fault object will be contained in vIOMMU, which is software managed
> > on VT-d/SMMU but passed through on AMD.
>
> Hmm, given we currently have no known hardware entanglement between
> PRI and VIOMMU it does seem OK for PRI to just exist seperate for

Isn't AMD vPPRLog for directly sending PRI request into the guest?

> now. If someone needs them linked someday we can add a viommu_id to
> the create pri queue command.

I'm more worried about the potential conflict between the vqueue
object here and the fault queue object in Baolu's series, if we want
to introduce vIOMMU concept to platforms which lack of the hw
support. In that case both objects are software constructs to deliver
faults.
On Thu, May 23, 2024 at 01:43:45AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, May 22, 2024 9:39 PM
> >
> > On Wed, May 22, 2024 at 08:58:34AM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > Sent: Tuesday, May 14, 2024 11:56 PM
> > > >
> > > > > > So we need the S2 to exist before the VIOMMU is created, but the
> > > > > > drivers are going to need some more fixing before that will fully
> > > > > > work.
> > >
> > > Can you elaborate on this point? VIOMMU is a dummy container when
> > > it's created and the association to S2 comes relevant only until when
> > > VQUEUE is created inside and linked to a device?
> >
> > VIOMMU contains:
> > - A nesting parent
> > - A KVM
> > - Any global per-VM data the driver needs
> > * In ARM case this is VMID, sometimes shared with KVM
>
> In which case is it not shared with KVM? I had the impression that
> VMID always comes from KVM in this VCMDQ usage. 😊

Not quite, only vBTM needs it to be shared with KVM because the CPU
will forward the KVM VMID to the SMMU during invalidations.

Everything else in the nesting space (including vCMDQ) just needs the
VMID to be unique to the VM since it scopes the ASID that is stored in
guest table.s

For non-nesting cases (ie no viommu) the VMID can be unique to the S2.

> > On ARM the S2 is not divorced from the VIOMMU, ARM requires a single
> > VMID, shared with KVM, and localized to a single VM for some of the
> > bypass features (vBTM, vCMDQ). So to attach a S2 you actually have to
> > attach the VIOMMU to pick up the correct VMID.
> >
> > I imagine something like this:
> > hwpt_alloc(deva, nesting_parent=true) = shared_s2
> > viommu_alloc(deva, shared_s2) = viommu1
> > viommu_alloc(devb, shared_s2) = viommu2
> > hwpt_alloc(deva, viommu1, vste) = deva_vste
> > hwpt_alloc(devb, viommu2, vste) = devb_vste
> > attach(deva, deva_vste)
> > attach(devb, devb_vste)
> > attach(devc, shared_s2)
>
> I wonder whether we want to make viommu as the 1st-class citizen
> for any nested hwpt if it is desirable to enable it even for VT-d which
> lacks of a hw viommu concept at the moment.

I think we may as well code it like that, yes. It is easy to get the
S2 out of the viommu and feed that into the intel driver.

> > The driver will then know it should program three different VMIDs for
> > the same S2 page table, which matches the ARM expectation for
> > VMID. That is to say we'd pass in the viommu as the pt_id for the
> > iommu_hwpt_alloc. The viommu would imply both the S2 page table and
> > any meta information like VMID the driver needs.
>
> Can you elaborate the aspect about "three different VMIDs"?

In SMMUv3 the cache is tagged by (VMID,ASID) where ASID is completely
controlled by the guest.

Every time the guest observes a SMMUv3 instance it is allowed to
creates its own private ASID number space for that instance. The guest
could re-use ASID #1 on two instances.

So every SMMUv3 instance plugged into the guest needs to have its own
unique VMID so that the overlapping ASID's are disambiguate. The above
would create a VM where:

 deva -> vSMMUv3 #1
 devb -> vSMMUv3 #2
 devc -> No IOMMU

> tegra241_cmdqv_viommu_alloc()
> 	vintf->vmid = smmu_domain->vmid;

And same here the 'vintf' is processing ASIDs directly from the guest,
it needs to understand the VMID they are scoped under. Every vSMMUv3
instance in the guest must get a VMID, vintf, and vCMDQ all to its own.
The above needs revising in a viommu world.

> > Both AMD and the vCMDQ thing need to translate some PFNs through the
> > S2 and program them elsewhere, this is manually done by SW, and there
> > are three choices I guess:
> > - Have the VMM do it and provide void __user * to the driver
>
> this sounds redundant to what S2 already provides

Yes, but we don't have to track invalidations here..

> > - Have the driver do it through the S2 directly and track
> > S2 invalidations
>
> this makes more sense to me. Just like the driver already needs to track
> S2 invalidations to flush any nested cache related to the affected S2 range.

I'm a bit worried about 'track invalidations' though..

> > - Have the driver open an access on the IOAS and use the access unmap
>
> it requires adding more iommufd awareness into the iommu driver. I'm
> inclined to do it only at minimal necessity.

Yes, it is certainly annoying because of the modular/builtin problem.

> > Hmm, given we currently have no known hardware entanglement between
> > PRI and VIOMMU it does seem OK for PRI to just exist seperate for
>
> Isn't AMD vPPRLog for directly sending PRI request into the guest?

I think it is, but that would be a vQUEUE on the VIOMMU not adding a
VIOMMU to Lu's patches, which is what I ment.

> > now. If someone needs them linked someday we can add a viommu_id to
> > the create pri queue command.
>
> I'm more worried about the potential conflict between the vqueue
> object here and the fault queue object in Baolu's series, if we want
> to introduce vIOMMU concept to platforms which lack of the hw
> support.

I assume the vPPRLog will steal all the PRI before it reaches the
kernel, so once this is turned on Lu's path won't see anything.

I don't know if AMD can turn vPPRLog on individually or if it is a
whole package once they switch on VIOMMU..

Jason
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, May 23, 2024 8:59 PM
>
> > > now. If someone needs them linked someday we can add a viommu_id to
> > > the create pri queue command.
> >
> > I'm more worried about the potential conflict between the vqueue
> > object here and the fault queue object in Baolu's series, if we want
> > to introduce vIOMMU concept to platforms which lack of the hw
> > support.
>
> I assume the vPPRLog will steal all the PRI before it reaches the
> kernel, so once this is turned on Lu's path won't see anything.
>
> I don't know if AMD can turn vPPRLog on individually or if it is a
> whole package once they switch on VIOMMU..
>

their spec says:

"
This feature is supported if MMIO Offset 0030h[vIommuSup]=1. When enabled,
DTE[vImuEn]=1 indicates the guest owning this device is a vIOMMU enabled
guest, and IOMMU should write event and peripheral page requests associated
with this device into the guest data structures.
"

so it's configured per device.
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, May 23, 2024 8:59 PM
> On Thu, May 23, 2024 at 01:43:45AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Wednesday, May 22, 2024 9:39 PM
> > >
> > > The driver will then know it should program three different VMIDs for
> > > the same S2 page table, which matches the ARM expectation for
> > > VMID. That is to say we'd pass in the viommu as the pt_id for the
> > > iommu_hwpt_alloc. The viommu would imply both the S2 page table and
> > > any meta information like VMID the driver needs.
> >
> > Can you elaborate the aspect about "three different VMIDs"?
>
> In SMMUv3 the cache is tagged by (VMID,ASID) where ASID is completely
> controlled by the guest.
>
> Every time the guest observes a SMMUv3 instance it is allowed to
> creates its own private ASID number space for that instance. The guest
> could re-use ASID #1 on two instances.
>
> So every SMMUv3 instance plugged into the guest needs to have its own
> unique VMID so that the overlapping ASID's are disambiguate. The above
> would create a VM where:
>
> deva -> vSMMUv3 #1
> devb -> vSMMUv3 #2
> devc -> No IOMMU

This assumes that each vSMMUv3 instance has only one ASID space
i.e. the guest cannot create multiple VMID's itself?

> > > Hmm, given we currently have no known hardware entanglement
> between
> > > PRI and VIOMMU it does seem OK for PRI to just exist seperate for
> >
> > Isn't AMD vPPRLog for directly sending PRI request into the guest?
>
> I think it is, but that would be a vQUEUE on the VIOMMU not adding a
> VIOMMU to Lu's patches, which is what I ment.
>
> > > now. If someone needs them linked someday we can add a viommu_id to
> > > the create pri queue command.
> >
> > I'm more worried about the potential conflict between the vqueue
> > object here and the fault queue object in Baolu's series, if we want
> > to introduce vIOMMU concept to platforms which lack of the hw
> > support.
>
> I assume the vPPRLog will steal all the PRI before it reaches the
> kernel, so once this is turned on Lu's path won't see anything.
>

Okay, then we expect this vqueue object only for HW acceleration
while software-based fault logging is still routed via Baolu's work.
On Fri, May 24, 2024 at 02:16:34AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Thursday, May 23, 2024 8:59 PM
> > On Thu, May 23, 2024 at 01:43:45AM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > Sent: Wednesday, May 22, 2024 9:39 PM
> > > >
> > > > The driver will then know it should program three different VMIDs for
> > > > the same S2 page table, which matches the ARM expectation for
> > > > VMID. That is to say we'd pass in the viommu as the pt_id for the
> > > > iommu_hwpt_alloc. The viommu would imply both the S2 page table and
> > > > any meta information like VMID the driver needs.
> > >
> > > Can you elaborate the aspect about "three different VMIDs"?
> >
> > In SMMUv3 the cache is tagged by (VMID,ASID) where ASID is completely
> > controlled by the guest.
> >
> > Every time the guest observes a SMMUv3 instance it is allowed to
> > creates its own private ASID number space for that instance. The guest
> > could re-use ASID #1 on two instances.
> >
> > So every SMMUv3 instance plugged into the guest needs to have its own
> > unique VMID so that the overlapping ASID's are disambiguate. The above
> > would create a VM where:
> >
> > deva -> vSMMUv3 #1
> > devb -> vSMMUv3 #2
> > devc -> No IOMMU
>
> This assumes that each vSMMUv3 instance has only one ASID space
> i.e. the guest cannot create multiple VMID's itself?

Right, the vSMMUv3 in the guest has no support VMID at all.

> > I assume the vPPRLog will steal all the PRI before it reaches the
> > kernel, so once this is turned on Lu's path won't see anything.
>
> Okay, then we expect this vqueue object only for HW acceleration
> while software-based fault logging is still routed via Baolu's work.

Yeah, I think it mirrors the invalidation that way, we have a SW
invalidation path and a HW path. The PRI is standards based so it is
easier to make a generic API for it.

Jason
On Thu, May 23, 2024 at 01:43:45AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@nvidia.com>
> > Sent: Wednesday, May 22, 2024 9:39 PM
> > VIOMMU contains:
> > - A nesting parent
> > - A KVM
> > - Any global per-VM data the driver needs
> > * In ARM case this is VMID, sometimes shared with KVM
>
> In which case is it not shared with KVM? I had the impression that
> VMID always comes from KVM in this VCMDQ usage. 😊
Not actually. I guess that shared VMID is for BTM.
> > On ARM the S2 is not divorced from the VIOMMU, ARM requires a single
> > VMID, shared with KVM, and localized to a single VM for some of the
> > bypass features (vBTM, vCMDQ). So to attach a S2 you actually have to
> > attach the VIOMMU to pick up the correct VMID.
> >
> > I imagine something like this:
> > hwpt_alloc(deva, nesting_parent=true) = shared_s2
> > viommu_alloc(deva, shared_s2) = viommu1
> > viommu_alloc(devb, shared_s2) = viommu2
> > hwpt_alloc(deva, viommu1, vste) = deva_vste
> > hwpt_alloc(devb, viommu2, vste) = devb_vste
> > attach(deva, deva_vste)
> > attach(devb, devb_vste)
> > attach(devc, shared_s2)
>
> I wonder whether we want to make viommu as the 1st-class citizen
> for any nested hwpt if it is desirable to enable it even for VT-d which
> lacks of a hw viommu concept at the moment.
I think Jason is completely using SMMU as an example here.
Also FWIW, I am trying a core-allocated core-managed viommu for
IOMMU_VIOMMU_TYPE_DEFAULT. So VT-d driver doesn't need to hold
a viommu while VMM could still allocate one if it wants. And the
VIOMMU interface can provide some helpers if driver wants some
info from the core-managed viommu: a virtual dev ID to physical
dev ID (returning device pointer) translation for example. And
we can add more after we brain storm.
Sample change:
@@ -623,6 +625,18 @@ struct iommu_ops {
+ * @viommu_alloc: Allocate an iommufd_viommu associating to a nested parent
+ *                @domain as a user space IOMMU instance for HW-accelerated
+ *                features from the physical IOMMU behind the @dev. The
+ *                @viommu_type must be defined in include/uapi/linux/iommufd.h
+ *                It is suggested to call iommufd_viommu_alloc() helper for
+ *                a bundled allocation of the core and the driver structures,
+ *                using the given @ictx pointer.
+ * @default_viommu_ops: Driver can choose to use a default core-allocated core-
+ *                      managed viommu object by providing a default viommu ops.
+ *                      Otherwise, i.e. for a driver-managed viommu, viommu_ops
+ *                      should be passed in via iommufd_viommu_alloc() helper in
+ *                      its own viommu_alloc op.
[..]
+int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
...
+	if (cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) {
+		viommu = __iommufd_viommu_alloc(
+			ucmd->ictx, sizeof(*viommu),
+			domain->ops->default_viommu_ops);
+	} else {
+		if (!domain->ops->viommu_alloc) {
+			rc = -EOPNOTSUPP;
+			goto out_put_hwpt;
+		}
+
+		viommu = domain->ops->viommu_alloc(domain, idev->dev,
+						   ucmd->ictx, cmd->type);
+	}
[..]
// Helper:
+struct device *
+iommufd_viommu_find_device(struct iommufd_viommu *viommu, u64 id);
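For example, a driver handling a guest invalidation that names a virtual
device ID might use such a helper like this (everything except the proposed
helper is a made-up name):

static int my_viommu_inval_dev(struct iommufd_viommu *viommu, u64 virt_dev_id)
{
	struct device *dev;

	/* Translate the guest-visible device ID to the physical device */
	dev = iommufd_viommu_find_device(viommu, virt_dev_id);
	if (!dev)
		return -EINVAL;

	/* ... issue the HW invalidation scoped to this device ... */
	return 0;
}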
> > The driver will then know it should program three different VMIDs for
> > the same S2 page table, which matches the ARM expectation for
> > VMID. That is to say we'd pass in the viommu as the pt_id for the
> > iommu_hwpt_alloc. The viommu would imply both the S2 page table and
> > any meta information like VMID the driver needs.
>
> Can you elaborate the aspect about "three different VMIDs"? They are
> all for the same VM hence sharing the same VMID per the earlier
> description. This is also echo-ed in patch14:
>
> tegra241_cmdqv_viommu_alloc()
> vintf->vmid = smmu_domain->vmid;
The design in this series is still the old one, using a 1:1 relationship
between a viommu and an S2 domain. I think the "three" is from
his SMMU example above? Leaving it to Jason to reply though.
> > now. If someone needs them linked someday we can add a viommu_id to
> > the create pri queue command.
>
> I'm more worried about the potential conflict between the vqueue
> object here and the fault queue object in Baolu's series, if we want
> to introduce vIOMMU concept to platforms which lack of the hw
> support.
I actually see one argument about whether we should use a vqueue
v.s. Baolu's fault queue object, and a counter argument about whether
we should also use a vqueue for viommu invalidation v.s. the array-
based invalidation request that we similarly have for HWPT...
Thanks
Nicolin
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Thursday, May 23, 2024 12:02 PM
>
> On Thu, May 23, 2024 at 01:43:45AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > Sent: Wednesday, May 22, 2024 9:39 PM
> > > VIOMMU contains:
> > > - A nesting parent
> > > - A KVM
> > > - Any global per-VM data the driver needs
> > > * In ARM case this is VMID, sometimes shared with KVM
> >
> > In which case is it not shared with KVM? I had the impression that
> > VMID always comes from KVM in this VCMDQ usage. 😊
>
> Not actually. I guess that shared VMID is for BTM.

yes. now that is clear to me after reading more context.

>
> > > On ARM the S2 is not divorced from the VIOMMU, ARM requires a single
> > > VMID, shared with KVM, and localized to a single VM for some of the
> > > bypass features (vBTM, vCMDQ). So to attach a S2 you actually have

but here the description should be only for vBTM too. vCMDQ has its
own VMID policy.

> > > attach the VIOMMU to pick up the correct VMID.
> > >
> > > I imagine something like this:
> > > hwpt_alloc(deva, nesting_parent=true) = shared_s2
> > > viommu_alloc(deva, shared_s2) = viommu1
> > > viommu_alloc(devb, shared_s2) = viommu2
> > > hwpt_alloc(deva, viommu1, vste) = deva_vste
> > > hwpt_alloc(devb, viommu2, vste) = devb_vste
> > > attach(deva, deva_vste)
> > > attach(devb, devb_vste)
> > > attach(devc, shared_s2)
> >
> > I wonder whether we want to make viommu as the 1st-class citizen
> > for any nested hwpt if it is desirable to enable it even for VT-d which
> > lacks of a hw viommu concept at the moment.
>
> I think Jason is completely using SMMU as an example here.
>
> Also FWIW, I am trying a core-allocated core-managed viommu for
> IOMMU_VIOMMU_TYPE_DEFAULT. So VT-d driver doesn't need to hold
> a viommu while VMM could still allocate one if it wants. And the
> VIOMMU interface can provide some helpers if driver wants some
> info from the core-managed viommu: a virtual dev ID to physical
> dev ID (returning device pointer) translation for example. And
> we can add more after we brain storm.

that'd be nice.

> > > The driver will then know it should program three different VMIDs for
> > > the same S2 page table, which matches the ARM expectation for
> > > VMID. That is to say we'd pass in the viommu as the pt_id for the
> > > iommu_hwpt_alloc. The viommu would imply both the S2 page table and
> > > any meta information like VMID the driver needs.
> >
> > Can you elaborate the aspect about "three different VMIDs"? They are
> > all for the same VM hence sharing the same VMID per the earlier
> > description. This is also echo-ed in patch14:
> >
> > tegra241_cmdqv_viommu_alloc()
> > 	vintf->vmid = smmu_domain->vmid;
>
> The design in the series is still old using a 1:1 relationship
> between a viommu and an S2 domain. I think the "three" is from
> his SMMU example above? Leaving it to Jason to reply though.

I'd expect some detailed explanation in the commit msg about how
VCMDQ actually works. It's unclear to me whether this "three different
VMIDs" thing comes from the hw requirement or certain software design
choices.
On 2024/5/22 16:58, Tian, Kevin wrote:
>> From: Jason Gunthorpe<jgg@nvidia.com>
>> Sent: Tuesday, May 14, 2024 11:56 PM
>>
>> On Sun, May 12, 2024 at 08:34:02PM -0700, Nicolin Chen wrote:
>>> On Sun, May 12, 2024 at 11:03:53AM -0300, Jason Gunthorpe wrote:
>>>> On Fri, Apr 12, 2024 at 08:47:01PM -0700, Nicolin Chen wrote:
>>>>> Add a new iommufd_viommu core structure to represent a vIOMMU
>> instance in
>>>>> the user space, typically backed by a HW-accelerated feature of an
>> IOMMU,
>>>>> e.g. NVIDIA CMDQ-Virtualization (an ARM SMMUv3 extension) and
>> AMD Hardware
>>>>> Accelerated Virtualized IOMMU (vIOMMU).
>>>> I expect this will also be the only way to pass in an associated KVM,
>>>> userspace would supply the kvm when creating the viommu.
>>>>
>>>> The tricky bit of this flow is how to manage the S2. It is necessary
>>>> that the S2 be linked to the viommu:
>>>>
>>>> 1) ARM BTM requires the VMID to be shared with KVM
>>>> 2) AMD and others need the S2 translation because some of the HW
>>>> acceleration is done inside the guest address space
>>>>
>>>> I haven't looked closely at AMD but presumably the VIOMMU create will
>>>> have to install the S2 into a DID or something?
>>>>
>>>> So we need the S2 to exist before the VIOMMU is created, but the
>>>> drivers are going to need some more fixing before that will fully
>>>> work.
> Can you elaborate on this point? VIOMMU is a dummy container when
> it's created and the association to S2 comes relevant only until when
> VQUEUE is created inside and linked to a device? then there should be
> a window in between allowing the userspace to configure S2.
>
> Not saying against setting S2 up before vIOMMU creation. Just want
> to better understand the rationale here.
>
>>>> Does the nesting domain create need the viommu as well (in place of
>>>> the S2 hwpt)? That feels sort of natural.
>>> Yes, I had a similar thought initially: each viommu is backed by
>>> a nested IOMMU HW, and a special HW accelerator like VCMDQ could
>>> be treated as an extension on top of that. It might not be very
>>> straightforward like the current design having vintf<->viommu and
>>> vcmdq <-> vqueue though...
>> vqueue should be considered a sub object of the viommu and hold a
>> refcount on the viommu object for its lifetime.
>>
>>> In that case, we can then support viommu_cache_invalidate, which
>>> is quite natural for SMMUv3. Yet, I recall Kevin said that VT-d
>>> doesn't want or need that.
>> Right, Intel currently doesn't need it, but I feel like everyone will
>> need this eventually as the fast invalidation path is quite important.
>>
> yes, there is no need but I don't see any harm of preparing for such
> extension on VT-d. Logically it's clearer, e.g. if we decide to move
> device TLB invalidation to a separate uAPI then vIOMMU is certainly
> a clearer object to carry it. and hardware extensions really looks like
> optimization on software implementations.
>
> and we do need make a decision now, given if we make vIOMMU as
> a generic object for all vendors it may have potential impact on
> the user page fault support which Baolu is working on. the so-called
> fault object will be contained in vIOMMU, which is software managed
> on VT-d/SMMU but passed through on AMD. And probably we don't
> need another handle mechanism in the attach path, suppose the
> vIOMMU object already contains necessary information to find out
> iommufd_object for a reported fault.

Yes, if the vIOMMU object tracks all iommufd devices that it manages.

Best regards,
baolu