The child domain is allocated with IOMMU_DOMAIN_NESTED type to store
stage-1 translation (i.e. GVA->GPA). This includes the GCR3 root pointer
table along with guest (v2) page tables. The struct iommu_hwpt_amd_v2
contains this information, and is passed from user space as a parameter
to the struct iommu_ops.domain_alloc_nested() callback.
The parent domain is tracked using the struct protection_domain.parent.
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
drivers/iommu/amd/Makefile | 2 +-
drivers/iommu/amd/amd_iommu.h | 5 ++
drivers/iommu/amd/amd_iommu_types.h | 5 ++
drivers/iommu/amd/iommu.c | 22 ++++----
drivers/iommu/amd/nested.c | 79 +++++++++++++++++++++++++++++
5 files changed, 103 insertions(+), 10 deletions(-)
create mode 100644 drivers/iommu/amd/nested.c
diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile
index 59c04a67f398..9ccf9d61810c 100644
--- a/drivers/iommu/amd/Makefile
+++ b/drivers/iommu/amd/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o
+obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o nested.o
obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 3ff380afb9f4..8e86d5b1d915 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -8,6 +8,7 @@
#define AMD_IOMMU_H
#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
#include "amd_iommu_types.h"
@@ -190,4 +191,8 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid);
+/* NESTED */
+struct iommu_domain *
+amd_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
+ u32 flags, const struct iommu_user_data *user_data);
#endif /* AMD_IOMMU_H */
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 5343b99913e4..94f51a09b364 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -20,6 +20,8 @@
#include <linux/irqreturn.h>
#include <linux/io-pgtable.h>
+#include <uapi/linux/iommufd.h>
+
/*
* Maximum number of IOMMUs supported
*/
@@ -605,6 +607,9 @@ struct protection_domain {
struct mmu_notifier mn; /* mmu notifier for the SVA domain */
struct list_head dev_data_list; /* List of pdom_dev_data */
+
+ struct protection_domain *parent; /* Nested parent domain */
+ struct iommu_hwpt_amd_v2 guest_hwpt;
};
/*
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 46682c8ba28d..ea790a8997ee 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2616,6 +2616,7 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
const struct iommu_user_data *user_data)
{
+ struct iommu_domain *dom = ERR_PTR(-EOPNOTSUPP);
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
@@ -2626,29 +2627,31 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
if ((flags & ~supported_flags) || user_data || !is_nest_parent_supported(flags))
return ERR_PTR(-EOPNOTSUPP);
- pr_debug("%s: IOMMU devid=%#x, flags=%#x\n", __func__, dev_data->devid, flags);
+ pr_debug("%s: IOMMU devid=%#x, flags=%#x, supported_flags=%#x\n", __func__, dev_data->devid, flags, supported_flags);
switch (flags & supported_flags) {
case IOMMU_HWPT_ALLOC_DIRTY_TRACKING:
case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT:
case IOMMU_HWPT_ALLOC_NEST_PARENT:
/* Allocate domain with v1 page table for dirty tracking */
- if (!amd_iommu_hd_support(iommu))
- break;
- return do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
+ if (amd_iommu_hd_support(iommu))
+ dom = do_iommu_domain_alloc(dev, flags, PD_MODE_V1);
+ break;
case IOMMU_HWPT_ALLOC_PASID:
/* Allocate domain with v2 page table if IOMMU supports PASID. */
- if (!amd_iommu_pasid_supported())
- break;
- return do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
+ if (amd_iommu_pasid_supported())
+ dom = do_iommu_domain_alloc(dev, flags, PD_MODE_V2);
+ break;
case 0:
/* If nothing specific is required use the kernel commandline default */
- return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
+ dom = do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable);
+ break;
default:
pr_err("%s: Unhandled flag : 0x%x\n", __func__, flags);
break;
}
- return ERR_PTR(-EOPNOTSUPP);
+
+ return dom;
}
void amd_iommu_domain_free(struct iommu_domain *dom)
@@ -3113,6 +3116,7 @@ const struct iommu_ops amd_iommu_ops = {
.release_domain = &release_domain,
.identity_domain = &identity_domain.domain,
.domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags,
+ .domain_alloc_nested = amd_iommu_domain_alloc_nested,
.domain_alloc_sva = amd_iommu_domain_alloc_sva,
.probe_device = amd_iommu_probe_device,
.release_device = amd_iommu_release_device,
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
new file mode 100644
index 000000000000..09f2a455af33
--- /dev/null
+++ b/drivers/iommu/amd/nested.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ */
+
+#define pr_fmt(fmt) "AMD-Vi: " fmt
+#define dev_fmt(fmt) pr_fmt(fmt)
+
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "amd_iommu.h"
+#include "amd_iommu_types.h"
+
+const struct iommu_domain_ops nested_domain_ops = {
+ .free = amd_iommu_domain_free,
+};
+
+static int udata_to_iommu_hwpt_amd_v2(const struct iommu_user_data *user_data,
+ struct iommu_hwpt_amd_v2 *hwpt)
+{
+ if (!user_data)
+ return -EINVAL;
+
+ if (user_data->type != IOMMU_HWPT_DATA_AMD_V2)
+ return -EOPNOTSUPP;
+
+ return iommu_copy_struct_from_user(hwpt, user_data,
+ IOMMU_HWPT_DATA_AMD_V2,
+ dte);
+}
+
+struct iommu_domain *
+amd_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
+ u32 flags, const struct iommu_user_data *user_data)
+{
+ int ret;
+ struct iommu_hwpt_amd_v2 hwpt;
+ struct protection_domain *pdom;
+
+ if (parent->ops != amd_iommu_ops.default_domain_ops)
+ return ERR_PTR(-EINVAL);
+
+ ret = udata_to_iommu_hwpt_amd_v2(user_data, &hwpt);
+ if (ret)
+ return ERR_PTR(ret);
+
+ pdom = kzalloc(sizeof(*pdom), GFP_KERNEL);
+ if (IS_ERR(pdom))
+ return ERR_PTR(-ENOMEM);
+
+ pdom->id = amd_iommu_pdom_id_alloc();
+ if (!pdom->id)
+ goto out_err;
+
+ pr_debug("%s: Allocating nested domain with parent domid=%#x\n",
+ __func__, to_pdomain(parent)->id);
+
+ spin_lock_init(&pdom->lock);
+ INIT_LIST_HEAD(&pdom->dev_list);
+ INIT_LIST_HEAD(&pdom->dev_data_list);
+ xa_init(&pdom->iommu_array);
+
+ pdom->pd_mode = PD_MODE_V2;
+ pdom->iop.pgtbl.cfg.amd.nid = NUMA_NO_NODE;
+ pdom->parent = to_pdomain(parent);
+ pdom->domain.ops = &nested_domain_ops;
+ pdom->domain.type = IOMMU_DOMAIN_NESTED;
+ pdom->domain.geometry.aperture_start = 0;
+ pdom->domain.geometry.aperture_end = ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
+ pdom->domain.geometry.force_aperture = true;
+ pdom->domain.pgsize_bitmap = pdom->iop.pgtbl.cfg.pgsize_bitmap;
+ memcpy(&pdom->guest_hwpt, &hwpt, sizeof(struct iommu_hwpt_amd_v2));
+
+ return &pdom->domain;
+out_err:
+ kfree(pdom);
+ return ERR_PTR(-EINVAL);
+}
--
2.34.1
On Wed, Aug 20, 2025 at 11:30:08AM +0000, Suravee Suthikulpanit wrote: > @@ -605,6 +607,9 @@ struct protection_domain { > > struct mmu_notifier mn; /* mmu notifier for the SVA domain */ > struct list_head dev_data_list; /* List of pdom_dev_data */ > + > + struct protection_domain *parent; /* Nested parent domain */ > + struct iommu_hwpt_amd_v2 guest_hwpt; guest_dte is a better name. > @@ -2616,6 +2616,7 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, > const struct iommu_user_data *user_data) > > { > + struct iommu_domain *dom = ERR_PTR(-EOPNOTSUPP); > struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); > struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); > const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | > @@ -2626,29 +2627,31 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, > if ((flags & ~supported_flags) || user_data || !is_nest_parent_supported(flags)) > return ERR_PTR(-EOPNOTSUPP); > > - pr_debug("%s: IOMMU devid=%#x, flags=%#x\n", __func__, dev_data->devid, flags); > + pr_debug("%s: IOMMU devid=%#x, flags=%#x, supported_flags=%#x\n", __func__, dev_data->devid, flags, supported_flags); > > switch (flags & supported_flags) { > case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: > case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT: > case IOMMU_HWPT_ALLOC_NEST_PARENT: > /* Allocate domain with v1 page table for dirty tracking */ > - if (!amd_iommu_hd_support(iommu)) > - break; > - return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); > + if (amd_iommu_hd_support(iommu)) > + dom = do_iommu_domain_alloc(dev, flags, PD_MODE_V1); > + break; > case IOMMU_HWPT_ALLOC_PASID: > /* Allocate domain with v2 page table if IOMMU supports PASID. 
*/ > - if (!amd_iommu_pasid_supported()) > - break; > - return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); > + if (amd_iommu_pasid_supported()) > + dom = do_iommu_domain_alloc(dev, flags, PD_MODE_V2); > + break; > case 0: > /* If nothing specific is required use the kernel commandline default */ > - return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); > + dom = do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); > + break; > default: > pr_err("%s: Unhandled flag : 0x%x\n", __func__, flags); > break; > } > - return ERR_PTR(-EOPNOTSUPP); > + > + return dom; Why is all this being done? Nothing touches dom on the return path here. > +static int udata_to_iommu_hwpt_amd_v2(const struct iommu_user_data *user_data, > + struct iommu_hwpt_amd_v2 *hwpt) > +{ > + if (!user_data) > + return -EINVAL; > + > + if (user_data->type != IOMMU_HWPT_DATA_AMD_V2) > + return -EOPNOTSUPP; > + > + return iommu_copy_struct_from_user(hwpt, user_data, > + IOMMU_HWPT_DATA_AMD_V2, > + dte); > +} Don't need this helper, iommu_copy_struct_from_user() does everything. > +struct iommu_domain * > +amd_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, > + u32 flags, const struct iommu_user_data *user_data) > +{ > + int ret; > + struct iommu_hwpt_amd_v2 hwpt; > + struct protection_domain *pdom; > + > + if (parent->ops != amd_iommu_ops.default_domain_ops) > + return ERR_PTR(-EINVAL); This should check it was allocated as a parent domain too. > + ret = udata_to_iommu_hwpt_amd_v2(user_data, &hwpt); > + if (ret) > + return ERR_PTR(ret); > + > + pdom = kzalloc(sizeof(*pdom), GFP_KERNEL); > + if (IS_ERR(pdom)) > + return ERR_PTR(-ENOMEM); I'm not sure it makes sense to allocate a protection_domain here, this doesn't really use much of the struct. Can you make it into its own struct? It would be clearer and safer.. 
> + pdom->id = amd_iommu_pdom_id_alloc(); > + if (!pdom->id) > + goto out_err; > + > + pr_debug("%s: Allocating nested domain with parent domid=%#x\n", > + __func__, to_pdomain(parent)->id); > + > + spin_lock_init(&pdom->lock); > + INIT_LIST_HEAD(&pdom->dev_list); > + INIT_LIST_HEAD(&pdom->dev_data_list); > + xa_init(&pdom->iommu_array); > + > + pdom->pd_mode = PD_MODE_V2; Nothing should read pd_mode, please check it.. > + pdom->iop.pgtbl.cfg.amd.nid = NUMA_NO_NODE; > + pdom->parent = to_pdomain(parent); > + pdom->domain.ops = &nested_domain_ops; > + pdom->domain.type = IOMMU_DOMAIN_NESTED; > + pdom->domain.geometry.aperture_start = 0; > + pdom->domain.geometry.aperture_end = ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); > + pdom->domain.geometry.force_aperture = true; > + pdom->domain.pgsize_bitmap = pdom->iop.pgtbl.cfg.pgsize_bitmap; And all of these are unnecessary, should never be read. jason
On 8/20/2025 5:00 PM, Suravee Suthikulpanit wrote: > The child domain is allocated with IOMMU_DOMAIN_NESTED type to store > stage-1 translation (i.e. GVA->GPA). This includes the GCR3 root pointer > table along with guest (v2) page tables. The struct iommu_hwpt_amd_v2 > contains this information, and is passed from user-space as a parameter > of the struct iommu_ops.domain_alloc_nested(). > > The parent domain is tracked using the struct protection_domain.parent. > > Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> > --- > drivers/iommu/amd/Makefile | 2 +- > drivers/iommu/amd/amd_iommu.h | 5 ++ > drivers/iommu/amd/amd_iommu_types.h | 5 ++ > drivers/iommu/amd/iommu.c | 22 ++++---- > drivers/iommu/amd/nested.c | 79 +++++++++++++++++++++++++++++ > 5 files changed, 103 insertions(+), 10 deletions(-) > create mode 100644 drivers/iommu/amd/nested.c > > diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile > index 59c04a67f398..9ccf9d61810c 100644 > --- a/drivers/iommu/amd/Makefile > +++ b/drivers/iommu/amd/Makefile > @@ -1,3 +1,3 @@ > # SPDX-License-Identifier: GPL-2.0-only > -obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o > +obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o nested.o > obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o > diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h > index 3ff380afb9f4..8e86d5b1d915 100644 > --- a/drivers/iommu/amd/amd_iommu.h > +++ b/drivers/iommu/amd/amd_iommu.h > @@ -8,6 +8,7 @@ > #define AMD_IOMMU_H > > #include <linux/iommu.h> > +#include <uapi/linux/iommufd.h> > > #include "amd_iommu_types.h" > > @@ -190,4 +191,8 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain, > struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); > struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); > > +/* NESTED */ > +struct iommu_domain * > +amd_iommu_domain_alloc_nested(struct device *dev, 
struct iommu_domain *parent, > + u32 flags, const struct iommu_user_data *user_data); > #endif /* AMD_IOMMU_H */ > diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h > index 5343b99913e4..94f51a09b364 100644 > --- a/drivers/iommu/amd/amd_iommu_types.h > +++ b/drivers/iommu/amd/amd_iommu_types.h > @@ -20,6 +20,8 @@ > #include <linux/irqreturn.h> > #include <linux/io-pgtable.h> > > +#include <uapi/linux/iommufd.h> > + > /* > * Maximum number of IOMMUs supported > */ > @@ -605,6 +607,9 @@ struct protection_domain { > > struct mmu_notifier mn; /* mmu notifier for the SVA domain */ > struct list_head dev_data_list; /* List of pdom_dev_data */ > + > + struct protection_domain *parent; /* Nested parent domain */ > + struct iommu_hwpt_amd_v2 guest_hwpt; > }; > > /* > diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c > index 46682c8ba28d..ea790a8997ee 100644 > --- a/drivers/iommu/amd/iommu.c > +++ b/drivers/iommu/amd/iommu.c > @@ -2616,6 +2616,7 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, > const struct iommu_user_data *user_data) > > { > + struct iommu_domain *dom = ERR_PTR(-EOPNOTSUPP); > struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); > struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); > const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | > @@ -2626,29 +2627,31 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, > if ((flags & ~supported_flags) || user_data || !is_nest_parent_supported(flags)) > return ERR_PTR(-EOPNOTSUPP); > > - pr_debug("%s: IOMMU devid=%#x, flags=%#x\n", __func__, dev_data->devid, flags); > + pr_debug("%s: IOMMU devid=%#x, flags=%#x, supported_flags=%#x\n", __func__, dev_data->devid, flags, supported_flags); > > switch (flags & supported_flags) { > case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: > case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT: > case IOMMU_HWPT_ALLOC_NEST_PARENT: > /* Allocate domain with v1 page table 
for dirty tracking */ > - if (!amd_iommu_hd_support(iommu)) > - break; > - return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); > + if (amd_iommu_hd_support(iommu)) > + dom = do_iommu_domain_alloc(dev, flags, PD_MODE_V1); > + break; > case IOMMU_HWPT_ALLOC_PASID: > /* Allocate domain with v2 page table if IOMMU supports PASID. */ > - if (!amd_iommu_pasid_supported()) > - break; > - return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); > + if (amd_iommu_pasid_supported()) > + dom = do_iommu_domain_alloc(dev, flags, PD_MODE_V2); > + break; > case 0: > /* If nothing specific is required use the kernel commandline default */ > - return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); > + dom = do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); > + break; > default: > pr_err("%s: Unhandled flag : 0x%x\n", __func__, flags); > break; > } > - return ERR_PTR(-EOPNOTSUPP); > + > + return dom; > } > > void amd_iommu_domain_free(struct iommu_domain *dom) > @@ -3113,6 +3116,7 @@ const struct iommu_ops amd_iommu_ops = { > .release_domain = &release_domain, > .identity_domain = &identity_domain.domain, > .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, > + .domain_alloc_nested = amd_iommu_domain_alloc_nested, > .domain_alloc_sva = amd_iommu_domain_alloc_sva, > .probe_device = amd_iommu_probe_device, > .release_device = amd_iommu_release_device, > diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c > new file mode 100644 > index 000000000000..09f2a455af33 > --- /dev/null > +++ b/drivers/iommu/amd/nested.c > @@ -0,0 +1,79 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2023 Advanced Micro Devices, Inc. 
> + */ > + > +#define pr_fmt(fmt) "AMD-Vi: " fmt > +#define dev_fmt(fmt) pr_fmt(fmt) > + > +#include <linux/iommu.h> > +#include <uapi/linux/iommufd.h> > + > +#include "amd_iommu.h" > +#include "amd_iommu_types.h" > + > +const struct iommu_domain_ops nested_domain_ops = { > + .free = amd_iommu_domain_free, > +}; > + > +static int udata_to_iommu_hwpt_amd_v2(const struct iommu_user_data *user_data, > + struct iommu_hwpt_amd_v2 *hwpt) > +{ > + if (!user_data) > + return -EINVAL; > + > + if (user_data->type != IOMMU_HWPT_DATA_AMD_V2) > + return -EOPNOTSUPP; > + > + return iommu_copy_struct_from_user(hwpt, user_data, > + IOMMU_HWPT_DATA_AMD_V2, > + dte); > +} > + > +struct iommu_domain * > +amd_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, > + u32 flags, const struct iommu_user_data *user_data) > +{ > + int ret; > + struct iommu_hwpt_amd_v2 hwpt; > + struct protection_domain *pdom; > + > + if (parent->ops != amd_iommu_ops.default_domain_ops) > + return ERR_PTR(-EINVAL); > + > + ret = udata_to_iommu_hwpt_amd_v2(user_data, &hwpt); > + if (ret) > + return ERR_PTR(ret); > + > + pdom = kzalloc(sizeof(*pdom), GFP_KERNEL); > + if (IS_ERR(pdom)) > + return ERR_PTR(-ENOMEM); > + > + pdom->id = amd_iommu_pdom_id_alloc(); > + if (!pdom->id) > + goto out_err; > + > + pr_debug("%s: Allocating nested domain with parent domid=%#x\n", > + __func__, to_pdomain(parent)->id); > + > + spin_lock_init(&pdom->lock); > + INIT_LIST_HEAD(&pdom->dev_list); > + INIT_LIST_HEAD(&pdom->dev_data_list); > + xa_init(&pdom->iommu_array); > + > + pdom->pd_mode = PD_MODE_V2; > + pdom->iop.pgtbl.cfg.amd.nid = NUMA_NO_NODE; > + pdom->parent = to_pdomain(parent); > + pdom->domain.ops = &nested_domain_ops; > + pdom->domain.type = IOMMU_DOMAIN_NESTED; > + pdom->domain.geometry.aperture_start = 0; > + pdom->domain.geometry.aperture_end = ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); > + pdom->domain.geometry.force_aperture = true; > + pdom->domain.pgsize_bitmap = 
pdom->iop.pgtbl.cfg.pgsize_bitmap; Hi Suravee, You are assigning an uninitialized pdom->iop.pgtbl.cfg.pgsize_bitmap to pdom->domain.pgsize_bitmap. In the non-nested domain attach, pdom_setup_pgtable() takes care of initializing pgsize_bitmap. Thanks Sairaj > + memcpy(&pdom->guest_hwpt, &hwpt, sizeof(struct iommu_hwpt_amd_v2)); > + > + return &pdom->domain; > +out_err: > + kfree(pdom); > + return ERR_PTR(-EINVAL); > +}
On Mon, Sep 01, 2025 at 12:39:56PM +0530, Sairaj Kodilkar wrote: > > + pdom->pd_mode = PD_MODE_V2; > > + pdom->parent = to_pdomain(parent); > > + pdom->domain.ops = &nested_domain_ops; > > + pdom->domain.type = IOMMU_DOMAIN_NESTED; > > + pdom->iop.pgtbl.cfg.amd.nid = NUMA_NO_NODE; > > + pdom->domain.geometry.aperture_start = 0; > > + pdom->domain.geometry.aperture_end = ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); > > + pdom->domain.geometry.force_aperture = true; > > + pdom->domain.pgsize_bitmap = pdom->iop.pgtbl.cfg.pgsize_bitmap; > > Hi Suravee, > You are assigning a unintialized pdom->iop.pgtbl.cfg.pgsize_bitmap to > pdom->domain.pgsize_bitmap. In non-nested domain attach pdom_setup_pgtable() > takes care of initializing pgsize_bitmap. nested domains should not have a map/unmap function in their ops so none of this code should be present. Jason
On Wed, Aug 20, 2025 at 11:30:08AM +0000, Suravee Suthikulpanit wrote: > diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c > index 46682c8ba28d..ea790a8997ee 100644 > --- a/drivers/iommu/amd/iommu.c > +++ b/drivers/iommu/amd/iommu.c > @@ -2616,6 +2616,7 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, > const struct iommu_user_data *user_data) > > { > + struct iommu_domain *dom = ERR_PTR(-EOPNOTSUPP); > struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); > struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); > const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | > @@ -2626,29 +2627,31 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, > if ((flags & ~supported_flags) || user_data || !is_nest_parent_supported(flags)) > return ERR_PTR(-EOPNOTSUPP); > > - pr_debug("%s: IOMMU devid=%#x, flags=%#x\n", __func__, dev_data->devid, flags); > + pr_debug("%s: IOMMU devid=%#x, flags=%#x, supported_flags=%#x\n", __func__, dev_data->devid, flags, supported_flags); > > switch (flags & supported_flags) { > case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: > case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT: > case IOMMU_HWPT_ALLOC_NEST_PARENT: > /* Allocate domain with v1 page table for dirty tracking */ > - if (!amd_iommu_hd_support(iommu)) > - break; > - return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); > + if (amd_iommu_hd_support(iommu)) > + dom = do_iommu_domain_alloc(dev, flags, PD_MODE_V1); > + break; > case IOMMU_HWPT_ALLOC_PASID: > /* Allocate domain with v2 page table if IOMMU supports PASID. 
*/ > - if (!amd_iommu_pasid_supported()) > - break; > - return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); > + if (amd_iommu_pasid_supported()) > + dom = do_iommu_domain_alloc(dev, flags, PD_MODE_V2); > + break; > case 0: > /* If nothing specific is required use the kernel commandline default */ > - return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); > + dom = do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); > + break; > default: > pr_err("%s: Unhandled flag : 0x%x\n", __func__, flags); > break; > } > - return ERR_PTR(-EOPNOTSUPP); > + > + return dom; These seem better to be a preparatory patch. > @@ -3113,6 +3116,7 @@ const struct iommu_ops amd_iommu_ops = { > .release_domain = &release_domain, > .identity_domain = &identity_domain.domain, > .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, > + .domain_alloc_nested = amd_iommu_domain_alloc_nested, > .domain_alloc_sva = amd_iommu_domain_alloc_sva, > .probe_device = amd_iommu_probe_device, > .release_device = amd_iommu_release_device, This will be an HWPT-based nesting support, v.s. vIOMMU-based. If AMD wants to enable its Command/Event Buffers, I think this should follow the vIOMMU model instead. > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (C) 2023 Advanced Micro Devices, Inc. > + */ > + > +#define pr_fmt(fmt) "AMD-Vi: " fmt > +#define dev_fmt(fmt) pr_fmt(fmt) > + > +#include <linux/iommu.h> > +#include <uapi/linux/iommufd.h> > + > +#include "amd_iommu.h" > +#include "amd_iommu_types.h" It seems that you already included the uapi header in "amd_iommu.h". > +static int udata_to_iommu_hwpt_amd_v2(const struct iommu_user_data *user_data, > + struct iommu_hwpt_amd_v2 *hwpt) > +{ > + if (!user_data) > + return -EINVAL; > + > + if (user_data->type != IOMMU_HWPT_DATA_AMD_V2) > + return -EOPNOTSUPP; > + iommu_copy_struct_from_user() internally checks these two already. 
> + return iommu_copy_struct_from_user(hwpt, user_data, > + IOMMU_HWPT_DATA_AMD_V2, > + dte); > +} > + > +struct iommu_domain * > +amd_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, > + u32 flags, const struct iommu_user_data *user_data) > +{ > + int ret; > + struct iommu_hwpt_amd_v2 hwpt; > + struct protection_domain *pdom; > + > + if (parent->ops != amd_iommu_ops.default_domain_ops) > + return ERR_PTR(-EINVAL); > + > + ret = udata_to_iommu_hwpt_amd_v2(user_data, &hwpt); > + if (ret) > + return ERR_PTR(ret); > + > + pdom = kzalloc(sizeof(*pdom), GFP_KERNEL); > + if (IS_ERR(pdom)) > + return ERR_PTR(-ENOMEM); > + > + pdom->id = amd_iommu_pdom_id_alloc(); > + if (!pdom->id) > + goto out_err; This seems incorrect. amd_iommu_pdom_id_alloc() is a wrapper of the ida_alloc_range() that would return -ENOMEM or -ENOSPC on failure. Also, -EINVAL in out_err isn't nice to replace either of them. So, I think this should be: if (pdom->id <= 0) { ret = pdom->id; goto out_err; } > + > + pr_debug("%s: Allocating nested domain with parent domid=%#x\n", > + __func__, to_pdomain(parent)->id); > + > + spin_lock_init(&pdom->lock); > + INIT_LIST_HEAD(&pdom->dev_list); > + INIT_LIST_HEAD(&pdom->dev_data_list); > + xa_init(&pdom->iommu_array); > + > + pdom->pd_mode = PD_MODE_V2; > + pdom->iop.pgtbl.cfg.amd.nid = NUMA_NO_NODE; > + pdom->parent = to_pdomain(parent); > + pdom->domain.ops = &nested_domain_ops; > + pdom->domain.type = IOMMU_DOMAIN_NESTED; > + pdom->domain.geometry.aperture_start = 0; > + pdom->domain.geometry.aperture_end = ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); > + pdom->domain.geometry.force_aperture = true; > + pdom->domain.pgsize_bitmap = pdom->iop.pgtbl.cfg.pgsize_bitmap; > + memcpy(&pdom->guest_hwpt, &hwpt, sizeof(struct iommu_hwpt_amd_v2)); How about just hold a "struct dev_table_entry guest_dte" in the pdom, instead of holding a uAPI structure? Nicolin
On Fri, Aug 22, 2025 at 12:51:02PM -0700, Nicolin Chen wrote: > > @@ -3113,6 +3116,7 @@ const struct iommu_ops amd_iommu_ops = { > > .release_domain = &release_domain, > > .identity_domain = &identity_domain.domain, > > .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, > > + .domain_alloc_nested = amd_iommu_domain_alloc_nested, > > .domain_alloc_sva = amd_iommu_domain_alloc_sva, > > .probe_device = amd_iommu_probe_device, > > .release_device = amd_iommu_release_device, > > This will be an HWPT-based nesting support, v.s. vIOMMU-based. > > If AMD wants to enable its Command/Event Buffers, I think this > should follow the vIOMMU model instead. I've been expecting drivers to do both, like ARM.. Nested is the basic infrastructure and then the viommu changes what domain id nested will use similar to how ARM constrains the VMID based on the viommu. Suravee is this how you see AMD evolving as well? Jason
On Fri, Aug 22, 2025 at 06:16:18PM -0300, Jason Gunthorpe wrote: > On Fri, Aug 22, 2025 at 12:51:02PM -0700, Nicolin Chen wrote: > > > @@ -3113,6 +3116,7 @@ const struct iommu_ops amd_iommu_ops = { > > > .release_domain = &release_domain, > > > .identity_domain = &identity_domain.domain, > > > .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, > > > + .domain_alloc_nested = amd_iommu_domain_alloc_nested, > > > .domain_alloc_sva = amd_iommu_domain_alloc_sva, > > > .probe_device = amd_iommu_probe_device, > > > .release_device = amd_iommu_release_device, > > > > This will be an HWPT-based nesting support, v.s. vIOMMU-based. > > > > If AMD wants to enable its Command/Event Buffers, I think this > > should follow the vIOMMU model instead. > > I've been expecting drivers to do both, like ARM.. Nested is the basic > infrastructure and then the viommu changes what domain id nested will > use similar to how ARM constrains the VMID based on the viommu. Hmm, we haven't implemented both in ARM but only the vIOMMU-based one.. Yea, AMD could start with the HWPT-based one, but that would need an invalidation op(?), which seems missing in this series. So, to support direct invalidation via Command Buffers, vIOMMU should be the only option, right? Nicolin
On Fri, Aug 22, 2025 at 02:45:58PM -0700, Nicolin Chen wrote: > On Fri, Aug 22, 2025 at 06:16:18PM -0300, Jason Gunthorpe wrote: > > On Fri, Aug 22, 2025 at 12:51:02PM -0700, Nicolin Chen wrote: > > > > @@ -3113,6 +3116,7 @@ const struct iommu_ops amd_iommu_ops = { > > > > .release_domain = &release_domain, > > > > .identity_domain = &identity_domain.domain, > > > > .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, > > > > + .domain_alloc_nested = amd_iommu_domain_alloc_nested, > > > > .domain_alloc_sva = amd_iommu_domain_alloc_sva, > > > > .probe_device = amd_iommu_probe_device, > > > > .release_device = amd_iommu_release_device, > > > > > > This will be an HWPT-based nesting support, v.s. vIOMMU-based. > > > > > > If AMD wants to enable its Command/Event Buffers, I think this > > > should follow the vIOMMU model instead. > > > > I've been expecting drivers to do both, like ARM.. Nested is the basic > > infrastructure and then the viommu changes what domain id nested will > > use similar to how ARM constrains the VMID based on the viommu. > > Hmm, we haven't implemented both in ARM but only the vIOMMU-based > one.. Oh? I mis-remember we had kept the hwpt based non-ATS invalidation around.. I guess that was Intel that ended up like that.. OK then > Yea, AMD could start with the HWPT-based one, but that would need > an invalidation op(?), which seems missing in this series. So, to > support direct invalidation via Command Buffers, vIOMMU should be > the only option, right? Certainly if there is no invalidation op the series is incomplete. Jason
Hi Suravee, kernel test robot noticed the following build warnings: [auto build test WARNING on linus/master] [also build test WARNING on v6.17-rc2 next-20250821] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Suravee-Suthikulpanit/iommu-amd-Make-amd_iommu_pdom_id_alloc-non-static/20250820-194937 base: linus/master patch link: https://lore.kernel.org/r/20250820113009.5233-8-suravee.suthikulpanit%40amd.com patch subject: [PATCH 7/8] iommu/amd: Add support for nested domain allocation config: x86_64-randconfig-123-20250822 (https://download.01.org/0day-ci/archive/20250822/202508221308.4CwLNeZw-lkp@intel.com/config) compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250822/202508221308.4CwLNeZw-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202508221308.4CwLNeZw-lkp@intel.com/ sparse warnings: (new ones prefixed by >>) >> drivers/iommu/amd/nested.c:15:31: sparse: sparse: symbol 'nested_domain_ops' was not declared. Should it be static? vim +/nested_domain_ops +15 drivers/iommu/amd/nested.c 14 > 15 const struct iommu_domain_ops nested_domain_ops = { 16 .free = amd_iommu_domain_free, 17 }; 18 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
© 2016 - 2025 Red Hat, Inc.