Each nested domain is assigned a guest domain ID (gDomID), which the guest OS
programs into the guest Device Table Entry (gDTE). For each gDomID, the driver
assigns a corresponding host domain ID (hDomID), which is programmed into the
host Device Table Entry (hDTE).

The hDomID is allocated in amd_iommu_alloc_domain_nested() and freed in
nested_domain_free(). The gDomID-to-hDomID mapping info
(struct guest_domain_mapping_info) is stored in a per-vIOMMU xarray
(struct amd_iommu_viommu.gdomid_array), which is indexed by gDomID.

Note also that the nest parent domain can be shared among multiple
struct iommufd_viommu instances. Therefore, when the hypervisor invalidates
the nest parent domain, the AMD IOMMU command INVALIDATE_IOMMU_PAGES must be
issued for each hDomID in the gdomid_array. This is handled by
iommu_flush_pages_v1_hdom_ids(), which iterates through
struct protection_domain.viommu_list.
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
drivers/iommu/amd/amd_iommu_types.h | 23 +++++++++
drivers/iommu/amd/iommu.c | 35 +++++++++++++
drivers/iommu/amd/iommufd.c | 34 ++++++++++++
drivers/iommu/amd/nested.c | 80 +++++++++++++++++++++++++++++
4 files changed, 172 insertions(+)
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index e46f346fd6c5..734f6a753b3a 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -498,6 +498,22 @@ struct pdom_iommu_info {
struct amd_iommu_viommu {
struct iommufd_viommu core;
struct protection_domain *parent; /* nest parent domain for this viommu */
+ struct list_head pdom_list; /* For protection_domain->viommu_list */
+
+ /*
+ * Per-vIOMMU guest domain ID to host domain ID mapping.
+ * Indexed by guest domain ID.
+ */
+ struct xarray gdomid_array;
+};
+
+/*
+ * Contains guest domain ID mapping info,
+ * which is stored in the struct xarray gdomid_array.
+ */
+struct guest_domain_mapping_info {
+ refcount_t users;
+ u32 hdom_id; /* Host domain ID */
};
/*
@@ -506,6 +522,7 @@ struct amd_iommu_viommu {
struct nested_domain {
struct iommu_domain domain; /* generic domain handle used by iommu core code */
u16 gdom_id; /* domain ID from gDTE */
+ struct guest_domain_mapping_info *gdom_info;
struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */
struct amd_iommu_viommu *viommu; /* AMD hw-viommu this nested domain belong to */
};
@@ -530,6 +547,12 @@ struct protection_domain {
struct mmu_notifier mn; /* mmu notifier for the SVA domain */
struct list_head dev_data_list; /* List of pdom_dev_data */
+
+ /*
+	 * List of vIOMMUs that use this protection domain as the nest parent.
+	 * This is used to look up host domain IDs when flushing this domain.
+ */
+ struct list_head viommu_list;
};
PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 724b8723b836..6a26e7a28141 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1513,6 +1513,29 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
iommu_completion_wait(iommu);
}
+static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size)
+{
+ int ret = 0;
+ struct amd_iommu_viommu *aviommu;
+
+ list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
+ unsigned long i;
+ struct guest_domain_mapping_info *gdom_info;
+ struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev, struct amd_iommu, iommu);
+
+ xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
+ struct iommu_cmd cmd;
+
+ pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__,
+ iommu->devid, gdom_info->hdom_id);
+ build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id,
+ IOMMU_NO_PASID, false);
+ ret |= iommu_queue_command(iommu, &cmd);
+ }
+ }
+ return ret;
+}
+
static void amd_iommu_flush_all(struct amd_iommu *iommu)
{
struct iommu_cmd cmd;
@@ -1661,6 +1684,17 @@ static int domain_flush_pages_v1(struct protection_domain *pdom,
ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd);
}
+ /*
+	 * A domain with a v1 table can be a nest parent, which can have
+	 * multiple nested domains. Each nested domain has a 1:1 mapping
+	 * between gDomID and hDomID. Therefore, flush every hDomID
+	 * associated with this nest parent domain.
+ *
+ * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested()
+ */
+ if (!list_empty(&pdom->viommu_list))
+ ret |= iommu_flush_pages_v1_hdom_ids(pdom, address, size);
+
return ret;
}
@@ -2469,6 +2503,7 @@ static void protection_domain_init(struct protection_domain *domain)
spin_lock_init(&domain->lock);
INIT_LIST_HEAD(&domain->dev_list);
INIT_LIST_HEAD(&domain->dev_data_list);
+ INIT_LIST_HEAD(&domain->viommu_list);
xa_init(&domain->iommu_array);
}
diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c
index eb6119bdcf12..bb53475f9171 100644
--- a/drivers/iommu/amd/iommufd.c
+++ b/drivers/iommu/amd/iommufd.c
@@ -9,6 +9,8 @@
#include "amd_iommu.h"
#include "amd_iommu_types.h"
+static const struct iommufd_viommu_ops amd_viommu_ops;
+
void *amd_iommufd_hw_info(struct device *dev, u32 *length, u32 *type)
{
struct iommu_hw_info_amd *hwinfo;
@@ -38,10 +40,42 @@ size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type vi
int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent,
const struct iommu_user_data *user_data)
{
+ unsigned long flags;
struct protection_domain *pdom = to_pdomain(parent);
struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
+ xa_init(&aviommu->gdomid_array);
aviommu->parent = pdom;
+ viommu->ops = &amd_viommu_ops;
+
+ spin_lock_irqsave(&pdom->lock, flags);
+ list_add(&aviommu->pdom_list, &pdom->viommu_list);
+ spin_unlock_irqrestore(&pdom->lock, flags);
+
return 0;
}
+
+static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu)
+{
+ unsigned long flags;
+ struct amd_iommu_viommu *entry, *next;
+ struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
+ struct protection_domain *pdom = aviommu->parent;
+
+ spin_lock_irqsave(&pdom->lock, flags);
+ list_for_each_entry_safe(entry, next, &pdom->viommu_list, pdom_list) {
+ if (entry == aviommu)
+ list_del(&entry->pdom_list);
+ }
+ spin_unlock_irqrestore(&pdom->lock, flags);
+
+}
+
+/*
+ * See include/linux/iommufd.h
+ * struct iommufd_viommu_ops - vIOMMU specific operations
+ */
+static const struct iommufd_viommu_ops amd_viommu_ops = {
+ .destroy = amd_iommufd_viommu_destroy,
+};
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
index dd3e53dd16ea..1bbcb16abecc 100644
--- a/drivers/iommu/amd/nested.c
+++ b/drivers/iommu/amd/nested.c
@@ -6,6 +6,7 @@
#define dev_fmt(fmt) "AMD-Vi: " fmt
#include <linux/iommu.h>
+#include <linux/refcount.h>
#include <uapi/linux/iommufd.h>
#include "amd_iommu.h"
@@ -68,6 +69,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
{
int ret;
struct nested_domain *ndom;
+ struct guest_domain_mapping_info *gdom_info, *curr;
struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
@@ -92,7 +94,60 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
ndom->domain.type = IOMMU_DOMAIN_NESTED;
ndom->viommu = aviommu;
+ gdom_info = kzalloc(sizeof(*gdom_info), GFP_KERNEL);
+ if (!gdom_info)
+ goto out_err;
+
+ /*
+	 * Normally, when a guest has multiple pass-through devices, the
+	 * IOMMU driver sets up DTEs with the same stage-2 table and uses
+	 * the same host domain ID (hDomID). With nested translation, if the
+	 * guest sets up different stage-1 tables with the same PASID, the
+	 * IOMMU would use the same TLB tag, which results in TLB aliasing.
+	 *
+	 * The guest assigns gDomIDs based on its own algorithm for managing
+	 * cache tags of (DomID, PASID). Within a single vIOMMU, the nest
+	 * parent domain (with the S2 table) is used by all DTEs, but each
+	 * gDomID must be consistently mapped to a single hDomID. This is
+	 * done using an xarray in the vIOMMU to keep track of the gDomID
+	 * mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES
+	 * command must be issued for each hDomID in the xarray.
+ */
+ curr = xa_cmpxchg(&aviommu->gdomid_array,
+ ndom->gdom_id, NULL, gdom_info, GFP_ATOMIC);
+ if (curr) {
+ if (xa_err(curr)) {
+ ret = -EINVAL;
+ goto out_err_gdom_info;
+ } else {
+			/* The gDomID already exists */
+ pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n",
+ __func__, ndom->gdom_id, curr->hdom_id);
+ refcount_inc(&curr->users);
+ ndom->gdom_info = curr;
+ kfree(gdom_info);
+ return &ndom->domain;
+ }
+ }
+
+ /* The gDomID does not exist. We allocate new hdom_id */
+ gdom_info->hdom_id = amd_iommu_pdom_id_alloc();
+ if (gdom_info->hdom_id <= 0) {
+ xa_cmpxchg(&aviommu->gdomid_array,
+ ndom->gdom_id, gdom_info, NULL, GFP_ATOMIC);
+ ret = -ENOSPC;
+ goto out_err_gdom_info;
+ }
+
+ refcount_set(&gdom_info->users, 1);
+ ndom->gdom_info = gdom_info;
+ pr_debug("%s: Allocate gdom_id=%#x, hdom_id=%#x\n",
+ __func__, ndom->gdom_id, gdom_info->hdom_id);
+
return &ndom->domain;
+
+out_err_gdom_info:
+ kfree(gdom_info);
out_err:
kfree(ndom);
return ERR_PTR(ret);
@@ -100,8 +155,33 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
static void nested_domain_free(struct iommu_domain *dom)
{
+ struct guest_domain_mapping_info *curr;
struct nested_domain *ndom = to_ndomain(dom);
+ struct amd_iommu_viommu *aviommu = ndom->viommu;
+
+ if (!refcount_dec_and_test(&ndom->gdom_info->users))
+ return;
+ /*
+ * The refcount for the gdom_id to hdom_id mapping is zero.
+ * It is now safe to remove the mapping.
+ */
+ curr = xa_cmpxchg(&aviommu->gdomid_array, ndom->gdom_id,
+ ndom->gdom_info, NULL, GFP_ATOMIC);
+ if (curr) {
+ if (xa_err(curr)) {
+ pr_err("%s: Failed to free nested domain gdom_id=%#x\n",
+ __func__, ndom->gdom_id);
+ return;
+ }
+
+ /* success */
+ pr_debug("%s: Free gdom_id=%#x, hdom_id=%#x\n",
+ __func__, ndom->gdom_id, curr->hdom_id);
+ kfree(curr);
+ }
+
+ amd_iommu_pdom_id_free(ndom->gdom_info->hdom_id);
kfree(ndom);
}
--
2.34.1
On Wed, Nov 12, 2025 at 06:25:03PM +0000, Suravee Suthikulpanit wrote:
> Each nested domain is assigned guest domain ID (gDomID), which guest OS
> programs into guest Device Table Entry (gDTE). For each gDomID, the driver
> assigns a corresponding host domain ID (hDomID), which will be programmed
> into the host Device Table Entry (hDTE).
>
> The hDomID is allocated during amd_iommu_alloc_domain_nested(),
> and free during nested_domain_free(). The gDomID-to-hDomID mapping info
> (struct guest_domain_mapping_info) is stored in a per-viommu xarray
> (struct amd_iommu_viommu.gdomid_array), which is indexed by gDomID.
>
> Note also that parent domain can be shared among struct iommufd_viommu.
> Therefore, when hypervisor invalidates the nest parent domain, the AMD
> IOMMU command INVALIDATE_IOMMU_PAGES must be issued for each hDomID in
> the gdomid_array. This is handled by the iommu_flush_pages_v1_hdom_ids(),
> where it iterates through struct protection_domain.viommu_list.
>
> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> ---
> drivers/iommu/amd/amd_iommu_types.h | 23 +++++++++
> drivers/iommu/amd/iommu.c | 35 +++++++++++++
> drivers/iommu/amd/iommufd.c | 34 ++++++++++++
> drivers/iommu/amd/nested.c | 80 +++++++++++++++++++++++++++++
> 4 files changed, 172 insertions(+)
I think this looks OK in general, just the locking needs fixing up.
> +static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size)
> +{
> + int ret = 0;
> + struct amd_iommu_viommu *aviommu;
> +
> + list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
> + unsigned long i;
> + struct guest_domain_mapping_info *gdom_info;
> + struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev, struct amd_iommu, iommu);
> +
> + xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
> + struct iommu_cmd cmd;
What is the locking for the xa here?
It looks missing too. Either hold the xa lock for this iteration, or
do something to hold the pdom lock when updating the xarray.
> + pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__,
> + iommu->devid, gdom_info->hdom_id);
> + build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id,
> + IOMMU_NO_PASID, false);
> + ret |= iommu_queue_command(iommu, &cmd);
> + }
> + }
> + return ret;
> +}
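
If going the xa_lock route, the iteration would look roughly like this
(just a sketch, untested):

	xa_lock(&aviommu->gdomid_array);
	xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
		struct iommu_cmd cmd;

		build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id,
				      IOMMU_NO_PASID, false);
		ret |= iommu_queue_command(iommu, &cmd);
	}
	xa_unlock(&aviommu->gdomid_array);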
This is kind of painfully slow for invalidation but OK for now, we
don't really expect a lot of parent invalidation traffic..
I think after we get this landed:
https://lore.kernel.org/r/cover.1762588839.git.nicolinc@nvidia.com
We should take a serious run at trying to make it shared code and have
AMD use it. That would resolve the performance concern..
> +static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu)
> +{
> + unsigned long flags;
> + struct amd_iommu_viommu *entry, *next;
> + struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
> + struct protection_domain *pdom = aviommu->parent;
> +
> + spin_lock_irqsave(&pdom->lock, flags);
> + list_for_each_entry_safe(entry, next, &pdom->viommu_list, pdom_list) {
> + if (entry == aviommu)
> + list_del(&entry->pdom_list);
> + }
> + spin_unlock_irqrestore(&pdom->lock, flags);
Same remark as Nicolin
> static void nested_domain_free(struct iommu_domain *dom)
> {
> + struct guest_domain_mapping_info *curr;
> struct nested_domain *ndom = to_ndomain(dom);
> + struct amd_iommu_viommu *aviommu = ndom->viommu;
> +
> + if (!refcount_dec_and_test(&ndom->gdom_info->users))
> + return;
Same locking issues here; you have to hold the xa_lock from this
decrement through to the cmpxchg, otherwise it will go wrong.
> + /*
> + * The refcount for the gdom_id to hdom_id mapping is zero.
> + * It is now safe to remove the mapping.
> + */
> + curr = xa_cmpxchg(&aviommu->gdomid_array, ndom->gdom_id,
> + ndom->gdom_info, NULL, GFP_ATOMIC);
> + if (curr) {
> + if (xa_err(curr)) {
> + pr_err("%s: Failed to free nested domain gdom_id=%#x\n",
> + __func__, ndom->gdom_id);
Don't log.. This should be a WARN_ON, the cmpxchg cannot fail unless
the xa is corrupted.
> + return;
> + }
> +
> + /* success */
> + pr_debug("%s: Free gdom_id=%#x, hdom_id=%#x\n",
> + __func__, ndom->gdom_id, curr->hdom_id);
> + kfree(curr);
> + }
> +
> + amd_iommu_pdom_id_free(ndom->gdom_info->hdom_id);
?? This should be before the kfree(curr). The hdom_id remains
allocated and valid as long as the gdom_info is allocated.
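
Putting the two together, the free path could look roughly like this
(just a sketch, using __xa_erase() instead of the cmpxchg, untested):

	xa_lock(&aviommu->gdomid_array);
	if (refcount_dec_and_test(&ndom->gdom_info->users)) {
		__xa_erase(&aviommu->gdomid_array, ndom->gdom_id);
		amd_iommu_pdom_id_free(ndom->gdom_info->hdom_id);
		kfree(ndom->gdom_info);
	}
	xa_unlock(&aviommu->gdomid_array);
	kfree(ndom);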
Jason
On Tue, Nov 18, 2025 at 08:11:51PM -0400, Jason Gunthorpe wrote:
> On Wed, Nov 12, 2025 at 06:25:03PM +0000, Suravee Suthikulpanit wrote:
> > +static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size)
> > +{
> > + int ret = 0;
> > + struct amd_iommu_viommu *aviommu;
> > +
> > + list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
> > + unsigned long i;
> > + struct guest_domain_mapping_info *gdom_info;
> > + struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev, struct amd_iommu, iommu);
> > +
> > + xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
> This is kind of painfully slow for invalidation but OK for now, we
> don't really expect alot of parent invalidation traffic..
>
> I think after we get this landed:
>
> https://lore.kernel.org/r/cover.1762588839.git.nicolinc@nvidia.com
>
> We should take a serious run at trying to make it shared code and have
> AMD use it. That would resolve the performance concern..
Yea. I had the same feeling when looking at AMD's patch, and
realized that you did foresee all of these.
I've started work on the VMID sharing using the invs. Perhaps
I can add some patches beforehand to generalize those helpers.
Thanks
Nicolin
On Wed, Nov 12, 2025 at 06:25:03PM +0000, Suravee Suthikulpanit wrote:
> @@ -38,10 +40,42 @@ size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type vi
> int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent,
> const struct iommu_user_data *user_data)
> {
> + unsigned long flags;
> struct protection_domain *pdom = to_pdomain(parent);
> struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
>
> + xa_init(&aviommu->gdomid_array);
Perhaps init with XA_FLAGS_ALLOC1 since domid can't be 0?
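
i.e. something like (sketch):

	xa_init_flags(&aviommu->gdomid_array, XA_FLAGS_ALLOC1);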
> +static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu)
> +{
> + unsigned long flags;
> + struct amd_iommu_viommu *entry, *next;
> + struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
> + struct protection_domain *pdom = aviommu->parent;
> +
> + spin_lock_irqsave(&pdom->lock, flags);
> + list_for_each_entry_safe(entry, next, &pdom->viommu_list, pdom_list) {
> + if (entry == aviommu)
> + list_del(&entry->pdom_list);
> + }
Do we really need the loop? Why not simply do list_del()?
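
e.g. just (sketch):

	spin_lock_irqsave(&pdom->lock, flags);
	list_del(&aviommu->pdom_list);
	spin_unlock_irqrestore(&pdom->lock, flags);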
> + spin_unlock_irqrestore(&pdom->lock, flags);
> +
> +}
No need for the extra blank line at the end of the function.
> @@ -92,7 +94,60 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
> ndom->domain.type = IOMMU_DOMAIN_NESTED;
> ndom->viommu = aviommu;
>
> + gdom_info = kzalloc(sizeof(*gdom_info), GFP_KERNEL);
> + if (!gdom_info)
> + goto out_err;
Missing:
ret = -ENOMEM;
> +
> + /*
> + * Normally, when a guest has multiple pass-through devices,
> + * the IOMMU driver setup DTEs with the same stage-2 table and
> + * use the same host domain ID (hDomId). In case of nested translation,
> + * if the guest setup different stage-1 tables with same PASID,
> + * IOMMU would use the same TLB tag. This will results in TLB
> + * aliasing issue.
> + *
> + * The guest is assigning gDomIDs based on its own algorithm for managing
> + * cache tags of (DomID, PASID). Within a single viommu, the nest parent domain
> + * (w/ S2 table) is used by all DTEs. But we need to consistently map the gDomID
> + * to a single hDomID. This is done using an xarray in the vIOMMU to
> + * keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES
> + * command must be issued for each hDomID in the xarray.
> + */
> + curr = xa_cmpxchg(&aviommu->gdomid_array,
> + ndom->gdom_id, NULL, gdom_info, GFP_ATOMIC);
> + if (curr) {
> + if (xa_err(curr)) {
> + ret = -EINVAL;
> + goto out_err_gdom_info;
> + } else {
> + /* The gDomID already exist */
> + pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n",
> + __func__, ndom->gdom_id, curr->hdom_id);
> + refcount_inc(&curr->users);
> + ndom->gdom_info = curr;
This looks racy..
When a gDomID is shared between two nested domains, a concurrent
nested_domain_free() could enter before refcount_inc(), and call
refcount_dec_and_test() or even free the curr and ndom.
Then, this refcount_inc() will blow up, or curr/ndom will UAF.
Actually, I don't see where amd_iommu_alloc_domain_nested() gets
used in this series.. I assume AMD will use the iommufd's vIOMMU
infrastructure directly which doesn't mutex across nested domain
allocation/free calls.
So, the entire thing here should hold xa_lock(), use xas_load()
for the existing curr and use xas_store() to store gdom_info if
!curr, and xa_unlock() after gdom_info is fully initialized.
> + kfree(gdom_info);
> + return &ndom->domain;
> + }
> + }
> +
> + /* The gDomID does not exist. We allocate new hdom_id */
> + gdom_info->hdom_id = amd_iommu_pdom_id_alloc();
> + if (gdom_info->hdom_id <= 0) {
> + xa_cmpxchg(&aviommu->gdomid_array,
> + ndom->gdom_id, gdom_info, NULL, GFP_ATOMIC);
> + ret = -ENOSPC;
> + goto out_err_gdom_info;
> + }
> +
> + refcount_set(&gdom_info->users, 1);
Similar risk here. gdom_info is stored to the xarray before this
line. A concurrent amd_iommu_alloc_domain_nested() could get the
stored gdom_info and blow up at refcount_inc().
Make sure the entire thing is locked and safe.
Nicolin
On Thu, Nov 13, 2025 at 12:36:07PM -0800, Nicolin Chen wrote:
> > + curr = xa_cmpxchg(&aviommu->gdomid_array,
> > + ndom->gdom_id, NULL, gdom_info, GFP_ATOMIC);
> > + if (curr) {
> > + if (xa_err(curr)) {
> > + ret = -EINVAL;
> > + goto out_err_gdom_info;
> > + } else {
> > + /* The gDomID already exist */
> > + pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n",
> > + __func__, ndom->gdom_id, curr->hdom_id);
> > + refcount_inc(&curr->users);
> > + ndom->gdom_info = curr;
>
> This looks racy..
Yes
> When a gDomID is shared between two nested domains, a concurrent
> nested_domain_free() could enter before refcount_inc(), and call
> refcount_dec_and_test() or even free the curr and ndom.
>
> Then, this refcount_inc() will blow up, or curr/ndom will UAF.
>
> Actually, I don't see where amd_iommu_alloc_domain_nested() gets
> used in this series.. I assume AMD will use the iommufd's vIOMMU
> infrastructure directly which doesn't mutex across nested domain
> allocation/free calls.
>
> So, the entire thing here should hold xa_lock(), use xas_load()
> for the existing curr and use xas_store() to store gdom_info if
> !curr, and xa_unlock() after gdom_info is fully initialized.
No need for xas functions.. You can use the __ functions..
A helper function like this will do the job:
static void *xa_load_or_alloc_locked(struct xarray *xa, unsigned long index, size_t sz)
{
void *elm, *res;
elm = xa_load(xa, index);
if (elm)
return elm;
xa_unlock(xa);
elm = kzalloc(sz, GFP_KERNEL);
xa_lock(xa);
if (!elm)
return ERR_PTR(-ENOMEM);
res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
if (xa_is_err(res))
res = ERR_PTR(xa_err(res));
if (res) {
kfree(elm);
return res;
}
return elm;
}
Call like
xa_lock(&aviommu->gdomid_array);
elm = xa_load_or_alloc_locked(..);
if (IS_ERR(elm))
..
elm->refcount++;
xa_unlock(&aviommu->gdomid_array);
Needs more bits if you want to use refcount_t
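
e.g. the elm->refcount++ above would become something like (still under
the xa_lock, sketch):

	if (!refcount_read(&elm->users))
		refcount_set(&elm->users, 1);	/* fresh entry from kzalloc() */
	else
		refcount_inc(&elm->users);	/* existing gDomID */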
Jason