[PATCH v3 15/15] iommu/amd: Add support for nested domain attach/detach

Suravee Suthikulpanit posted 15 patches 2 months, 1 week ago
There is a newer version of this series
[PATCH v3 15/15] iommu/amd: Add support for nested domain attach/detach
Posted by Suravee Suthikulpanit 2 months, 1 week ago
Introduce set_dte_nested() to program guest translation settings in
the host DTE when attaches the nested domain to a device.

In addition, introduce struct amd_iommu_viommu, which stores reference to
the nest parent domain assigned during the call to struct
iommu_ops.viommu_init(). Information in the nest parent domain is needed
when setting up the DTE for nested translation.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 drivers/iommu/amd/amd_iommu.h       |  3 ++
 drivers/iommu/amd/amd_iommu_types.h |  8 ++++
 drivers/iommu/amd/iommu.c           |  8 ++--
 drivers/iommu/amd/nested.c          | 66 +++++++++++++++++++++++++++++
 4 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index cfb63de7732a..98351b0cb9a0 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -190,6 +190,9 @@ struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid);
 int amd_iommu_completion_wait(struct amd_iommu *iommu);
 
 /* DTE */
+void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
+			  struct protection_domain *domain, u16 domid,
+			  struct dev_table_entry *new);
 int amd_iommu_device_flush_dte(struct iommu_dev_data *dev_data);
 void amd_iommu_update_dte256(struct amd_iommu *iommu,
 			     struct iommu_dev_data *dev_data,
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index a68b5c2fc0a2..683ee288c636 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -17,6 +17,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
+#include <linux/iommufd.h>
 #include <linux/irqreturn.h>
 #include <linux/io-pgtable.h>
 
@@ -420,6 +421,7 @@
 #define DTE_FLAG_HAD	(3ULL << 7)
 #define DTE_MODE_MASK	GENMASK_ULL(11, 9)
 #define DTE_HOST_TRP	GENMASK_ULL(51, 12)
+#define DTE_FLAG_PPR	BIT_ULL(52)
 #define DTE_FLAG_GIOV	BIT_ULL(54)
 #define DTE_FLAG_GV	BIT_ULL(55)
 #define DTE_GLX		GENMASK_ULL(57, 56)
@@ -590,6 +592,11 @@ struct pdom_iommu_info {
 	u32 refcnt;	/* Count of attached dev/pasid per domain/IOMMU */
 };
 
+struct amd_iommu_viommu {
+	struct iommufd_viommu core;
+	struct protection_domain *parent; /* nest parent domain for this viommu */
+};
+
 /*
  * Structure defining one entry in the device table
  */
@@ -607,6 +614,7 @@ struct nested_domain {
 	struct iommu_domain domain; /* generic domain handle used by iommu core code */
 	u16 id;	                    /* the domain id written to the device table */
 	struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */
+	struct amd_iommu_viommu *viommu;  /* AMD hw-viommu this nested domain belong to */
 };
 
 /*
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2a536d02aeab..013914fc8a4f 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2044,9 +2044,9 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
 		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
 }
 
-static void set_dte_v1(struct iommu_dev_data *dev_data,
-		       struct protection_domain *domain, u16 domid,
-		       struct dev_table_entry *new)
+void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
+			  struct protection_domain *domain, u16 domid,
+			  struct dev_table_entry *new)
 {
 	/*
 	 * When SNP is enabled, we can only support TV=1 with non-zero domain ID.
@@ -2089,7 +2089,7 @@ static void set_dte_entry(struct amd_iommu *iommu,
 
 	old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK;
 
-	set_dte_v1(dev_data, domain, domid, &new);
+	amd_iommu_set_dte_v1(dev_data, domain, domid, &new);
 	set_dte_gcr3_table(iommu, dev_data, &new);
 
 	if (dev_data->ppr)
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
index 3307c925d3c1..ca3d3001c87f 100644
--- a/drivers/iommu/amd/nested.c
+++ b/drivers/iommu/amd/nested.c
@@ -63,6 +63,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 			      const struct iommu_user_data *user_data)
 {
 	int ret;
+	struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
 	struct iommu_hwpt_amd_guest gdte;
 	struct nested_domain *ndom;
 
@@ -85,6 +86,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 
 	ndom->domain.ops = &nested_domain_ops;
 	ndom->domain.type = IOMMU_DOMAIN_NESTED;
+	ndom->viommu = aviommu;
 	memcpy(&ndom->gdte, &gdte, sizeof(gdte));
 
 	/*
@@ -111,6 +113,69 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 	return ERR_PTR(ret);
 }
 
+static void set_dte_nested(struct amd_iommu *iommu,
+			   struct iommu_domain *dom,
+			   struct iommu_dev_data *dev_data)
+{
+	struct protection_domain *parent;
+	struct dev_table_entry new = {0};
+	struct nested_domain *ndom = to_ndomain(dom);
+	struct iommu_hwpt_amd_guest *gdte = &ndom->gdte;
+
+	/*
+	 * The nest parent domain is attached during the call to the
+	 * struct iommu_ops.viommu_init(), which will be stored as part
+	 * of the struct amd_iommu_viommu.parent.
+	 */
+	if (WARN_ON(!ndom->viommu || !ndom->viommu->parent))
+		return;
+
+	parent = ndom->viommu->parent;
+	amd_iommu_make_clear_dte(dev_data, &new);
+
+	/*
+	 * Use nested domain ID to program DTE.
+	 * See amd_iommu_alloc_domain_nested().
+	 */
+	amd_iommu_set_dte_v1(dev_data, parent, ndom->id, &new);
+
+	/* Guest PPR */
+	new.data[0] |= gdte->dte[0] & DTE_FLAG_PPR;
+
+	/* Guest translation stuff */
+	new.data[0] |= gdte->dte[0] & (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV);
+
+	/* GCR3 table */
+	new.data[0] |= gdte->dte[0] & DTE_GCR3_14_12;
+	new.data[1] |= gdte->dte[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31);
+
+	/* Guest paging mode */
+	new.data[2] |= gdte->dte[2] & DTE_GPT_LEVEL_MASK;
+
+	amd_iommu_update_dte256(iommu, dev_data, &new);
+}
+
+static int nested_attach_device(struct iommu_domain *dom, struct device *dev)
+{
+	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
+	int ret = 0;
+
+	if (WARN_ON(dom->type != IOMMU_DOMAIN_NESTED))
+		return -EINVAL;
+
+	mutex_lock(&dev_data->mutex);
+
+	/* Update device table entry */
+	set_dte_nested(iommu, dom, dev_data);
+	amd_iommu_device_flush_dte(dev_data);
+	amd_iommu_completion_wait(iommu);
+
+	mutex_unlock(&dev_data->mutex);
+
+	return ret;
+}
+
 static void nested_domain_free(struct iommu_domain *dom)
 {
 	struct nested_domain *ndom = to_ndomain(dom);
@@ -120,6 +185,7 @@ static void nested_domain_free(struct iommu_domain *dom)
 }
 
 static const struct iommu_domain_ops nested_domain_ops = {
+	.attach_dev = nested_attach_device,
 	.free = nested_domain_free,
 };
 
-- 
2.34.1
Re: [PATCH v3 15/15] iommu/amd: Add support for nested domain attach/detach
Posted by Jason Gunthorpe 2 months, 1 week ago
On Thu, Oct 09, 2025 at 11:57:55PM +0000, Suravee Suthikulpanit wrote:
> Introduce set_dte_nested() to program guest translation settings in
> the host DTE when attaches the nested domain to a device.
> 
> In addition, introduce struct amd_iommu_viommu, which stores reference to
> the nest parent domain assigned during the call to struct
> iommu_ops.viommu_init(). Information in the nest parent domain is needed
> when setting up the DTE for nested translation.
> 
> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> ---
>  drivers/iommu/amd/amd_iommu.h       |  3 ++
>  drivers/iommu/amd/amd_iommu_types.h |  8 ++++
>  drivers/iommu/amd/iommu.c           |  8 ++--
>  drivers/iommu/amd/nested.c          | 66 +++++++++++++++++++++++++++++
>  4 files changed, 81 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
> index cfb63de7732a..98351b0cb9a0 100644
> --- a/drivers/iommu/amd/amd_iommu.h
> +++ b/drivers/iommu/amd/amd_iommu.h
> @@ -190,6 +190,9 @@ struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid);
>  int amd_iommu_completion_wait(struct amd_iommu *iommu);
>  
>  /* DTE */
> +void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
> +			  struct protection_domain *domain, u16 domid,
> +			  struct dev_table_entry *new);
>  int amd_iommu_device_flush_dte(struct iommu_dev_data *dev_data);
>  void amd_iommu_update_dte256(struct amd_iommu *iommu,
>  			     struct iommu_dev_data *dev_data,
> diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
> index a68b5c2fc0a2..683ee288c636 100644
> --- a/drivers/iommu/amd/amd_iommu_types.h
> +++ b/drivers/iommu/amd/amd_iommu_types.h
> @@ -17,6 +17,7 @@
>  #include <linux/list.h>
>  #include <linux/spinlock.h>
>  #include <linux/pci.h>
> +#include <linux/iommufd.h>
>  #include <linux/irqreturn.h>
>  #include <linux/io-pgtable.h>
>  
> @@ -420,6 +421,7 @@
>  #define DTE_FLAG_HAD	(3ULL << 7)
>  #define DTE_MODE_MASK	GENMASK_ULL(11, 9)
>  #define DTE_HOST_TRP	GENMASK_ULL(51, 12)
> +#define DTE_FLAG_PPR	BIT_ULL(52)
>  #define DTE_FLAG_GIOV	BIT_ULL(54)
>  #define DTE_FLAG_GV	BIT_ULL(55)
>  #define DTE_GLX		GENMASK_ULL(57, 56)
> @@ -590,6 +592,11 @@ struct pdom_iommu_info {
>  	u32 refcnt;	/* Count of attached dev/pasid per domain/IOMMU */
>  };
>  
> +struct amd_iommu_viommu {
> +	struct iommufd_viommu core;
> +	struct protection_domain *parent; /* nest parent domain for this viommu */
> +};

This alone is not enough, the core code needs to allocate this
memory too. Make adding the viommu to be its own patch before adding
allocating the nested domain and move these hunks:

> @@ -607,6 +614,7 @@ struct nested_domain {
>  	struct iommu_domain domain; /* generic domain handle used by iommu core code */
>  	u16 id;	                    /* the domain id written to the device table */
>  	struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */
> +	struct amd_iommu_viommu *viommu;  /* AMD hw-viommu this nested domain belong to */

Into the nested domain allocation patch.

> @@ -2044,9 +2044,9 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
>  		target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
>  }
>  
> -static void set_dte_v1(struct iommu_dev_data *dev_data,
> -		       struct protection_domain *domain, u16 domid,
> -		       struct dev_table_entry *new)
> +void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
> +			  struct protection_domain *domain, u16 domid,
> +			  struct dev_table_entry *new)
>  {

Give the function the right name in the patch that adds it?

> diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
> index 3307c925d3c1..ca3d3001c87f 100644
> --- a/drivers/iommu/amd/nested.c
> +++ b/drivers/iommu/amd/nested.c
> @@ -63,6 +63,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
>  			      const struct iommu_user_data *user_data)
>  {
>  	int ret;
> +	struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
>  	struct iommu_hwpt_amd_guest gdte;
>  	struct nested_domain *ndom;
>  
> @@ -85,6 +86,7 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
>  
>  	ndom->domain.ops = &nested_domain_ops;
>  	ndom->domain.type = IOMMU_DOMAIN_NESTED;
> +	ndom->viommu = aviommu;
>  	memcpy(&ndom->gdte, &gdte, sizeof(gdte));

These hunks to the nesting domain allocation patch

> +static void set_dte_nested(struct amd_iommu *iommu,
> +			   struct iommu_domain *dom,
> +			   struct iommu_dev_data *dev_data)
> +{
> +	struct protection_domain *parent;
> +	struct dev_table_entry new = {0};
> +	struct nested_domain *ndom = to_ndomain(dom);
> +	struct iommu_hwpt_amd_guest *gdte = &ndom->gdte;
> +
> +	/*
> +	 * The nest parent domain is attached during the call to the
> +	 * struct iommu_ops.viommu_init(), which will be stored as part
> +	 * of the struct amd_iommu_viommu.parent.
> +	 */
> +	if (WARN_ON(!ndom->viommu || !ndom->viommu->parent))
> +		return;
> +
> +	parent = ndom->viommu->parent;
> +	amd_iommu_make_clear_dte(dev_data, &new);
> +
> +	/*
> +	 * Use nested domain ID to program DTE.
> +	 * See amd_iommu_alloc_domain_nested().
> +	 */
> +	amd_iommu_set_dte_v1(dev_data, parent, ndom->id, &new);
> +
> +	/* Guest PPR */
> +	new.data[0] |= gdte->dte[0] & DTE_FLAG_PPR;
> +
> +	/* Guest translation stuff */
> +	new.data[0] |= gdte->dte[0] & (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV);
> +
> +	/* GCR3 table */
> +	new.data[0] |= gdte->dte[0] & DTE_GCR3_14_12;
> +	new.data[1] |= gdte->dte[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31);
> +
> +	/* Guest paging mode */
> +	new.data[2] |= gdte->dte[2] & DTE_GPT_LEVEL_MASK;
> +
> +	amd_iommu_update_dte256(iommu, dev_data, &new);
> +}

This looks good

> +static int nested_attach_device(struct iommu_domain *dom, struct device *dev)
> +{
> +	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
> +	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
> +	int ret = 0;
> +
> +	if (WARN_ON(dom->type != IOMMU_DOMAIN_NESTED))
> +		return -EINVAL;
> +
> +	mutex_lock(&dev_data->mutex);
> +
> +	/* Update device table entry */
> +	set_dte_nested(iommu, dom, dev_data);
> +	amd_iommu_device_flush_dte(dev_data);
> +	amd_iommu_completion_wait(iommu);
> +
> +	mutex_unlock(&dev_data->mutex);

Where does the code record the ndom->id to push invalidates when the
S2 is changed? Seems like an important thing to be missing!

Shouldn't all this attach related stuff be in here too??

        ret = pdom_attach_iommu(iommu, domain);
        dev_data->domain = domain;

        spin_lock_irqsave(&domain->lock, flags);
        list_add(&dev_data->list, &domain->dev_list);
        spin_unlock_irqrestore(&domain->lock, flags);

At a bare minimum if the series is going to stop here then it must
also do correct invalidation for any S2 changes.

Given that, I'd suggest to also fix the domain id's with the xarray so
you don't have to redo the invalidation logic.

Jason
Re: [PATCH v3 15/15] iommu/amd: Add support for nested domain attach/detach
Posted by Suthikulpanit, Suravee 1 month, 4 weeks ago

On 10/10/2025 6:20 PM, Jason Gunthorpe wrote:
> On Thu, Oct 09, 2025 at 11:57:55PM +0000, Suravee Suthikulpanit wrote:
>> Introduce set_dte_nested() to program guest translation settings in
>> the host DTE when attaches the nested domain to a device.
>> .....
>>
>> +static int nested_attach_device(struct iommu_domain *dom, struct device *dev)
>> +{
>> +	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
>> +	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
>> +	int ret = 0;
>> +
>> +	if (WARN_ON(dom->type != IOMMU_DOMAIN_NESTED))
>> +		return -EINVAL;
>> +
>> +	mutex_lock(&dev_data->mutex);
>> +
>> +	/* Update device table entry */
>> +	set_dte_nested(iommu, dom, dev_data);
>> +	amd_iommu_device_flush_dte(dev_data);
>> +	amd_iommu_completion_wait(iommu);
>> +
>> +	mutex_unlock(&dev_data->mutex);
> 
> Where does the code record the ndom->id to push invalidates when the
> S2 is changed? Seems like an important thing to be missing!
> 
> Shouldn't all this attach related stuff be in here too??
> 
>          ret = pdom_attach_iommu(iommu, domain);
>          dev_data->domain = domain;
> 
>          spin_lock_irqsave(&domain->lock, flags);
>          list_add(&dev_data->list, &domain->dev_list);
>          spin_unlock_irqrestore(&domain->lock, flags);
> 
> At a bare minimum if the series is going to stop here then it must
> also do correct invalidation for any S2 changes.
> 
> Given that, I'd suggest to also fix the domain id's with the xarray so
> you don't have to redo the invalidation logic.
> 
> Jason

I am reworking this series to include S2 flushing, and will be sending 
out v4.

Thanks,
Suravee