Introduce set_dte_nested() to program guest translation settings in
the host DTE when attaching the nested domain to a device.
Also, enable the GCR3TRPMode feature when supported.
Note that nested translation is only supported with the GCR3TRP mode.
When it is enabled, the AMD IOMMU driver programs the GCR3 Table Root
Pointer field of the device table entry with the GPA provided by the guest.
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
drivers/iommu/amd/amd_iommu.h | 2 +
drivers/iommu/amd/amd_iommu_types.h | 3 +
drivers/iommu/amd/init.c | 3 +
drivers/iommu/amd/nested.c | 111 +++++++++++++++++++++++++++-
4 files changed, 116 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index cc1f14899dfe..924152973d11 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -190,6 +190,8 @@ struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid);
int amd_iommu_completion_wait(struct amd_iommu *iommu);
+extern bool amd_iommu_snp_en;
+
/* DTE */
int amd_iommu_device_flush_dte(struct iommu_dev_data *dev_data);
void amd_iommu_update_dte256(struct amd_iommu *iommu,
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index ba27fad77b57..9bc2e0e18978 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -188,6 +188,7 @@
#define CONTROL_EPH_EN 45
#define CONTROL_XT_EN 50
#define CONTROL_INTCAPXT_EN 51
+#define CONTROL_GCR3TRPMODE 58
#define CONTROL_IRTCACHEDIS 59
#define CONTROL_SNPAVIC_EN 61
@@ -417,6 +418,8 @@
#define DTE_FLAG_V BIT_ULL(0)
#define DTE_FLAG_TV BIT_ULL(1)
#define DTE_FLAG_HAD (3ULL << 7)
+#define DTE_MODE_MASK GENMASK_ULL(11, 9)
+#define DTE_FLAG_PPR BIT_ULL(52)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
#define DTE_GLX GENMASK_ULL(57, 56)
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index f2991c11867c..c45a4bd89569 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -1122,6 +1122,9 @@ static void iommu_enable_gt(struct amd_iommu *iommu)
return;
iommu_feature_enable(iommu, CONTROL_GT_EN);
+
+ if (check_feature2(FEATURE_GCR3TRPMODE))
+ iommu_feature_enable(iommu, CONTROL_GCR3TRPMODE);
}
/* sets a specific bit in the device table entry. */
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
index 11a0237174bb..5a0c369ba283 100644
--- a/drivers/iommu/amd/nested.c
+++ b/drivers/iommu/amd/nested.c
@@ -11,9 +11,7 @@
#include "amd_iommu.h"
#include "amd_iommu_types.h"
-static const struct iommu_domain_ops nested_domain_ops = {
- .free = amd_iommu_domain_free,
-};
+static const struct iommu_domain_ops nested_domain_ops;
static inline struct nested_domain *to_ndomain(struct iommu_domain *dom)
{
@@ -65,3 +63,110 @@ amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
kfree(ndom);
return ERR_PTR(ret);
}
+
+static void set_dte_nested(struct amd_iommu *iommu,
+ struct dev_table_entry *gdte,
+ struct nested_domain *ndom,
+ struct iommu_dev_data *dev_data)
+{
+ struct dev_table_entry *initial_dte;
+ struct dev_table_entry new = {0};
+ struct protection_domain *pdom = dev_data->parent;
+
+ if (WARN_ON(!ndom || !pdom || (pdom->iop.mode == PAGE_MODE_NONE)))
+ return;
+
+ amd_iommu_make_clear_dte(dev_data, &new);
+
+ new.data[0] |= iommu_virt_to_phys(pdom->iop.root);
+ new.data[0] |= FIELD_PREP(DTE_MODE_MASK, pdom->iop.mode);
+ new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
+ new.data[0] |= (DTE_FLAG_PPR & gdte->data[0]);
+
+ if (pdom->dirty_tracking)
+ new.data[0] |= DTE_FLAG_HAD;
+
+ if (dev_data->ats_enabled)
+ new.data[1] |= DTE_FLAG_IOTLB;
+
+ /*
+ * Use nested domain ID to program DTE.
+ * See amd_iommu_alloc_domain_nested().
+ */
+ new.data[1] |= ndom->id;
+
+ /*
+ * Restore cached persistent DTE bits, which can be set by information
+ * in IVRS table. See set_dev_entry_from_acpi().
+ */
+ initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid);
+ if (initial_dte) {
+ new.data128[0] |= initial_dte->data128[0];
+ new.data128[1] |= initial_dte->data128[1];
+ }
+
+ /* Guest translation stuff */
+ new.data[0] |= (gdte->data[0] &
+ (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV));
+
+ /* GCR3 table */
+ new.data[0] |= (gdte->data[0] & DTE_GCR3_14_12);
+ new.data[1] |= (gdte->data[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31));
+
+ /* Guest paging mode */
+ new.data[2] |= (gdte->data[2] & DTE_GPT_LEVEL_MASK);
+
+ amd_iommu_update_dte256(iommu, dev_data, &new);
+}
+
+static int nested_attach_device(struct iommu_domain *dom, struct device *dev)
+{
+ struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+ struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
+ struct nested_domain *ndom = to_ndomain(dom);
+ struct dev_table_entry *gdte = &ndom->guest_dte;
+ int ret = 0;
+
+ if (dev_data->ndom == ndom)
+ return ret;
+
+ if (!dev_is_pci(dev))
+ return -EINVAL;
+
+ /* Currently only support GCR3TRPMode with nested translation */
+ if (!check_feature2(FEATURE_GCR3TRPMODE))
+ return -EOPNOTSUPP;
+
+ /* We need to check host capability before setting the mode */
+ if ((FIELD_GET(DTE_GPT_LEVEL_MASK, gdte->data[2]) == GUEST_PGTABLE_5_LEVEL) &&
+ (amd_iommu_gpt_level < PAGE_MODE_5_LEVEL))
+ return -EOPNOTSUPP;
+
+ WARN_ON(dev_data->ndom);
+
+ dev_data->ndom = ndom;
+
+ mutex_lock(&dev_data->mutex);
+
+ /* Update device table entry */
+ set_dte_nested(iommu, gdte, ndom, dev_data);
+ amd_iommu_device_flush_dte(dev_data);
+ amd_iommu_completion_wait(iommu);
+
+ mutex_unlock(&dev_data->mutex);
+
+ return ret;
+}
+
+static void nested_domain_free(struct iommu_domain *dom)
+{
+ struct nested_domain *ndom = to_ndomain(dom);
+
+ amd_iommu_pdom_id_free(ndom->id);
+ kfree(ndom);
+}
+
+static const struct iommu_domain_ops nested_domain_ops = {
+ .attach_dev = nested_attach_device,
+ .free = nested_domain_free,
+};
--
2.34.1
On Wed, Oct 01, 2025 at 06:09:53AM +0000, Suravee Suthikulpanit wrote:
> +static void set_dte_nested(struct amd_iommu *iommu,
> + struct dev_table_entry *gdte,
> + struct nested_domain *ndom,
> + struct iommu_dev_data *dev_data)
> +{
> + struct dev_table_entry *initial_dte;
> + struct dev_table_entry new = {0};
> + struct protection_domain *pdom = dev_data->parent;
No, this is ndom->parent.
The parent is NOT required to be attached to the device already.
> + if (WARN_ON(!ndom || !pdom || (pdom->iop.mode == PAGE_MODE_NONE)))
> + return;
> +
> + amd_iommu_make_clear_dte(dev_data, &new);
> +
> + new.data[0] |= iommu_virt_to_phys(pdom->iop.root);
> + new.data[0] |= FIELD_PREP(DTE_MODE_MASK, pdom->iop.mode);
> + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
> + new.data[0] |= (DTE_FLAG_PPR & gdte->data[0]);
> + if (pdom->dirty_tracking)
> + new.data[0] |= DTE_FLAG_HAD;
> +
> + if (dev_data->ats_enabled)
> + new.data[1] |= DTE_FLAG_IOTLB;
This sequence should be in some set_dte_gcr3() ??
> + /*
> + * Restore cached persistent DTE bits, which can be set by information
> + * in IVRS table. See set_dev_entry_from_acpi().
> + */
> + initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid);
> + if (initial_dte) {
> + new.data128[0] |= initial_dte->data128[0];
> + new.data128[1] |= initial_dte->data128[1];
> + }
This should go into amd_iommu_make_clear_dte() I think, and refactor
it out of iommu_update_dte256() ?
Every created DTE needs these bits set, right?
> +
> + /* Guest translation stuff */
> + new.data[0] |= (gdte->data[0] &
> + (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV));
> +
> + /* GCR3 table */
> + new.data[0] |= (gdte->data[0] & DTE_GCR3_14_12);
> + new.data[1] |= (gdte->data[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31));
> +
> + /* Guest paging mode */
> + new.data[2] |= (gdte->data[2] & DTE_GPT_LEVEL_MASK);
I didn't see anything validating gdte has only permitted set bits in
the prior patch?
If this is going to decode array item by item then why not use struct
iommu_hwpt_amd_guest in the nested_domain ?
> +static int nested_attach_device(struct iommu_domain *dom, struct device *dev)
> +{
> + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
> + struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
> + struct nested_domain *ndom = to_ndomain(dom);
> + struct dev_table_entry *gdte = &ndom->guest_dte;
> + int ret = 0;
> +
> + if (dev_data->ndom == ndom)
> + return ret;
> +
> + if (!dev_is_pci(dev))
> + return -EINVAL;
Why?
> + /* Currently only support GCR3TRPMode with nested translation */
> + if (!check_feature2(FEATURE_GCR3TRPMODE))
> + return -EOPNOTSUPP;
This is impossible since we can't allocate a nest parent. If you want
to make a redundant check then call is_nest_parent_supported()
> + /* We need to check host capability before setting the mode */
> + if ((FIELD_GET(DTE_GPT_LEVEL_MASK, gdte->data[2]) == GUEST_PGTABLE_5_LEVEL) &&
> + (amd_iommu_gpt_level < PAGE_MODE_5_LEVEL))
> + return -EOPNOTSUPP;
I wonder if this should be done during alloc
> + WARN_ON(dev_data->ndom);
> +
> + dev_data->ndom = ndom;
Useless?
> + mutex_lock(&dev_data->mutex);
> +
> + /* Update device table entry */
> + set_dte_nested(iommu, gdte, ndom, dev_data);
> + amd_iommu_device_flush_dte(dev_data);
> + amd_iommu_completion_wait(iommu);
Hurray
Jason
On 10/6/2025 9:59 AM, Jason Gunthorpe wrote:
> On Wed, Oct 01, 2025 at 06:09:53AM +0000, Suravee Suthikulpanit wrote:
>> ....
>> + if (WARN_ON(!ndom || !pdom || (pdom->iop.mode == PAGE_MODE_NONE)))
>> + return;
>> +
>> + amd_iommu_make_clear_dte(dev_data, &new);
>> +
>> + new.data[0] |= iommu_virt_to_phys(pdom->iop.root);
>> + new.data[0] |= FIELD_PREP(DTE_MODE_MASK, pdom->iop.mode);
>> + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
>> + new.data[0] |= (DTE_FLAG_PPR & gdte->data[0]);
>
>> + if (pdom->dirty_tracking)
>> + new.data[0] |= DTE_FLAG_HAD;
>> +
>> + if (dev_data->ats_enabled)
>> + new.data[1] |= DTE_FLAG_IOTLB;
>
> This sequence should be in some set_dte_gcr3() ??
Not sure what you mean. This logic was in set_dte_entry(), and
duplicated here in set_dte_nested() since we are no longer calling
set_dte_entry() from the nested path. Also, it's not really related to
GCR3 table.
>> + /*
>> + * Restore cached persistent DTE bits, which can be set by information
>> + * in IVRS table. See set_dev_entry_from_acpi().
>> + */
>> + initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid);
>> + if (initial_dte) {
>> + new.data128[0] |= initial_dte->data128[0];
>> + new.data128[1] |= initial_dte->data128[1];
>> + }
>
> This should go into amd_iommu_make_clear_dte() I think, and refactor
> it out of iommu_update_dte256() ?
> Every created DTE needs these bits set, right?
Currently, the amd_iommu_make_clear_dte() clears all the DTE bits and
set the DTE[V] (valid) bit. This is used when preparing the DTE for
programming, detaching domain, and when setting the blocking domain.
Putting this logic in the function would change the behavior.
These bits affect Interrupt remapping (Lint0/Lint1/NMI/INIT/ExtInt
interrupt pass-through) and System management message behavior. It
should be okay to set these up for the specified devices in the current
implementation.
>> +
>> + /* Guest translation stuff */
>> + new.data[0] |= (gdte->data[0] &
>> + (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV));
>> +
>> + /* GCR3 table */
>> + new.data[0] |= (gdte->data[0] & DTE_GCR3_14_12);
>> + new.data[1] |= (gdte->data[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31));
>> +
>> + /* Guest paging mode */
>> + new.data[2] |= (gdte->data[2] & DTE_GPT_LEVEL_MASK);
>
> I didn't see anything validating gdte has only permitted set bits in
> the prior patch?
Not sure which one you are referring to.
> If this is goint to decode array item by item then why not use struct
> iommu_hwpt_amd_guest in the nested_domain ?
The struct dev_table_entry *gdte is basically the same information as in
the struct iommu_hwpt_amd_guest.dte that we copied from the userspace
into the more appropriate in-kernel data structure type, which is used
within the driver.
Here, we just select only what we needed for configuring guest page
table specifically to be programmed onto the host DTE.
Thanks,
Suravee
On Tue, Oct 07, 2025 at 02:22:30PM -0500, Suthikulpanit, Suravee wrote:
>
>
> On 10/6/2025 9:59 AM, Jason Gunthorpe wrote:
> > On Wed, Oct 01, 2025 at 06:09:53AM +0000, Suravee Suthikulpanit wrote:
> > > ....
> > > + if (WARN_ON(!ndom || !pdom || (pdom->iop.mode == PAGE_MODE_NONE)))
> > > + return;
> > > +
> > > + amd_iommu_make_clear_dte(dev_data, &new);
> > > +
> > > + new.data[0] |= iommu_virt_to_phys(pdom->iop.root);
> > > + new.data[0] |= FIELD_PREP(DTE_MODE_MASK, pdom->iop.mode);
> > > + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
> > > + new.data[0] |= (DTE_FLAG_PPR & gdte->data[0]);
> >
> > > + if (pdom->dirty_tracking)
> > > + new.data[0] |= DTE_FLAG_HAD;
> > > +
> > > + if (dev_data->ats_enabled)
> > > + new.data[1] |= DTE_FLAG_IOTLB;
> >
> > This sequence should be in some set_dte_gcr3() ??
>
> Not sure what you mean. This logic was in set_dte_entry(), and duplicated
> here in the set_dte_nested() since we no longer calling set_dte_entry() from
> the nested path. Also, it's not really related to GCR3 table.
Sorry some make_ste_v1()
This logic should be the same for any v1 page table.
> > > + /*
> > > + * Restore cached persistent DTE bits, which can be set by information
> > > + * in IVRS table. See set_dev_entry_from_acpi().
> > > + */
> > > + initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid);
> > > + if (initial_dte) {
> > > + new.data128[0] |= initial_dte->data128[0];
> > > + new.data128[1] |= initial_dte->data128[1];
> > > + }
> >
> > This should go into amd_iommu_make_clear_dte() I think, and refactor
> > it out of iommu_update_dte256() ?
> > Every created DTE needs these bits set, right?
>
> Currently, the amd_iommu_make_clear_dte() clears all the DTE bits and set
> the DTE[V] (valid) bit. This is used when preparing the DTE for programming,
> detaching domain, and when setting the blocking domain. Putting this logic
> in the function would change the behavior.
Yes, but it seems like that is the right behavior?
> These bits affect Interrupt remapping (Lint0/Lint1/NMI/INIT/ExtInt interrupt
> pass-through) and System management message behavior.
Because these bits shouldn't be cleared on a blocking domain!
Interrupts are still expected to work.
> > > +
> > > + /* Guest translation stuff */
> > > + new.data[0] |= (gdte->data[0] &
> > > + (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV));
> > > +
> > > + /* GCR3 table */
> > > + new.data[0] |= (gdte->data[0] & DTE_GCR3_14_12);
> > > + new.data[1] |= (gdte->data[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31));
> > > +
> > > + /* Guest paging mode */
> > > + new.data[2] |= (gdte->data[2] & DTE_GPT_LEVEL_MASK);
> >
> > I didn't see anything validating gdte has only permitted set bits in
> > the prior patch?
>
> Not sure which on are you referring to.
You have to validate gdte has only valid bits.
> > If this is goint to decode array item by item then why not use struct
> > iommu_hwpt_amd_guest in the nested_domain ?
>
> The struct dev_table_entry *gdte is basically the same information as in the
> struct iommu_hwpt_amd_guest.dte that we copied from the userspace into the
> more appropriate in-kernel data structure type, which is used within the
> driver.
The appropriate type is the userspace type if you don't actually ever
use anything unique to the kernel type. You can avoid the special
assert/etc since this way of accessing it is not sensitive to layout.
> Here, we just select only what we needed for configuring guest page table
> specifically to be programmed onto the host DTE.
Everything you don't pick up should be checked to be 0. VMM needs to
filter out unsupported things or generate a bad DTE error.
Jason
On 10/8/2025 5:13 AM, Jason Gunthorpe wrote:
> On Tue, Oct 07, 2025 at 02:22:30PM -0500, Suthikulpanit, Suravee wrote:
>>
>> On 10/6/2025 9:59 AM, Jason Gunthorpe wrote:
>>> On Wed, Oct 01, 2025 at 06:09:53AM +0000, Suravee Suthikulpanit wrote:
>>>> ....
>>>> + if (WARN_ON(!ndom || !pdom || (pdom->iop.mode == PAGE_MODE_NONE)))
>>>> + return;
>>>> +
>>>> + amd_iommu_make_clear_dte(dev_data, &new);
>>>> +
>>>> + new.data[0] |= iommu_virt_to_phys(pdom->iop.root);
>>>> + new.data[0] |= FIELD_PREP(DTE_MODE_MASK, pdom->iop.mode);
>>>> + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
>>>> + new.data[0] |= (DTE_FLAG_PPR & gdte->data[0]);
>>>> + if (pdom->dirty_tracking)
>>>> + new.data[0] |= DTE_FLAG_HAD;
>>>> +
>>>> + if (dev_data->ats_enabled)
>>>> + new.data[1] |= DTE_FLAG_IOTLB;
>>> This sequence should be in some set_dte_gcr3() ??
>> Not sure what you mean. This logic was in set_dte_entry(), and duplicated
>> here in the set_dte_nested() since we no longer calling set_dte_entry() from
>> the nested path. Also, it's not really related to GCR3 table.
> Sorry some make_ste_v1()
>
> This logic should be the same for any v1 page table.
>>>> + /*
>>>> + * Restore cached persistent DTE bits, which can be set by information
>>>> + * in IVRS table. See set_dev_entry_from_acpi().
>>>> + */
>>>> + initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid);
>>>> + if (initial_dte) {
>>>> + new.data128[0] |= initial_dte->data128[0];
>>>> + new.data128[1] |= initial_dte->data128[1];
>>>> + }
>>> This should go into amd_iommu_make_clear_dte() I think, and refactor
>>> it out of iommu_update_dte256() ?
>>> Every created DTE needs these bits set, right?
>> Currently, the amd_iommu_make_clear_dte() clears all the DTE bits and set
>> the DTE[V] (valid) bit. This is used when preparing the DTE for programming,
>> detaching domain, and when setting the blocking domain. Putting this logic
>> in the function would change the behavior.
> Yes, but it seems like that is the right behavior?
>
>> These bits affect Interrupt remapping (Lint0/Lint1/NMI/INIT/ExtInt interrupt
>> pass-through) and System management message behavior.
> Because these bits shouldn't be cleared on a blocking domain!
> Interrupts are still expected to work.
>
>
>>>> +
>>>> + /* Guest translation stuff */
>>>> + new.data[0] |= (gdte->data[0] &
>>>> + (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV));
>>>> +
>>>> + /* GCR3 table */
>>>> + new.data[0] |= (gdte->data[0] & DTE_GCR3_14_12);
>>>> + new.data[1] |= (gdte->data[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31));
>>>> +
>>>> + /* Guest paging mode */
>>>> + new.data[2] |= (gdte->data[2] & DTE_GPT_LEVEL_MASK);
>>> I didn't see anything validating gdte has only permitted set bits in
>>> the prior patch?
>> Not sure which on are you referring to.
> You have to validate gdte has only valid bits.
>
>>> If this is goint to decode array item by item then why not use struct
>>> iommu_hwpt_amd_guest in the nested_domain ?
>> The struct dev_table_entry *gdte is basically the same information as in the
>> struct iommu_hwpt_amd_guest.dte that we copied from the userspace into the
>> more appropriate in-kernel data structure type, which is used within the
>> driver.
> The appropriate type is the userspace type if you don't actually ever
> use anything unique to the kernel type. You can avoid the special
> assert/etc since this way of accessing it is not sensitive to layout.
>
>> Here, we just select only what we needed for configuring guest page table
>> specifically to be programmed onto the host DTE.
> Everything you don't pick up should be checked to be 0. VMM needs to
> filter out unsuopported things or generate a bad DTE error.
An alternative can be to introduce a struct which only contains relevant
fields.
Something similar to -->
struct iommu_hwpt_amd_guest {
void *gcr3;
unsigned char guest_paging_mode;
}
VMM can take care of writing to these fields.
Thanks
Sairaj
On Thu, Oct 09, 2025 at 12:48:27PM +0530, Sairaj Kodilkar wrote: > > > Here, we just select only what we needed for configuring guest page table > > > specifically to be programmed onto the host DTE. > > Everything you don't pick up should be checked to be 0. VMM needs to > > filter out unsuopported things or generate a bad DTE error. > An alternative can be to introduce a struct which only contains relevant > fields. We don't want this as a uAPI, use the normal DTE and have the kernel check the things it currently supports. Future kernels can support new things through the same ABI. VMM is responsible to 0 out things the kernel shouldn't see or it handles on its own. Jason
On Wed, Oct 01, 2025 at 06:09:53AM +0000, Suravee Suthikulpanit wrote:
> /* sets a specific bit in the device table entry. */
> diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
> index 11a0237174bb..5a0c369ba283 100644
> --- a/drivers/iommu/amd/nested.c
> +++ b/drivers/iommu/amd/nested.c
> @@ -11,9 +11,7 @@
> #include "amd_iommu.h"
> #include "amd_iommu_types.h"
>
> -static const struct iommu_domain_ops nested_domain_ops = {
> - .free = amd_iommu_domain_free,
> -};
Oh no, amd_iommu_domain_free() with to_pdomain() won't work. So
you should move the nested_domain_free() to the previous patch,
pairing with amd_iommu_alloc_domain_nested().
> +static void set_dte_nested(struct amd_iommu *iommu,
> + struct dev_table_entry *gdte,
> + struct nested_domain *ndom,
> + struct iommu_dev_data *dev_data)
> +{
> + struct dev_table_entry *initial_dte;
> + struct dev_table_entry new = {0};
> + struct protection_domain *pdom = dev_data->parent;
> +
> + if (WARN_ON(!ndom || !pdom || (pdom->iop.mode == PAGE_MODE_NONE)))
> + return;
> +
> + amd_iommu_make_clear_dte(dev_data, &new);
> +
> + new.data[0] |= iommu_virt_to_phys(pdom->iop.root);
> + new.data[0] |= FIELD_PREP(DTE_MODE_MASK, pdom->iop.mode);
> + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_TV;
> + new.data[0] |= (DTE_FLAG_PPR & gdte->data[0]);
new.data[0] |= DTE_FLAG_PPR & gdte->data[0];
> + /* Guest translation stuff */
> + new.data[0] |= (gdte->data[0] & (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV));
new.data[0] |= gdte->data[0] & (DTE_GLX | DTE_FLAG_GV | DTE_FLAG_GIOV);
> +
> + /* GCR3 table */
> + new.data[0] |= (gdte->data[0] & DTE_GCR3_14_12);
> + new.data[1] |= (gdte->data[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31));
> +
> + /* Guest paging mode */
> + new.data[2] |= (gdte->data[2] & DTE_GPT_LEVEL_MASK);
All these outer parentheses are redundant.
> +static int nested_attach_device(struct iommu_domain *dom, struct device *dev)
> +{
> + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
> + struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
> + struct nested_domain *ndom = to_ndomain(dom);
> + struct dev_table_entry *gdte = &ndom->guest_dte;
> + int ret = 0;
> +
> + if (dev_data->ndom == ndom)
> + return ret;
> +
> + if (!dev_is_pci(dev))
> + return -EINVAL;
> +
> + /* Currently only support GCR3TRPMode with nested translation */
> + if (!check_feature2(FEATURE_GCR3TRPMODE))
> + return -EOPNOTSUPP;
The amd_iommu_alloc_domain_nested() should probably validate this
feature, so !FEATURE_GCR3TRPMODE wouldn't allocate a nested domain
at the first place, and then no need to revalidate it in attach().
> +
> + /* We need to check host capability before setting the mode */
> + if ((FIELD_GET(DTE_GPT_LEVEL_MASK, gdte->data[2]) == GUEST_PGTABLE_5_LEVEL) &&
> + (amd_iommu_gpt_level < PAGE_MODE_5_LEVEL))
> + return -EOPNOTSUPP;
Ditto.
The attach callback function should only check things related to
the compatibility between a device and a domain, while this is a
domain specific validation. Better do it in alloc() IMHO.
> + WARN_ON(dev_data->ndom);
return -EBUSY;
?
With these fixed,
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
© 2016 - 2026 Red Hat, Inc.