Program guest translation settings in the host DTE when attaching a
nested domain to a device.
Also, enable the GCR3TRPMode feature when supported.
Note that nested translation is only supported with GCR3TRP mode.
When this mode is enabled, the AMD IOMMU driver programs the GCR3 Table
Root Pointer field of the device table entry with the GPA provided by
the guest.
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
drivers/iommu/amd/amd_iommu_types.h | 5 ++
drivers/iommu/amd/init.c | 3 ++
drivers/iommu/amd/iommu.c | 49 +++++++++++++++---
drivers/iommu/amd/nested.c | 80 +++++++++++++++++++++++++++--
4 files changed, 127 insertions(+), 10 deletions(-)
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 94f51a09b364..f8c392aadeb1 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -191,6 +191,7 @@
#define CONTROL_EPH_EN 45
#define CONTROL_XT_EN 50
#define CONTROL_INTCAPXT_EN 51
+#define CONTROL_GCR3TRPMODE 58
#define CONTROL_IRTCACHEDIS 59
#define CONTROL_SNPAVIC_EN 61
@@ -420,6 +421,8 @@
#define DTE_FLAG_V BIT_ULL(0)
#define DTE_FLAG_TV BIT_ULL(1)
#define DTE_FLAG_HAD (3ULL << 7)
+#define DTE_FLAG_PPR BIT_ULL(52)
+#define DTE_FLAG_GLX BIT_ULL(53)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
#define DTE_GLX GENMASK_ULL(57, 56)
@@ -555,6 +558,7 @@ struct amd_irte_ops;
struct gcr3_tbl_info {
u64 *gcr3_tbl; /* Guest CR3 table */
+ u64 trp_gpa; /* Guest CR3 TRP GPA for nested domain */
int glx; /* Number of levels for GCR3 table */
u32 pasid_cnt; /* Track attached PASIDs */
u16 domid; /* Per device domain ID */
@@ -610,6 +614,7 @@ struct protection_domain {
struct protection_domain *parent; /* Nested parent domain */
struct iommu_hwpt_amd_v2 guest_hwpt;
+ u16 guest_paging_mode; /* Guest paging mode */
};
/*
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 8de689b2c5ed..b340afd6901f 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -971,6 +971,9 @@ static void iommu_enable_gt(struct amd_iommu *iommu)
return;
iommu_feature_enable(iommu, CONTROL_GT_EN);
+
+ if (check_feature2(FEATURE_GCR3TRPMODE))
+ iommu_feature_enable(iommu, CONTROL_GCR3TRPMODE);
}
/* sets a specific bit in the device table entry. */
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index ea790a8997ee..935eaffb6814 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1856,6 +1856,18 @@ static void free_gcr3_tbl_level2(u64 *tbl)
}
}
+static inline bool amd_iommu_domain_is_nested(struct protection_domain *pdom)
+{
+ return (pdom && (pdom->domain.type == IOMMU_DOMAIN_NESTED));
+}
+
+static inline bool has_gcr3_table(struct gcr3_tbl_info *gcr3_info)
+{
+ if (!gcr3_info || (!gcr3_info->gcr3_tbl && !gcr3_info->trp_gpa))
+ return false;
+ return true;
+}
+
static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info)
{
if (gcr3_info->glx == 2)
@@ -1901,7 +1913,7 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
if (levels > amd_iommu_max_glx_val)
return -EINVAL;
- if (gcr3_info->gcr3_tbl)
+ if (has_gcr3_table(gcr3_info))
return -EBUSY;
/* Allocate per device domain ID */
@@ -2023,17 +2035,31 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
struct dev_table_entry *target)
{
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ struct protection_domain *pdom = dev_data->domain;
u64 gcr3;
- if (!gcr3_info->gcr3_tbl)
+ if (!has_gcr3_table(gcr3_info))
return;
- pr_debug("%s: devid=%#x, glx=%#x, giov=%#x, gcr3_tbl=%#llx\n",
+ /* We need to check host capability before setting the mode. */
+ if ((pdom->guest_paging_mode == PAGE_MODE_5_LEVEL) &&
+ (amd_iommu_gpt_level < PAGE_MODE_5_LEVEL)) {
+ pr_err("Cannot support Guest paging mode=%#x (dom_id=%#x).\n",
+ pdom->guest_paging_mode, pdom->id);
+ return;
+ }
+
+ pr_debug("%s: devid=%#x, glx=%#x, giov=%#x, gcr3_tbl=%#llx, trp_gpa=%#llx, type=%#x\n",
__func__, dev_data->devid, gcr3_info->glx, gcr3_info->giov,
- (unsigned long long)gcr3_info->gcr3_tbl);
+ (unsigned long long)gcr3_info->gcr3_tbl, gcr3_info->trp_gpa,
+ pdom->domain.type);
gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
+ /* For nested domain, use GCR3 GPA provided */
+ if (gcr3_info->trp_gpa)
+ gcr3 = gcr3_info->trp_gpa;
+
target->data[0] |= DTE_FLAG_GV |
FIELD_PREP(DTE_GLX, gcr3_info->glx) |
FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12);
@@ -2044,7 +2070,7 @@ static void set_dte_gcr3_table(struct amd_iommu *iommu,
FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31);
/* Guest page table can only support 4 and 5 levels */
- if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+ if (pdom->guest_paging_mode == PAGE_MODE_5_LEVEL)
target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
else
target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
@@ -2061,7 +2087,14 @@ static void set_dte_entry(struct amd_iommu *iommu,
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
- if (gcr3_info && gcr3_info->gcr3_tbl)
+ /*
+ * For nested domain, use parent domain to setup v1 table
+ * information and domain id.
+ */
+ if (amd_iommu_domain_is_nested(domain))
+ domain = domain->parent;
+
+ if (has_gcr3_table(gcr3_info))
domid = dev_data->gcr3_info.domid;
else
domid = domain->id;
@@ -2293,7 +2326,8 @@ int __amd_iommu_attach_device(struct device *dev, struct protection_domain *doma
goto out;
/* Setup GCR3 table */
- if (pdom_is_sva_capable(domain)) {
+ if (!amd_iommu_domain_is_nested(domain) && pdom_is_sva_capable(domain)) {
+ pr_warn("%s: Allocating guest page table\n", __func__);
ret = init_gcr3_table(dev_data, domain);
if (ret) {
pdom_detach_iommu(iommu, domain);
@@ -2519,6 +2553,7 @@ static int pdom_setup_pgtable(struct protection_domain *domain,
fmt = AMD_IOMMU_V1;
break;
case PD_MODE_V2:
+ domain->guest_paging_mode = amd_iommu_gpt_level;
fmt = AMD_IOMMU_V2;
break;
case PD_MODE_NONE:
diff --git a/drivers/iommu/amd/nested.c b/drivers/iommu/amd/nested.c
index 09f2a455af33..c9bf44e6298d 100644
--- a/drivers/iommu/amd/nested.c
+++ b/drivers/iommu/amd/nested.c
@@ -12,9 +12,7 @@
#include "amd_iommu.h"
#include "amd_iommu_types.h"
-const struct iommu_domain_ops nested_domain_ops = {
- .free = amd_iommu_domain_free,
-};
+const struct iommu_domain_ops nested_domain_ops;
static int udata_to_iommu_hwpt_amd_v2(const struct iommu_user_data *user_data,
struct iommu_hwpt_amd_v2 *hwpt)
@@ -77,3 +75,79 @@ amd_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
kfree(pdom);
return ERR_PTR(-EINVAL);
}
+
+static inline u64 hwpt_to_gcr3_trp(u64 *dte)
+{
+ u64 gcr3;
+
+ gcr3 = (FIELD_GET(DTE_GCR3_14_12, dte[0]) << 12);
+ gcr3 |= (FIELD_GET(DTE_GCR3_30_15, dte[1]) << 15);
+ gcr3 |= (FIELD_GET(DTE_GCR3_51_31, dte[1]) << 31);
+ return gcr3;
+}
+
+static int nested_gcr3_update(struct protection_domain *pdom, struct device *dev)
+{
+ struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+ struct iommu_hwpt_amd_v2 *hwpt = &pdom->guest_hwpt;
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ if (!pdev || !hwpt)
+ return -EINVAL;
+
+ /* Note: Currently only support GCR3TRPMode with nested translation */
+ if (!check_feature2(FEATURE_GCR3TRPMODE))
+ return -EOPNOTSUPP;
+
+ if (FIELD_GET(DTE_GPT_LEVEL_MASK, hwpt->dte[2]) == GUEST_PGTABLE_5_LEVEL)
+ pdom->guest_paging_mode = PAGE_MODE_5_LEVEL;
+ else
+ pdom->guest_paging_mode = PAGE_MODE_4_LEVEL;
+
+ dev_data->ppr = FIELD_GET(DTE_FLAG_PPR, hwpt->dte[0]);
+ dev_data->gcr3_info.glx = FIELD_GET(DTE_FLAG_GLX, hwpt->dte[0]);
+ dev_data->gcr3_info.giov = FIELD_GET(DTE_FLAG_GIOV, hwpt->dte[0]);
+ dev_data->gcr3_info.trp_gpa = hwpt_to_gcr3_trp(hwpt->dte);
+ /* Due to possible aliasing issue use nested domain ID */
+ dev_data->gcr3_info.domid = pdom->id;
+ pr_debug("%s: devid=%#x, domid=%#x, trp_gpa=%#llx, glx=%#x\n", __func__,
+ pci_dev_id(pdev),
+ dev_data->gcr3_info.domid,
+ dev_data->gcr3_info.trp_gpa,
+ dev_data->gcr3_info.glx);
+
+ return 0;
+}
+
+static int amd_iommu_nested_attach_device(struct iommu_domain *dom, struct device *dev)
+{
+ struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
+ struct protection_domain *pdom = to_pdomain(dom);
+ struct pci_dev *pdev;
+ int ret;
+
+ if (dev_data->domain == pdom)
+ return 0;
+
+ ret = nested_gcr3_update(pdom, dev);
+ if (ret)
+ return ret;
+
+ if (dev_data->domain)
+ amd_iommu_detach_device(dev);
+
+ ret = __amd_iommu_attach_device(dev, pdom);
+ if (ret)
+ return ret;
+
+ pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
+ if (pdev)
+ amd_iommu_pdev_enable_cap_ats(pdev);
+
+ return ret;
+}
+
+const struct iommu_domain_ops nested_domain_ops = {
+ .attach_dev = amd_iommu_nested_attach_device,
+ .free = amd_iommu_domain_free,
+};
--
2.34.1
On 8/20/2025 5:00 PM, Suravee Suthikulpanit wrote:
> +	pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
> +	if (pdev)
> +		amd_iommu_pdev_enable_cap_ats(pdev);

Hi Suravee,

You don't need to enable the ATS capability here. The function
__amd_iommu_attach_device() already takes care of it.
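(An aside for illustration: with that call gone, the tail of
amd_iommu_nested_attach_device() could reduce to the following. This is
a sketch under the reviewer's stated assumption that
__amd_iommu_attach_device() enables ATS internally; it is not a tested
change.)

	ret = __amd_iommu_attach_device(dev, pdom);
	if (ret)
		return ret;

	/*
	 * No explicit amd_iommu_pdev_enable_cap_ats() call here: per the
	 * review above, __amd_iommu_attach_device() already enables ATS
	 * on capable PCI devices.
	 */
	return 0;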
On Wed, Aug 20, 2025 at 11:30:09AM +0000, Suravee Suthikulpanit wrote:
> +	/* We need to check host capability before setting the mode. */
> +	if ((pdom->guest_paging_mode == PAGE_MODE_5_LEVEL) &&
> +	    (amd_iommu_gpt_level < PAGE_MODE_5_LEVEL)) {
> +		pr_err("Cannot support Guest paging mode=%#x (dom_id=%#x).\n",
> +		       pdom->guest_paging_mode, pdom->id);

This should be checked at allocation time.

And again, please don't mess up this function with nested DTEs. The
vDTE should be validated during creation, or creation should fail. I
see this is missing validation; every single bit in the vDTE needs to
be checked to be 0 or supported by the kernel.

The logic should simply take the vDTE and merge it with the host DTE as
a simple bitwise operation. This is why I keep saying to fix the flow
here so this can be written properly, and don't mess with the
gcr3_info.

> -	if (pdom_is_sva_capable(domain)) {
> +	if (!amd_iommu_domain_is_nested(domain) && pdom_is_sva_capable(domain)) {
> +		pr_warn("%s: Allocating guest page table\n", __func__);

??

> -const struct iommu_domain_ops nested_domain_ops = {
> -	.free = amd_iommu_domain_free,
> -};
> +const struct iommu_domain_ops nested_domain_ops;

Put stuff where it belongs when first adding it.

> +static inline u64 hwpt_to_gcr3_trp(u64 *dte)
[...]
> +static int nested_gcr3_update(struct protection_domain *pdom, struct device *dev)
[...]

None of this logic is needed if the vDTE is treated bitwise.

> +	if (dev_data->domain)
> +		amd_iommu_detach_device(dev);

I'm strongly against not supporting hitless vDTE update - this is part
of the HW spec, and the VMM should implement it, not create problematic
bugs to deal with down the road. Every time we let the VMM deviate from
the HW spec in undiscoverable ways, it causes problems :(

Meaning you can't call detach_device; you have to support hitless
update of the DTE between different attachment types. You already did
the hard work of making update_dte256(), but the surrounding flows
still need fixing.

Jason
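(To make the bitwise suggestion concrete, here is a hedged sketch. The
DTE_V2_ALLOWED_MASK* macros and both helper names are hypothetical,
invented for illustration; only the DTE field macros, DTE_FLAG_GV,
DTE_GLX, DTE_FLAG_GIOV, DTE_FLAG_PPR, DTE_GCR3_*, and
DTE_GPT_LEVEL_MASK, come from this patch or the existing header.)

/*
 * Hypothetical masks of the guest-controllable DTE bits; everything
 * outside these masks must be zero in the vDTE. Illustrative only; a
 * real mask would need to enumerate every bit the kernel can honor.
 */
#define DTE_V2_ALLOWED_MASK0	(DTE_FLAG_GV | DTE_GLX | DTE_FLAG_GIOV | \
				 DTE_FLAG_PPR | DTE_GCR3_14_12)
#define DTE_V2_ALLOWED_MASK1	(DTE_GCR3_30_15 | DTE_GCR3_51_31)
#define DTE_V2_ALLOWED_MASK2	DTE_GPT_LEVEL_MASK

/* Validate at domain creation time; fail creation on unknown bits. */
static int validate_vdte(const struct iommu_hwpt_amd_v2 *hwpt)
{
	if ((hwpt->dte[0] & ~DTE_V2_ALLOWED_MASK0) ||
	    (hwpt->dte[1] & ~DTE_V2_ALLOWED_MASK1) ||
	    (hwpt->dte[2] & ~DTE_V2_ALLOWED_MASK2))
		return -EINVAL;
	return 0;
}

/* Merge the already-validated guest bits into the host DTE. */
static void merge_vdte(struct dev_table_entry *target,
		       const struct iommu_hwpt_amd_v2 *hwpt)
{
	target->data[0] |= hwpt->dte[0] & DTE_V2_ALLOWED_MASK0;
	target->data[1] |= hwpt->dte[1] & DTE_V2_ALLOWED_MASK1;
	target->data[2] |= hwpt->dte[2] & DTE_V2_ALLOWED_MASK2;
}

With that shape, set_dte_gcr3_table() would not need to touch gcr3_info
for the nested case at all, which is the flow change being asked for.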
On Wed, Aug 20, 2025 at 11:30:09AM +0000, Suravee Suthikulpanit wrote:
> +static inline bool has_gcr3_table(struct gcr3_tbl_info *gcr3_info)
> +{
> +	if (!gcr3_info || (!gcr3_info->gcr3_tbl && !gcr3_info->trp_gpa))
> +		return false;

"gcr3_info" seems to always point to "&dev_data->gcr3_info", which can
never be NULL.

> +	if (has_gcr3_table(gcr3_info))
>  		domid = dev_data->gcr3_info.domid;

There is already a local variable "gcr3_info".

> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	if (!pdev || !hwpt)
> +		return -EINVAL;

to_pci_dev() is a container_of() from the dev. !pdev indicates a !dev,
which should never happen in the path of an attach_dev op. Or, did you
actually want to check if dev_is_pci(dev)?

Also, hwpt is "&pdom->guest_hwpt", which would never be NULL.

> +	pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
> +	if (pdev)
> +		amd_iommu_pdev_enable_cap_ats(pdev);

Is "dev_data->dev" expected to be "dev"?

Nicolin
On 8/22/2025 3:20 PM, Nicolin Chen wrote:
> "gcr3_info" seems to always point to "&dev_data->gcr3_info", which can
> never be NULL.

Right.

> There is already a local variable "gcr3_info".

Right.

> to_pci_dev() is a container_of() from the dev. !pdev indicates a !dev,
> which should never happen in the path of an attach_dev op. Or, did you
> actually want to check if dev_is_pci(dev)?

Correct, I should have just checked dev_is_pci(dev).

> Is "dev_data->dev" expected to be "dev"?

Correct. Thanks for the review. I'll clean up the logic in
amd_iommu_nested_attach_device() to return an error early for non-PCI
devices.

Thanks,
Suravee
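(For reference, a minimal sketch of the cleanup described above. It
reflects only the points from this subthread: early non-PCI rejection,
dropping the unreachable !pdev/!hwpt checks, using dev directly, and
dropping the redundant ATS enable. It assumes nested_gcr3_update()
drops its own !pdev check accordingly, and Jason's hitless-update
objection would still apply to the detach/attach pair.)

static int amd_iommu_nested_attach_device(struct iommu_domain *dom,
					  struct device *dev)
{
	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
	struct protection_domain *pdom = to_pdomain(dom);
	int ret;

	/* Nested attach currently only makes sense for PCI devices. */
	if (!dev_is_pci(dev))
		return -EINVAL;

	if (dev_data->domain == pdom)
		return 0;

	ret = nested_gcr3_update(pdom, dev);
	if (ret)
		return ret;

	if (dev_data->domain)
		amd_iommu_detach_device(dev);

	/* __amd_iommu_attach_device() also enables ATS where applicable. */
	return __amd_iommu_attach_device(dev, pdom);
}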