From: Yi Liu <yi.l.liu@intel.com>
Add a framework to check and synchronize host IOMMU cap/ecap with
vIOMMU cap/ecap.
Currently only stage-2 translation is supported which is backed by
shadow page table on host side. So we don't need exact matching of
each bit of cap/ecap between vIOMMU and host. However, we can still
utilize this framework to ensure compatibility of host and vIOMMU's
address width at least, i.e., vIOMMU's aw_bits <= host aw_bits,
which is missed before.
When stage-1 translation is supported in future, a.k.a. scalable
modern mode, we need to ensure compatibility of each bits. Some
bits are user controllable, they should be checked with host side
to ensure compatibility. Other bits are not, they should be synced
into vIOMMU cap/ecap for compatibility.
The sequence will be:
vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init()
iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init()
iommu->host_cap/ecap is checked and updated some bits with host cap/ecap. ---- vtd_sync_hw_info()
iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- vtd_machine_done_hook()
iommu->host_cap/ecap is a temporary storage to hold intermediate value
when synthesize host cap/ecap and vIOMMU's initial configured cap/ecap.
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
---
include/hw/i386/intel_iommu.h | 4 ++
hw/i386/intel_iommu.c | 78 +++++++++++++++++++++++++++++++----
2 files changed, 75 insertions(+), 7 deletions(-)
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index c65fdde56f..b8abbcce12 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -292,6 +292,9 @@ struct IntelIOMMUState {
uint64_t cap; /* The value of capability reg */
uint64_t ecap; /* The value of extended capability reg */
+ uint64_t host_cap; /* The value of host capability reg */
+ uint64_t host_ecap; /* The value of host ext-capability reg */
+
uint32_t context_cache_gen; /* Should be in [1,MAX] */
GHashTable *iotlb; /* IOTLB */
@@ -314,6 +317,7 @@ struct IntelIOMMUState {
bool dma_translation; /* Whether DMA translation supported */
bool pasid; /* Whether to support PASID */
+ bool cap_finalized; /* Whether VTD capability finalized */
/*
* Protects IOMMU states in general. Currently it protects the
* per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace.
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4c1d058ebd..be03fcbf52 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3819,6 +3819,47 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
return vtd_dev_as;
}
+static bool vtd_sync_hw_info(IntelIOMMUState *s, struct iommu_hw_info_vtd *vtd,
+ Error **errp)
+{
+ uint64_t addr_width;
+
+ addr_width = (vtd->cap_reg >> 16) & 0x3fULL;
+ if (s->aw_bits > addr_width) {
+ error_setg(errp, "User aw-bits: %u > host address width: %lu",
+ s->aw_bits, addr_width);
+ return false;
+ }
+
+ /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */
+
+ return true;
+}
+
+/*
+ * virtual VT-d which wants nested needs to check the host IOMMU
+ * nesting cap info behind the assigned devices. Thus that vIOMMU
+ * could bind guest page table to host.
+ */
+static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice *idev,
+ Error **errp)
+{
+ struct iommu_hw_info_vtd vtd;
+ enum iommu_hw_info_type type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
+
+ if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) {
+ error_setg(errp, "Failed to get IOMMU capability!!!");
+ return false;
+ }
+
+ if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) {
+ error_setg(errp, "IOMMU hardware is not compatible!!!");
+ return false;
+ }
+
+ return vtd_sync_hw_info(s, &vtd, errp);
+}
+
static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int32_t devfn,
IOMMUFDDevice *idev, Error **errp)
{
@@ -3837,6 +3878,10 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int32_t devfn,
return 0;
}
+ if (!vtd_check_idev(s, idev, errp)) {
+ return -1;
+ }
+
vtd_iommu_lock(s);
vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key);
@@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s)
{
X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
- memset(s->csr, 0, DMAR_REG_SIZE);
- memset(s->wmask, 0, DMAR_REG_SIZE);
- memset(s->w1cmask, 0, DMAR_REG_SIZE);
- memset(s->womask, 0, DMAR_REG_SIZE);
+ /* CAP/ECAP are initialized in machine create done stage */
+ memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG);
+ memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG);
+ memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG);
+ memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG);
s->root = 0;
s->root_scalable = false;
@@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s)
vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP;
}
- vtd_cap_init(s);
+ if (!s->cap_finalized) {
+ vtd_cap_init(s);
+ s->host_cap = s->cap;
+ s->host_ecap = s->ecap;
+ }
+
vtd_reset_caches(s);
/* Define registers with default values and bit semantics */
vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
- vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
- vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0);
vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL);
vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0);
@@ -4241,6 +4290,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
return true;
}
+static void vtd_setup_capability_reg(IntelIOMMUState *s)
+{
+ vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0);
+ vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0);
+}
+
static int vtd_machine_done_notify_one(Object *child, void *unused)
{
IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
@@ -4259,6 +4314,14 @@ static int vtd_machine_done_notify_one(Object *child, void *unused)
static void vtd_machine_done_hook(Notifier *notifier, void *unused)
{
+ IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default());
+
+ iommu->cap = iommu->host_cap;
+ iommu->ecap = iommu->host_ecap;
+ iommu->cap_finalized = true;
+
+ vtd_setup_capability_reg(iommu);
+
object_child_foreach_recursive(object_get_root(),
vtd_machine_done_notify_one, NULL);
}
@@ -4292,6 +4355,7 @@ static void vtd_realize(DeviceState *dev, Error **errp)
QLIST_INIT(&s->vtd_as_with_notifiers);
qemu_mutex_init(&s->iommu_lock);
+ s->cap_finalized = false;
memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
"intel_iommu", DMAR_REG_SIZE);
memory_region_add_subregion(get_system_memory(),
--
2.34.1
On 1/15/24 11:13, Zhenzhong Duan wrote: > From: Yi Liu <yi.l.liu@intel.com> > > Add a framework to check and synchronize host IOMMU cap/ecap with > vIOMMU cap/ecap. > > Currently only stage-2 translation is supported which is backed by > shadow page table on host side. So we don't need exact matching of > each bit of cap/ecap between vIOMMU and host. However, we can still > utilize this framework to ensure compatibility of host and vIOMMU's > address width at least, i.e., vIOMMU's aw_bits <= host aw_bits, > which is missed before. > > When stage-1 translation is supported in future, a.k.a. scalable > modern mode, we need to ensure compatibility of each bits. Some > bits are user controllable, they should be checked with host side > to ensure compatibility. Other bits are not, they should be synced > into vIOMMU cap/ecap for compatibility. > > The sequence will be: > > vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init() > iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init() > iommu->host_cap/ecap is checked and updated some bits with host cap/ecap. ---- vtd_sync_hw_info() > iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- vtd_machine_done_hook() > > iommu->host_cap/ecap is a temporary storage to hold intermediate value > when synthesize host cap/ecap and vIOMMU's initial configured cap/ecap. The above "sequence" paragraph is not very clear. The patch may need to be split further. > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> > Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> > --- > include/hw/i386/intel_iommu.h | 4 ++ > hw/i386/intel_iommu.c | 78 +++++++++++++++++++++++++++++++---- > 2 files changed, 75 insertions(+), 7 deletions(-) > > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h > index c65fdde56f..b8abbcce12 100644 > --- a/include/hw/i386/intel_iommu.h > +++ b/include/hw/i386/intel_iommu.h > @@ -292,6 +292,9 @@ struct IntelIOMMUState { > uint64_t cap; /* The value of capability reg */ > uint64_t ecap; /* The value of extended capability reg */ > > + uint64_t host_cap; /* The value of host capability reg */ > + uint64_t host_ecap; /* The value of host ext-capability reg */ > + > uint32_t context_cache_gen; /* Should be in [1,MAX] */ > GHashTable *iotlb; /* IOTLB */ > > @@ -314,6 +317,7 @@ struct IntelIOMMUState { > bool dma_translation; /* Whether DMA translation supported */ > bool pasid; /* Whether to support PASID */ > > + bool cap_finalized; /* Whether VTD capability finalized */ > /* > * Protects IOMMU states in general. Currently it protects the > * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace. > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 4c1d058ebd..be03fcbf52 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -3819,6 +3819,47 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, > return vtd_dev_as; > } > > +static bool vtd_sync_hw_info(IntelIOMMUState *s, struct iommu_hw_info_vtd *vtd, > + Error **errp) > +{ > + uint64_t addr_width; > + > + addr_width = (vtd->cap_reg >> 16) & 0x3fULL; Virek uses the same kind of macro in : https://lore.kernel.org/qemu-devel/20240118192049.1796763-1-vivek.kasireddy@intel.com/ What about the + 1 ? Looks like it's missing here, according to 11.4.2 Capability Register. Could we introduce a common macro in intel_iommu_internal.h ? > + if (s->aw_bits > addr_width) { > + error_setg(errp, "User aw-bits: %u > host address width: %lu", I think %lu should be PRId64. This is a general comment. You should avoid %llx, %lx, etc. in the code. > + s->aw_bits, addr_width); > + return false; > + } > + > + /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */ > + > + return true; > +} > + > +/* > + * virtual VT-d which wants nested needs to check the host IOMMU > + * nesting cap info behind the assigned devices. Thus that vIOMMU > + * could bind guest page table to host. > + */ > +static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice *idev, > + Error **errp) > +{ > + struct iommu_hw_info_vtd vtd; > + enum iommu_hw_info_type type = IOMMU_HW_INFO_TYPE_INTEL_VTD; > + > + if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) { > + error_setg(errp, "Failed to get IOMMU capability!!!"); > + return false; > + } > + > + if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) { > + error_setg(errp, "IOMMU hardware is not compatible!!!"); > + return false; > + } > + > + return vtd_sync_hw_info(s, &vtd, errp); > +} > + > static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int32_t devfn, > IOMMUFDDevice *idev, Error **errp) > { > @@ -3837,6 +3878,10 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int32_t devfn, > return 0; > } > > + if (!vtd_check_idev(s, idev, errp)) { > + return -1; > + } > + > vtd_iommu_lock(s); > > vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key); > @@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s) > { > X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); > > - memset(s->csr, 0, DMAR_REG_SIZE); > - memset(s->wmask, 0, DMAR_REG_SIZE); > - memset(s->w1cmask, 0, DMAR_REG_SIZE); > - memset(s->womask, 0, DMAR_REG_SIZE); > + /* CAP/ECAP are initialized in machine create done stage */ > + memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG); > + memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG); > + memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG); > + memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG); > > s->root = 0; > s->root_scalable = false; > @@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s) vtd_init() is called from reset and from realize. This is redundant. reset should be enough. > vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; > } > > - vtd_cap_init(s); > + if (!s->cap_finalized) { ok. so this can only be done in reset. > + vtd_cap_init(s); > + s->host_cap = s->cap; > + s->host_ecap = s->ecap; > + } > + > vtd_reset_caches(s); > > /* Define registers with default values and bit semantics */ > vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); > - vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); > - vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); > vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); > vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); > vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); > @@ -4241,6 +4290,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) > return true; > } > > +static void vtd_setup_capability_reg(IntelIOMMUState *s) > +{ > + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); > + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); > +} > + > static int vtd_machine_done_notify_one(Object *child, void *unused) > { > IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); > @@ -4259,6 +4314,14 @@ static int vtd_machine_done_notify_one(Object *child, void *unused) > > static void vtd_machine_done_hook(Notifier *notifier, void *unused) > { > + IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); > + > + iommu->cap = iommu->host_cap; > + iommu->ecap = iommu->host_ecap; > + iommu->cap_finalized = true; > + > + vtd_setup_capability_reg(iommu); > + This is confusing. Please split the patch better to reflect the ordering of the e/cap register settings. Thanks, C. > object_child_foreach_recursive(object_get_root(), > vtd_machine_done_notify_one, NULL); > } > @@ -4292,6 +4355,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) > > QLIST_INIT(&s->vtd_as_with_notifiers); > qemu_mutex_init(&s->iommu_lock); > + s->cap_finalized = false; > memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, > "intel_iommu", DMAR_REG_SIZE); > memory_region_add_subregion(get_system_memory(),
>-----Original Message----- >From: Cédric Le Goater <clg@redhat.com> >Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check and >sync host IOMMU cap/ecap > >On 1/15/24 11:13, Zhenzhong Duan wrote: >> From: Yi Liu <yi.l.liu@intel.com> >> >> Add a framework to check and synchronize host IOMMU cap/ecap with >> vIOMMU cap/ecap. >> >> Currently only stage-2 translation is supported which is backed by >> shadow page table on host side. So we don't need exact matching of >> each bit of cap/ecap between vIOMMU and host. However, we can still >> utilize this framework to ensure compatibility of host and vIOMMU's >> address width at least, i.e., vIOMMU's aw_bits <= host aw_bits, >> which is missed before. >> >> When stage-1 translation is supported in future, a.k.a. scalable >> modern mode, we need to ensure compatibility of each bits. Some >> bits are user controllable, they should be checked with host side >> to ensure compatibility. Other bits are not, they should be synced >> into vIOMMU cap/ecap for compatibility. >> >> The sequence will be: >> >> vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init() >> iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init() >> iommu->host_cap/ecap is checked and updated some bits with host >cap/ecap. ---- vtd_sync_hw_info() >> iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- >vtd_machine_done_hook() >> >> iommu->host_cap/ecap is a temporary storage to hold intermediate value >> when synthesize host cap/ecap and vIOMMU's initial configured cap/ecap. > > >The above "sequence" paragraph is not very clear. The patch may need to >be split further. OK, will do. > > > >> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >> --- >> include/hw/i386/intel_iommu.h | 4 ++ >> hw/i386/intel_iommu.c | 78 >+++++++++++++++++++++++++++++++---- >> 2 files changed, 75 insertions(+), 7 deletions(-) >> >> diff --git a/include/hw/i386/intel_iommu.h >b/include/hw/i386/intel_iommu.h >> index c65fdde56f..b8abbcce12 100644 >> --- a/include/hw/i386/intel_iommu.h >> +++ b/include/hw/i386/intel_iommu.h >> @@ -292,6 +292,9 @@ struct IntelIOMMUState { >> uint64_t cap; /* The value of capability reg */ >> uint64_t ecap; /* The value of extended capability reg */ >> >> + uint64_t host_cap; /* The value of host capability reg */ >> + uint64_t host_ecap; /* The value of host ext-capability reg */ >> + >> uint32_t context_cache_gen; /* Should be in [1,MAX] */ >> GHashTable *iotlb; /* IOTLB */ >> >> @@ -314,6 +317,7 @@ struct IntelIOMMUState { >> bool dma_translation; /* Whether DMA translation supported */ >> bool pasid; /* Whether to support PASID */ >> >> + bool cap_finalized; /* Whether VTD capability finalized */ >> /* >> * Protects IOMMU states in general. Currently it protects the >> * per-IOMMU IOTLB cache, and context entry cache in >VTDAddressSpace. >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >> index 4c1d058ebd..be03fcbf52 100644 >> --- a/hw/i386/intel_iommu.c >> +++ b/hw/i386/intel_iommu.c >> @@ -3819,6 +3819,47 @@ VTDAddressSpace >*vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, >> return vtd_dev_as; >> } >> >> +static bool vtd_sync_hw_info(IntelIOMMUState *s, struct >iommu_hw_info_vtd *vtd, >> + Error **errp) >> +{ >> + uint64_t addr_width; >> + >> + addr_width = (vtd->cap_reg >> 16) & 0x3fULL; > >Virek uses the same kind of macro in : > > https://lore.kernel.org/qemu-devel/20240118192049.1796763-1- >vivek.kasireddy@intel.com/ > >What about the + 1 ? Looks like it's missing here, according to 11.4.2 >Capability Register. > >Could we introduce a common macro in intel_iommu_internal.h ? Sure. > > >> + if (s->aw_bits > addr_width) { >> + error_setg(errp, "User aw-bits: %u > host address width: %lu", > >I think %lu should be PRId64. This is a general comment. You should avoid >%llx, %lx, etc. in the code. Got it. > >> + s->aw_bits, addr_width); >> + return false; >> + } >> + >> + /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */ >> + >> + return true; >> +} >> + >> +/* >> + * virtual VT-d which wants nested needs to check the host IOMMU >> + * nesting cap info behind the assigned devices. Thus that vIOMMU >> + * could bind guest page table to host. >> + */ >> +static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice *idev, >> + Error **errp) >> +{ >> + struct iommu_hw_info_vtd vtd; >> + enum iommu_hw_info_type type = >IOMMU_HW_INFO_TYPE_INTEL_VTD; >> + >> + if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) { >> + error_setg(errp, "Failed to get IOMMU capability!!!"); >> + return false; >> + } >> + >> + if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) { >> + error_setg(errp, "IOMMU hardware is not compatible!!!"); >> + return false; >> + } >> + >> + return vtd_sync_hw_info(s, &vtd, errp); >> +} >> + >> static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, >int32_t devfn, >> IOMMUFDDevice *idev, Error **errp) >> { >> @@ -3837,6 +3878,10 @@ static int vtd_dev_set_iommu_device(PCIBus >*bus, void *opaque, int32_t devfn, >> return 0; >> } >> >> + if (!vtd_check_idev(s, idev, errp)) { >> + return -1; >> + } >> + >> vtd_iommu_lock(s); >> >> vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key); >> @@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s) >> { >> X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); >> >> - memset(s->csr, 0, DMAR_REG_SIZE); >> - memset(s->wmask, 0, DMAR_REG_SIZE); >> - memset(s->w1cmask, 0, DMAR_REG_SIZE); >> - memset(s->womask, 0, DMAR_REG_SIZE); >> + /* CAP/ECAP are initialized in machine create done stage */ >> + memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >DMAR_GCMD_REG); >> + memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >DMAR_GCMD_REG); >> + memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >DMAR_GCMD_REG); >> + memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >DMAR_GCMD_REG); >> >> s->root = 0; >> s->root_scalable = false; >> @@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s) > >vtd_init() is called from reset and from realize. This is redundant. >reset should be enough. Looks so. I'll try to see if it break anything. > > >> vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; >> } >> >> - vtd_cap_init(s); >> + if (!s->cap_finalized) { > >ok. so this can only be done in reset. Not quite get, vtd_init can be called multiple times harmlessly before machine create done. But once is enough, i.e., in reset. The call in realize looks redundant. > >> + vtd_cap_init(s); >> + s->host_cap = s->cap; >> + s->host_ecap = s->ecap; >> + } >> + >> vtd_reset_caches(s); >> >> /* Define registers with default values and bit semantics */ >> vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); >> - vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >> - vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >> vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); >> vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); >> vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); >> @@ -4241,6 +4290,12 @@ static bool >vtd_decide_config(IntelIOMMUState *s, Error **errp) >> return true; >> } >> >> +static void vtd_setup_capability_reg(IntelIOMMUState *s) >> +{ >> + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >> + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >> +} >> + >> static int vtd_machine_done_notify_one(Object *child, void *unused) >> { >> IntelIOMMUState *iommu = >INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >> @@ -4259,6 +4314,14 @@ static int >vtd_machine_done_notify_one(Object *child, void *unused) >> >> static void vtd_machine_done_hook(Notifier *notifier, void *unused) >> { >> + IntelIOMMUState *iommu = >INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >> + >> + iommu->cap = iommu->host_cap; >> + iommu->ecap = iommu->host_ecap; >> + iommu->cap_finalized = true; >> + >> + vtd_setup_capability_reg(iommu); >> + > >This is confusing. Please split the patch better to reflect the ordering of >the e/cap register settings. Will do. Thanks Zhenzhong
Hi Zhenzhong, On 1/15/24 11:13, Zhenzhong Duan wrote: > From: Yi Liu <yi.l.liu@intel.com> > > Add a framework to check and synchronize host IOMMU cap/ecap with > vIOMMU cap/ecap. > > Currently only stage-2 translation is supported which is backed by > shadow page table on host side. So we don't need exact matching of > each bit of cap/ecap between vIOMMU and host. However, we can still > utilize this framework to ensure compatibility of host and vIOMMU's > address width at least, i.e., vIOMMU's aw_bits <= host aw_bits, > which is missed before. > > When stage-1 translation is supported in future, a.k.a. scalable > modern mode, we need to ensure compatibility of each bits. Some > bits are user controllable, they should be checked with host side > to ensure compatibility. Other bits are not, they should be synced > into vIOMMU cap/ecap for compatibility. > > The sequence will be: > > vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init() > iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init() > iommu->host_cap/ecap is checked and updated some bits with host cap/ecap. ---- vtd_sync_hw_info() > iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- vtd_machine_done_hook() > > iommu->host_cap/ecap is a temporary storage to hold intermediate value > when synthesize host cap/ecap and vIOMMU's initial configured cap/ecap. > > Signed-off-by: Yi Liu <yi.l.liu@intel.com> > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> > Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> > --- > include/hw/i386/intel_iommu.h | 4 ++ > hw/i386/intel_iommu.c | 78 +++++++++++++++++++++++++++++++---- > 2 files changed, 75 insertions(+), 7 deletions(-) > > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h > index c65fdde56f..b8abbcce12 100644 > --- a/include/hw/i386/intel_iommu.h > +++ b/include/hw/i386/intel_iommu.h > @@ -292,6 +292,9 @@ struct IntelIOMMUState { > uint64_t cap; /* The value of capability reg */ > uint64_t ecap; /* The value of extended capability reg */ > > + uint64_t host_cap; /* The value of host capability reg */ > + uint64_t host_ecap; /* The value of host ext-capability reg */ > + > uint32_t context_cache_gen; /* Should be in [1,MAX] */ > GHashTable *iotlb; /* IOTLB */ > > @@ -314,6 +317,7 @@ struct IntelIOMMUState { > bool dma_translation; /* Whether DMA translation supported */ > bool pasid; /* Whether to support PASID */ > > + bool cap_finalized; /* Whether VTD capability finalized */ > /* > * Protects IOMMU states in general. Currently it protects the > * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace. > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 4c1d058ebd..be03fcbf52 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -3819,6 +3819,47 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, > return vtd_dev_as; > } > > +static bool vtd_sync_hw_info(IntelIOMMUState *s, struct iommu_hw_info_vtd *vtd, > + Error **errp) > +{ > + uint64_t addr_width; > + > + addr_width = (vtd->cap_reg >> 16) & 0x3fULL; > + if (s->aw_bits > addr_width) { > + error_setg(errp, "User aw-bits: %u > host address width: %lu", > + s->aw_bits, addr_width); > + return false; > + } > + > + /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */ > + > + return true; > +} > + > +/* > + * virtual VT-d which wants nested needs to check the host IOMMU > + * nesting cap info behind the assigned devices. Thus that vIOMMU > + * could bind guest page table to host. > + */ > +static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice *idev, > + Error **errp) > +{ > + struct iommu_hw_info_vtd vtd; > + enum iommu_hw_info_type type = IOMMU_HW_INFO_TYPE_INTEL_VTD; > + > + if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) { > + error_setg(errp, "Failed to get IOMMU capability!!!"); > + return false; > + } > + > + if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) { > + error_setg(errp, "IOMMU hardware is not compatible!!!"); > + return false; > + } > + > + return vtd_sync_hw_info(s, &vtd, errp); > +} > + > static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int32_t devfn, > IOMMUFDDevice *idev, Error **errp) > { > @@ -3837,6 +3878,10 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int32_t devfn, > return 0; > } > > + if (!vtd_check_idev(s, idev, errp)) {In In [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for hotplugged devices https://lore.kernel.org/all/20240117080414.316890-1-eric.auger@redhat.com/ I also attempt to pass host iommu info to the virtio-iommu but with legacy BE. In my case I want to pass the reserved memory regions which also model the aw. So this is a pretty similar use case. Why don't we pass the pointer to an opaque iommu_hw_info instead, through the PCIIOMMUOps? > + return -1; > + } > + > vtd_iommu_lock(s); > > vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key); > @@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s) > { > X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); > > - memset(s->csr, 0, DMAR_REG_SIZE); > - memset(s->wmask, 0, DMAR_REG_SIZE); > - memset(s->w1cmask, 0, DMAR_REG_SIZE); > - memset(s->womask, 0, DMAR_REG_SIZE); > + /* CAP/ECAP are initialized in machine create done stage */ > + memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG); > + memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG); > + memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG); > + memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - DMAR_GCMD_REG); This change is not documented in the commit msg. Sorry I don't get why this is needed? > > s->root = 0; > s->root_scalable = false; > @@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s) > vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; > } > > - vtd_cap_init(s); > + if (!s->cap_finalized) { > + vtd_cap_init(s); > + s->host_cap = s->cap; > + s->host_ecap = s->ecap; > + } > + > vtd_reset_caches(s); > > /* Define registers with default values and bit semantics */ > vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); > - vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); > - vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); > vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); > vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); > vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); > @@ -4241,6 +4290,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) > return true; > } > > +static void vtd_setup_capability_reg(IntelIOMMUState *s) > +{ > + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); > + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); > +} > + > static int vtd_machine_done_notify_one(Object *child, void *unused) > { > IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); > @@ -4259,6 +4314,14 @@ static int vtd_machine_done_notify_one(Object *child, void *unused) > > static void vtd_machine_done_hook(Notifier *notifier, void *unused) > { > + IntelIOMMUState *iommu = INTEL_IOMMU_DEVICE(x86_iommu_get_default()); > + > + iommu->cap = iommu->host_cap; > + iommu->ecap = iommu->host_ecap; > + iommu->cap_finalized = true; I don't think you can change the defaults like this without taking care of compats (migration). Thanks Eric > + > + vtd_setup_capability_reg(iommu); > + > object_child_foreach_recursive(object_get_root(), > vtd_machine_done_notify_one, NULL); > } > @@ -4292,6 +4355,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) > > QLIST_INIT(&s->vtd_as_with_notifiers); > qemu_mutex_init(&s->iommu_lock); > + s->cap_finalized = false; > memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, > "intel_iommu", DMAR_REG_SIZE); > memory_region_add_subregion(get_system_memory(),
Hi Eric, >-----Original Message----- >From: Eric Auger <eric.auger@redhat.com> >Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check and >sync host IOMMU cap/ecap > >Hi Zhenzhong, > >On 1/15/24 11:13, Zhenzhong Duan wrote: >> From: Yi Liu <yi.l.liu@intel.com> >> >> Add a framework to check and synchronize host IOMMU cap/ecap with >> vIOMMU cap/ecap. >> >> Currently only stage-2 translation is supported which is backed by >> shadow page table on host side. So we don't need exact matching of >> each bit of cap/ecap between vIOMMU and host. However, we can still >> utilize this framework to ensure compatibility of host and vIOMMU's >> address width at least, i.e., vIOMMU's aw_bits <= host aw_bits, >> which is missed before. >> >> When stage-1 translation is supported in future, a.k.a. scalable >> modern mode, we need to ensure compatibility of each bits. Some >> bits are user controllable, they should be checked with host side >> to ensure compatibility. Other bits are not, they should be synced >> into vIOMMU cap/ecap for compatibility. >> >> The sequence will be: >> >> vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init() >> iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init() >> iommu->host_cap/ecap is checked and updated some bits with host >cap/ecap. ---- vtd_sync_hw_info() >> iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- >vtd_machine_done_hook() >> >> iommu->host_cap/ecap is a temporary storage to hold intermediate value >> when synthesize host cap/ecap and vIOMMU's initial configured cap/ecap. >> >> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >> --- >> include/hw/i386/intel_iommu.h | 4 ++ >> hw/i386/intel_iommu.c | 78 >+++++++++++++++++++++++++++++++---- >> 2 files changed, 75 insertions(+), 7 deletions(-) >> >> diff --git a/include/hw/i386/intel_iommu.h >b/include/hw/i386/intel_iommu.h >> index c65fdde56f..b8abbcce12 100644 >> --- a/include/hw/i386/intel_iommu.h >> +++ b/include/hw/i386/intel_iommu.h >> @@ -292,6 +292,9 @@ struct IntelIOMMUState { >> uint64_t cap; /* The value of capability reg */ >> uint64_t ecap; /* The value of extended capability reg */ >> >> + uint64_t host_cap; /* The value of host capability reg */ >> + uint64_t host_ecap; /* The value of host ext-capability reg */ >> + >> uint32_t context_cache_gen; /* Should be in [1,MAX] */ >> GHashTable *iotlb; /* IOTLB */ >> >> @@ -314,6 +317,7 @@ struct IntelIOMMUState { >> bool dma_translation; /* Whether DMA translation supported */ >> bool pasid; /* Whether to support PASID */ >> >> + bool cap_finalized; /* Whether VTD capability finalized */ >> /* >> * Protects IOMMU states in general. Currently it protects the >> * per-IOMMU IOTLB cache, and context entry cache in >VTDAddressSpace. >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >> index 4c1d058ebd..be03fcbf52 100644 >> --- a/hw/i386/intel_iommu.c >> +++ b/hw/i386/intel_iommu.c >> @@ -3819,6 +3819,47 @@ VTDAddressSpace >*vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, >> return vtd_dev_as; >> } >> >> +static bool vtd_sync_hw_info(IntelIOMMUState *s, struct >iommu_hw_info_vtd *vtd, >> + Error **errp) >> +{ >> + uint64_t addr_width; >> + >> + addr_width = (vtd->cap_reg >> 16) & 0x3fULL; >> + if (s->aw_bits > addr_width) { >> + error_setg(errp, "User aw-bits: %u > host address width: %lu", >> + s->aw_bits, addr_width); >> + return false; >> + } >> + >> + /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */ >> + >> + return true; >> +} >> + >> +/* >> + * virtual VT-d which wants nested needs to check the host IOMMU >> + * nesting cap info behind the assigned devices. Thus that vIOMMU >> + * could bind guest page table to host. >> + */ >> +static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice *idev, >> + Error **errp) >> +{ >> + struct iommu_hw_info_vtd vtd; >> + enum iommu_hw_info_type type = >IOMMU_HW_INFO_TYPE_INTEL_VTD; >> + >> + if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) { >> + error_setg(errp, "Failed to get IOMMU capability!!!"); >> + return false; >> + } >> + >> + if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) { >> + error_setg(errp, "IOMMU hardware is not compatible!!!"); >> + return false; >> + } >> + >> + return vtd_sync_hw_info(s, &vtd, errp); >> +} >> + >> static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int32_t >devfn, >> IOMMUFDDevice *idev, Error **errp) >> { >> @@ -3837,6 +3878,10 @@ static int vtd_dev_set_iommu_device(PCIBus >*bus, void *opaque, int32_t devfn, >> return 0; >> } >> >> + if (!vtd_check_idev(s, idev, errp)) {In >In >[RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for >hotplugged devices >https://lore.kernel.org/all/20240117080414.316890-1- >eric.auger@redhat.com/ > >I also attempt to pass host iommu info to the virtio-iommu but with >legacy BE. I think your patch works with iommufd BE too😊 Because iommufd BE also fills bcontainer->iova_ranges in iommufd_cdev_get_info_iova_range(). > In my case I want to pass the reserved memory regions which >also model the aw. >So this is a pretty similar use case. Yes. > >Why don't we pass the pointer to an opaque iommu_hw_info instead, >through the PCIIOMMUOps? Passing iommu_hw_info is ok for this series, but we want more from IOMMUFDDevice in nesting series. I.e., allocate/free ioas/hwpt, attach/detach from hwpt, get dirty bitmap, etc. It's more flexible to let vIOMMU get what it want itself. > > > >> + return -1; >> + } >> + >> vtd_iommu_lock(s); >> >> vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key); >> @@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s) >> { >> X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); >> >> - memset(s->csr, 0, DMAR_REG_SIZE); >> - memset(s->wmask, 0, DMAR_REG_SIZE); >> - memset(s->w1cmask, 0, DMAR_REG_SIZE); >> - memset(s->womask, 0, DMAR_REG_SIZE); >> + /* CAP/ECAP are initialized in machine create done stage */ >> + memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >DMAR_GCMD_REG); >> + memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >DMAR_GCMD_REG); >> + memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >DMAR_GCMD_REG); >> + memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >DMAR_GCMD_REG); >This change is not documented in the commit msg. >Sorry I don't get why this is needed? I'll doc it. Above we have one line to explain when cap/ecap are initialized. vtd_init() is called in qemu init and guest reset. In qemu init, Cap/ecap is finalized, after that we don't want cap/ecap to be changed. So we bypass change to cap/ecap here. >> >> s->root = 0; >> s->root_scalable = false; >> @@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s) >> vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; >> } >> >> - vtd_cap_init(s); >> + if (!s->cap_finalized) { >> + vtd_cap_init(s); >> + s->host_cap = s->cap; >> + s->host_ecap = s->ecap; >> + } >> + >> vtd_reset_caches(s); >> >> /* Define registers with default values and bit semantics */ >> vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); >> - vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >> - vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >> vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); >> vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); >> vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); >> @@ -4241,6 +4290,12 @@ static bool >vtd_decide_config(IntelIOMMUState *s, Error **errp) >> return true; >> } >> >> +static void vtd_setup_capability_reg(IntelIOMMUState *s) >> +{ >> + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >> + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >> +} >> + >> static int vtd_machine_done_notify_one(Object *child, void *unused) >> { >> IntelIOMMUState *iommu = >INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >> @@ -4259,6 +4314,14 @@ static int >vtd_machine_done_notify_one(Object *child, void *unused) >> >> static void vtd_machine_done_hook(Notifier *notifier, void *unused) >> { >> + IntelIOMMUState *iommu = >INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >> + >> + iommu->cap = iommu->host_cap; >> + iommu->ecap = iommu->host_ecap; >> + iommu->cap_finalized = true; >I don't think you can change the defaults like this without taking care >of compats (migration). Will bump vtd_vmstate .version_id works? Thanks Zhenzhong > >Thanks > >Eric >> + >> + vtd_setup_capability_reg(iommu); >> + >> object_child_foreach_recursive(object_get_root(), >> vtd_machine_done_notify_one, NULL); >> } >> @@ -4292,6 +4355,7 @@ static void vtd_realize(DeviceState *dev, Error >**errp) >> >> QLIST_INIT(&s->vtd_as_with_notifiers); >> qemu_mutex_init(&s->iommu_lock); >> + s->cap_finalized = false; >> memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, >> "intel_iommu", DMAR_REG_SIZE); >> memory_region_add_subregion(get_system_memory(),
On 1/18/24 10:30, Duan, Zhenzhong wrote: > Hi Eric, > >> -----Original Message----- >> From: Eric Auger <eric.auger@redhat.com> >> Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check and >> sync host IOMMU cap/ecap >> >> Hi Zhenzhong, >> >> On 1/15/24 11:13, Zhenzhong Duan wrote: >>> From: Yi Liu <yi.l.liu@intel.com> >>> >>> Add a framework to check and synchronize host IOMMU cap/ecap with >>> vIOMMU cap/ecap. >>> >>> Currently only stage-2 translation is supported which is backed by >>> shadow page table on host side. So we don't need exact matching of >>> each bit of cap/ecap between vIOMMU and host. However, we can still >>> utilize this framework to ensure compatibility of host and vIOMMU's >>> address width at least, i.e., vIOMMU's aw_bits <= host aw_bits, >>> which is missed before. >>> >>> When stage-1 translation is supported in future, a.k.a. scalable >>> modern mode, we need to ensure compatibility of each bits. Some >>> bits are user controllable, they should be checked with host side >>> to ensure compatibility. Other bits are not, they should be synced >>> into vIOMMU cap/ecap for compatibility. >>> >>> The sequence will be: >>> >>> vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init() >>> iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init() >>> iommu->host_cap/ecap is checked and updated some bits with host >> cap/ecap. ---- vtd_sync_hw_info() >>> iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- >> vtd_machine_done_hook() >>> iommu->host_cap/ecap is a temporary storage to hold intermediate value >>> when synthesize host cap/ecap and vIOMMU's initial configured cap/ecap. >>> >>> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >>> --- >>> include/hw/i386/intel_iommu.h | 4 ++ >>> hw/i386/intel_iommu.c | 78 >> +++++++++++++++++++++++++++++++---- >>> 2 files changed, 75 insertions(+), 7 deletions(-) >>> >>> diff --git a/include/hw/i386/intel_iommu.h >> b/include/hw/i386/intel_iommu.h >>> index c65fdde56f..b8abbcce12 100644 >>> --- a/include/hw/i386/intel_iommu.h >>> +++ b/include/hw/i386/intel_iommu.h >>> @@ -292,6 +292,9 @@ struct IntelIOMMUState { >>> uint64_t cap; /* The value of capability reg */ >>> uint64_t ecap; /* The value of extended capability reg */ >>> >>> + uint64_t host_cap; /* The value of host capability reg */ >>> + uint64_t host_ecap; /* The value of host ext-capability reg */ >>> + >>> uint32_t context_cache_gen; /* Should be in [1,MAX] */ >>> GHashTable *iotlb; /* IOTLB */ >>> >>> @@ -314,6 +317,7 @@ struct IntelIOMMUState { >>> bool dma_translation; /* Whether DMA translation supported */ >>> bool pasid; /* Whether to support PASID */ >>> >>> + bool cap_finalized; /* Whether VTD capability finalized */ >>> /* >>> * Protects IOMMU states in general. Currently it protects the >>> * per-IOMMU IOTLB cache, and context entry cache in >> VTDAddressSpace. >>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >>> index 4c1d058ebd..be03fcbf52 100644 >>> --- a/hw/i386/intel_iommu.c >>> +++ b/hw/i386/intel_iommu.c >>> @@ -3819,6 +3819,47 @@ VTDAddressSpace >> *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, >>> return vtd_dev_as; >>> } >>> >>> +static bool vtd_sync_hw_info(IntelIOMMUState *s, struct >> iommu_hw_info_vtd *vtd, >>> + Error **errp) >>> +{ >>> + uint64_t addr_width; >>> + >>> + addr_width = (vtd->cap_reg >> 16) & 0x3fULL; >>> + if (s->aw_bits > addr_width) { >>> + error_setg(errp, "User aw-bits: %u > host address width: %lu", >>> + s->aw_bits, addr_width); >>> + return false; >>> + } >>> + >>> + /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */ >>> + >>> + return true; >>> +} >>> + >>> +/* >>> + * virtual VT-d which wants nested needs to check the host IOMMU >>> + * nesting cap info behind the assigned devices. Thus that vIOMMU >>> + * could bind guest page table to host. >>> + */ >>> +static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice *idev, >>> + Error **errp) >>> +{ >>> + struct iommu_hw_info_vtd vtd; >>> + enum iommu_hw_info_type type = >> IOMMU_HW_INFO_TYPE_INTEL_VTD; >>> + >>> + if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) { >>> + error_setg(errp, "Failed to get IOMMU capability!!!"); >>> + return false; >>> + } >>> + >>> + if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) { >>> + error_setg(errp, "IOMMU hardware is not compatible!!!"); >>> + return false; >>> + } >>> + >>> + return vtd_sync_hw_info(s, &vtd, errp); >>> +} >>> + >>> static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int32_t >> devfn, >>> IOMMUFDDevice *idev, Error **errp) >>> { >>> @@ -3837,6 +3878,10 @@ static int vtd_dev_set_iommu_device(PCIBus >> *bus, void *opaque, int32_t devfn, >>> return 0; >>> } >>> >>> + if (!vtd_check_idev(s, idev, errp)) {In >> In >> [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for >> hotplugged devices >> https://lore.kernel.org/all/20240117080414.316890-1- >> eric.auger@redhat.com/ >> >> I also attempt to pass host iommu info to the virtio-iommu but with >> legacy BE. > I think your patch works with iommufd BE too😊 Because iommufd BE > also fills bcontainer->iova_ranges in iommufd_cdev_get_info_iova_range(). correct. I wanted to emphasize that we also have the need to pass host iommu info in legacy mode for instance. In this series you introduce an object that works with the iommufd backed but I think if we go this way we would need another one for the legacy device. So maybe introducing a base object derived into 2 ones may be the most appropriate? Maybe, given the assumption that we will use iommufd for new use cases this legacy object will implement much fewer interfaces but still. > >> In my case I want to pass the reserved memory regions which >> also model the aw. >> So this is a pretty similar use case. > Yes. > >> Why don't we pass the pointer to an opaque iommu_hw_info instead, >> through the PCIIOMMUOps? > Passing iommu_hw_info is ok for this series, but we want more from > IOMMUFDDevice in nesting series. I.e., allocate/free ioas/hwpt, > attach/detach from hwpt, get dirty bitmap, etc. It's more flexible to > let vIOMMU get what it want itself. OK, would be interesting to define the class for this object. Worth to be introduced either in the cover letter or in the 1st patch Eric > >> >> >>> + return -1; >>> + } >>> + >>> vtd_iommu_lock(s); >>> >>> vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key); >>> @@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s) >>> { >>> X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); >>> >>> - memset(s->csr, 0, DMAR_REG_SIZE); >>> - memset(s->wmask, 0, DMAR_REG_SIZE); >>> - memset(s->w1cmask, 0, DMAR_REG_SIZE); >>> - memset(s->womask, 0, DMAR_REG_SIZE); >>> + /* CAP/ECAP are initialized in machine create done stage */ >>> + memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >> DMAR_GCMD_REG); >>> + memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >> DMAR_GCMD_REG); >>> + memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >> DMAR_GCMD_REG); >>> + memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >> DMAR_GCMD_REG); >> This change is not documented in the commit msg. >> Sorry I don't get why this is needed? > I'll doc it. Above we have one line to explain when cap/ecap are initialized. > vtd_init() is called in qemu init and guest reset. In qemu init, > Cap/ecap is finalized, after that we don't want cap/ecap to be changed. > So we bypass change to cap/ecap here. > >>> s->root = 0; >>> s->root_scalable = false; >>> @@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s) >>> vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; >>> } >>> >>> - vtd_cap_init(s); >>> + if (!s->cap_finalized) { >>> + vtd_cap_init(s); >>> + s->host_cap = s->cap; >>> + s->host_ecap = s->ecap; >>> + } >>> + >>> vtd_reset_caches(s); >>> >>> /* Define registers with default values and bit semantics */ >>> vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); >>> - vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >>> - vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >>> vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); >>> vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); >>> vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); >>> @@ -4241,6 +4290,12 @@ static bool >> vtd_decide_config(IntelIOMMUState *s, Error **errp) >>> return true; >>> } >>> >>> +static void vtd_setup_capability_reg(IntelIOMMUState *s) >>> +{ >>> + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >>> + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >>> +} >>> + >>> static int vtd_machine_done_notify_one(Object *child, void *unused) >>> { >>> IntelIOMMUState *iommu = >> INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >>> @@ -4259,6 +4314,14 @@ static int >> vtd_machine_done_notify_one(Object *child, void *unused) >>> static void vtd_machine_done_hook(Notifier *notifier, void *unused) >>> { >>> + IntelIOMMUState *iommu = >> INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >>> + >>> + iommu->cap = iommu->host_cap; >>> + iommu->ecap = iommu->host_ecap; >>> + iommu->cap_finalized = true; >> I don't think you can change the defaults like this without taking care >> of compats (migration). > Will bump vtd_vmstate .version_id works? > > Thanks > Zhenzhong > >> Thanks >> >> Eric >>> + >>> + vtd_setup_capability_reg(iommu); >>> + >>> object_child_foreach_recursive(object_get_root(), >>> vtd_machine_done_notify_one, NULL); >>> } >>> @@ -4292,6 +4355,7 @@ static void vtd_realize(DeviceState *dev, Error >> **errp) >>> QLIST_INIT(&s->vtd_as_with_notifiers); >>> qemu_mutex_init(&s->iommu_lock); >>> + s->cap_finalized = false; >>> memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, >>> "intel_iommu", DMAR_REG_SIZE); >>> memory_region_add_subregion(get_system_memory(),
>-----Original Message----- >From: Eric Auger <eric.auger@redhat.com> >Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check and >sync host IOMMU cap/ecap > > > >On 1/18/24 10:30, Duan, Zhenzhong wrote: >> Hi Eric, >> >>> -----Original Message----- >>> From: Eric Auger <eric.auger@redhat.com> >>> Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check >and >>> sync host IOMMU cap/ecap >>> >>> Hi Zhenzhong, >>> >>> On 1/15/24 11:13, Zhenzhong Duan wrote: >>>> From: Yi Liu <yi.l.liu@intel.com> >>>> >>>> Add a framework to check and synchronize host IOMMU cap/ecap with >>>> vIOMMU cap/ecap. >>>> >>>> Currently only stage-2 translation is supported which is backed by >>>> shadow page table on host side. So we don't need exact matching of >>>> each bit of cap/ecap between vIOMMU and host. However, we can still >>>> utilize this framework to ensure compatibility of host and vIOMMU's >>>> address width at least, i.e., vIOMMU's aw_bits <= host aw_bits, >>>> which is missed before. >>>> >>>> When stage-1 translation is supported in future, a.k.a. scalable >>>> modern mode, we need to ensure compatibility of each bits. Some >>>> bits are user controllable, they should be checked with host side >>>> to ensure compatibility. Other bits are not, they should be synced >>>> into vIOMMU cap/ecap for compatibility. >>>> >>>> The sequence will be: >>>> >>>> vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init() >>>> iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init() >>>> iommu->host_cap/ecap is checked and updated some bits with host >>> cap/ecap. ---- vtd_sync_hw_info() >>>> iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- >>> vtd_machine_done_hook() >>>> iommu->host_cap/ecap is a temporary storage to hold intermediate >value >>>> when synthesize host cap/ecap and vIOMMU's initial configured >cap/ecap. >>>> >>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >>>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >>>> --- >>>> include/hw/i386/intel_iommu.h | 4 ++ >>>> hw/i386/intel_iommu.c | 78 >>> +++++++++++++++++++++++++++++++---- >>>> 2 files changed, 75 insertions(+), 7 deletions(-) >>>> >>>> diff --git a/include/hw/i386/intel_iommu.h >>> b/include/hw/i386/intel_iommu.h >>>> index c65fdde56f..b8abbcce12 100644 >>>> --- a/include/hw/i386/intel_iommu.h >>>> +++ b/include/hw/i386/intel_iommu.h >>>> @@ -292,6 +292,9 @@ struct IntelIOMMUState { >>>> uint64_t cap; /* The value of capability reg */ >>>> uint64_t ecap; /* The value of extended capability reg */ >>>> >>>> + uint64_t host_cap; /* The value of host capability reg */ >>>> + uint64_t host_ecap; /* The value of host ext-capability reg */ >>>> + >>>> uint32_t context_cache_gen; /* Should be in [1,MAX] */ >>>> GHashTable *iotlb; /* IOTLB */ >>>> >>>> @@ -314,6 +317,7 @@ struct IntelIOMMUState { >>>> bool dma_translation; /* Whether DMA translation supported >*/ >>>> bool pasid; /* Whether to support PASID */ >>>> >>>> + bool cap_finalized; /* Whether VTD capability finalized */ >>>> /* >>>> * Protects IOMMU states in general. Currently it protects the >>>> * per-IOMMU IOTLB cache, and context entry cache in >>> VTDAddressSpace. >>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >>>> index 4c1d058ebd..be03fcbf52 100644 >>>> --- a/hw/i386/intel_iommu.c >>>> +++ b/hw/i386/intel_iommu.c >>>> @@ -3819,6 +3819,47 @@ VTDAddressSpace >>> *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, >>>> return vtd_dev_as; >>>> } >>>> >>>> +static bool vtd_sync_hw_info(IntelIOMMUState *s, struct >>> iommu_hw_info_vtd *vtd, >>>> + Error **errp) >>>> +{ >>>> + uint64_t addr_width; >>>> + >>>> + addr_width = (vtd->cap_reg >> 16) & 0x3fULL; >>>> + if (s->aw_bits > addr_width) { >>>> + error_setg(errp, "User aw-bits: %u > host address width: %lu", >>>> + s->aw_bits, addr_width); >>>> + return false; >>>> + } >>>> + >>>> + /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */ >>>> + >>>> + return true; >>>> +} >>>> + >>>> +/* >>>> + * virtual VT-d which wants nested needs to check the host IOMMU >>>> + * nesting cap info behind the assigned devices. Thus that vIOMMU >>>> + * could bind guest page table to host. >>>> + */ >>>> +static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice >*idev, >>>> + Error **errp) >>>> +{ >>>> + struct iommu_hw_info_vtd vtd; >>>> + enum iommu_hw_info_type type = >>> IOMMU_HW_INFO_TYPE_INTEL_VTD; >>>> + >>>> + if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) { >>>> + error_setg(errp, "Failed to get IOMMU capability!!!"); >>>> + return false; >>>> + } >>>> + >>>> + if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) { >>>> + error_setg(errp, "IOMMU hardware is not compatible!!!"); >>>> + return false; >>>> + } >>>> + >>>> + return vtd_sync_hw_info(s, &vtd, errp); >>>> +} >>>> + >>>> static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, >int32_t >>> devfn, >>>> IOMMUFDDevice *idev, Error **errp) >>>> { >>>> @@ -3837,6 +3878,10 @@ static int >vtd_dev_set_iommu_device(PCIBus >>> *bus, void *opaque, int32_t devfn, >>>> return 0; >>>> } >>>> >>>> + if (!vtd_check_idev(s, idev, errp)) {In >>> In >>> [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for >>> hotplugged devices >>> https://lore.kernel.org/all/20240117080414.316890-1- >>> eric.auger@redhat.com/ >>> >>> I also attempt to pass host iommu info to the virtio-iommu but with >>> legacy BE. >> I think your patch works with iommufd BE too😊 Because iommufd BE >> also fills bcontainer->iova_ranges in iommufd_cdev_get_info_iova_range(). >correct. I wanted to emphasize that we also have the need to pass host >iommu info in legacy mode for instance. In this series you introduce an >object that works with the iommufd backed but I think if we go this way >we would need another one for the legacy device. So maybe introducing a >base object derived into 2 ones may be the most appropriate? Maybe, >given the assumption that we will use iommufd for new use cases this >legacy object will implement much fewer interfaces but still. How about this: enum IOMMU_LEGACY_DEVICE_TYPE { IOMMU_LEGACY_VFIO_DEVICE, IOMMU_LEGACY_VDPA_DEVICE, } typedef struct IOMMULegacyDevice { enum IOMMU_LEGACY_DEVICE_TYPE type; /* common field */ union { .... } } IOMMULegacyDevice; typedef struct IOMMUFDDevice { IOMMUFDBackend *iommufd; uint32_t dev_id; uint32_t ioas_id; } IOMMUFDDevice; enum IOMMUDEVICE_TYPE { IOMMUFD_DEVICE, IOMMU_LEGACY_DEVICE, } struct IOMMUDevice { enum IOMMU_DEVICE_TYPE type; /* common field */ GList *iova_ranges; union { IOMMULegacyDevice legacy_dev; IOMMUFDDevice idev; } } >> >>> In my case I want to pass the reserved memory regions which >>> also model the aw. >>> So this is a pretty similar use case. >> Yes. >> >>> Why don't we pass the pointer to an opaque iommu_hw_info instead, >>> through the PCIIOMMUOps? >> Passing iommu_hw_info is ok for this series, but we want more from >> IOMMUFDDevice in nesting series. I.e., allocate/free ioas/hwpt, >> attach/detach from hwpt, get dirty bitmap, etc. It's more flexible to >> let vIOMMU get what it want itself. >OK, would be interesting to define the class for this object. Worth to >be introduced either in the cover letter or in the 1st patch Not a QOM class because we don't want it showed out through query-qmp-schema. Thanks Zhenzhong > >Eric >> >>> >>> >>>> + return -1; >>>> + } >>>> + >>>> vtd_iommu_lock(s); >>>> >>>> vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key); >>>> @@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s) >>>> { >>>> X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); >>>> >>>> - memset(s->csr, 0, DMAR_REG_SIZE); >>>> - memset(s->wmask, 0, DMAR_REG_SIZE); >>>> - memset(s->w1cmask, 0, DMAR_REG_SIZE); >>>> - memset(s->womask, 0, DMAR_REG_SIZE); >>>> + /* CAP/ECAP are initialized in machine create done stage */ >>>> + memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>> DMAR_GCMD_REG); >>>> + memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>> DMAR_GCMD_REG); >>>> + memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>> DMAR_GCMD_REG); >>>> + memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>> DMAR_GCMD_REG); >>> This change is not documented in the commit msg. >>> Sorry I don't get why this is needed? >> I'll doc it. Above we have one line to explain when cap/ecap are initialized. >> vtd_init() is called in qemu init and guest reset. In qemu init, >> Cap/ecap is finalized, after that we don't want cap/ecap to be changed. >> So we bypass change to cap/ecap here. >> >>>> s->root = 0; >>>> s->root_scalable = false; >>>> @@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s) >>>> vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; >>>> } >>>> >>>> - vtd_cap_init(s); >>>> + if (!s->cap_finalized) { >>>> + vtd_cap_init(s); >>>> + s->host_cap = s->cap; >>>> + s->host_ecap = s->ecap; >>>> + } >>>> + >>>> vtd_reset_caches(s); >>>> >>>> /* Define registers with default values and bit semantics */ >>>> vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); >>>> - vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >>>> - vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >>>> vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); >>>> vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); >>>> vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); >>>> @@ -4241,6 +4290,12 @@ static bool >>> vtd_decide_config(IntelIOMMUState *s, Error **errp) >>>> return true; >>>> } >>>> >>>> +static void vtd_setup_capability_reg(IntelIOMMUState *s) >>>> +{ >>>> + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >>>> + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >>>> +} >>>> + >>>> static int vtd_machine_done_notify_one(Object *child, void *unused) >>>> { >>>> IntelIOMMUState *iommu = >>> INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >>>> @@ -4259,6 +4314,14 @@ static int >>> vtd_machine_done_notify_one(Object *child, void *unused) >>>> static void vtd_machine_done_hook(Notifier *notifier, void *unused) >>>> { >>>> + IntelIOMMUState *iommu = >>> INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >>>> + >>>> + iommu->cap = iommu->host_cap; >>>> + iommu->ecap = iommu->host_ecap; >>>> + iommu->cap_finalized = true; >>> I don't think you can change the defaults like this without taking care >>> of compats (migration). >> Will bump vtd_vmstate .version_id works? >> >> Thanks >> Zhenzhong >> >>> Thanks >>> >>> Eric >>>> + >>>> + vtd_setup_capability_reg(iommu); >>>> + >>>> object_child_foreach_recursive(object_get_root(), >>>> vtd_machine_done_notify_one, NULL); >>>> } >>>> @@ -4292,6 +4355,7 @@ static void vtd_realize(DeviceState *dev, >Error >>> **errp) >>>> QLIST_INIT(&s->vtd_as_with_notifiers); >>>> qemu_mutex_init(&s->iommu_lock); >>>> + s->cap_finalized = false; >>>> memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, >>>> "intel_iommu", DMAR_REG_SIZE); >>>> memory_region_add_subregion(get_system_memory(),
On 1/19/24 12:55, Duan, Zhenzhong wrote: > >> -----Original Message----- >> From: Eric Auger <eric.auger@redhat.com> >> Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check and >> sync host IOMMU cap/ecap >> >> >> >> On 1/18/24 10:30, Duan, Zhenzhong wrote: >>> Hi Eric, >>> >>>> -----Original Message----- >>>> From: Eric Auger <eric.auger@redhat.com> >>>> Subject: Re: [PATCH rfcv1 6/6] intel_iommu: add a framework to check >> and >>>> sync host IOMMU cap/ecap >>>> >>>> Hi Zhenzhong, >>>> >>>> On 1/15/24 11:13, Zhenzhong Duan wrote: >>>>> From: Yi Liu <yi.l.liu@intel.com> >>>>> >>>>> Add a framework to check and synchronize host IOMMU cap/ecap with >>>>> vIOMMU cap/ecap. >>>>> >>>>> Currently only stage-2 translation is supported which is backed by >>>>> shadow page table on host side. So we don't need exact matching of >>>>> each bit of cap/ecap between vIOMMU and host. However, we can still >>>>> utilize this framework to ensure compatibility of host and vIOMMU's >>>>> address width at least, i.e., vIOMMU's aw_bits <= host aw_bits, >>>>> which is missed before. >>>>> >>>>> When stage-1 translation is supported in future, a.k.a. scalable >>>>> modern mode, we need to ensure compatibility of each bits. Some >>>>> bits are user controllable, they should be checked with host side >>>>> to ensure compatibility. Other bits are not, they should be synced >>>>> into vIOMMU cap/ecap for compatibility. >>>>> >>>>> The sequence will be: >>>>> >>>>> vtd_cap_init() initializes iommu->cap/ecap. ---- vtd_cap_init() >>>>> iommu->host_cap/ecap is initialized as iommu->cap/ecap. ---- vtd_init() >>>>> iommu->host_cap/ecap is checked and updated some bits with host >>>> cap/ecap. ---- vtd_sync_hw_info() >>>>> iommu->cap/ecap is finalized as iommu->host_cap/ecap. ---- >>>> vtd_machine_done_hook() >>>>> iommu->host_cap/ecap is a temporary storage to hold intermediate >> value >>>>> when synthesize host cap/ecap and vIOMMU's initial configured >> cap/ecap. >>>>> Signed-off-by: Yi Liu <yi.l.liu@intel.com> >>>>> Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> >>>>> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com> >>>>> --- >>>>> include/hw/i386/intel_iommu.h | 4 ++ >>>>> hw/i386/intel_iommu.c | 78 >>>> +++++++++++++++++++++++++++++++---- >>>>> 2 files changed, 75 insertions(+), 7 deletions(-) >>>>> >>>>> diff --git a/include/hw/i386/intel_iommu.h >>>> b/include/hw/i386/intel_iommu.h >>>>> index c65fdde56f..b8abbcce12 100644 >>>>> --- a/include/hw/i386/intel_iommu.h >>>>> +++ b/include/hw/i386/intel_iommu.h >>>>> @@ -292,6 +292,9 @@ struct IntelIOMMUState { >>>>> uint64_t cap; /* The value of capability reg */ >>>>> uint64_t ecap; /* The value of extended capability reg */ >>>>> >>>>> + uint64_t host_cap; /* The value of host capability reg */ >>>>> + uint64_t host_ecap; /* The value of host ext-capability reg */ >>>>> + >>>>> uint32_t context_cache_gen; /* Should be in [1,MAX] */ >>>>> GHashTable *iotlb; /* IOTLB */ >>>>> >>>>> @@ -314,6 +317,7 @@ struct IntelIOMMUState { >>>>> bool dma_translation; /* Whether DMA translation supported >> */ >>>>> bool pasid; /* Whether to support PASID */ >>>>> >>>>> + bool cap_finalized; /* Whether VTD capability finalized */ >>>>> /* >>>>> * Protects IOMMU states in general. Currently it protects the >>>>> * per-IOMMU IOTLB cache, and context entry cache in >>>> VTDAddressSpace. >>>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >>>>> index 4c1d058ebd..be03fcbf52 100644 >>>>> --- a/hw/i386/intel_iommu.c >>>>> +++ b/hw/i386/intel_iommu.c >>>>> @@ -3819,6 +3819,47 @@ VTDAddressSpace >>>> *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, >>>>> return vtd_dev_as; >>>>> } >>>>> >>>>> +static bool vtd_sync_hw_info(IntelIOMMUState *s, struct >>>> iommu_hw_info_vtd *vtd, >>>>> + Error **errp) >>>>> +{ >>>>> + uint64_t addr_width; >>>>> + >>>>> + addr_width = (vtd->cap_reg >> 16) & 0x3fULL; >>>>> + if (s->aw_bits > addr_width) { >>>>> + error_setg(errp, "User aw-bits: %u > host address width: %lu", >>>>> + s->aw_bits, addr_width); >>>>> + return false; >>>>> + } >>>>> + >>>>> + /* TODO: check and sync host cap/ecap into vIOMMU cap/ecap */ >>>>> + >>>>> + return true; >>>>> +} >>>>> + >>>>> +/* >>>>> + * virtual VT-d which wants nested needs to check the host IOMMU >>>>> + * nesting cap info behind the assigned devices. Thus that vIOMMU >>>>> + * could bind guest page table to host. >>>>> + */ >>>>> +static bool vtd_check_idev(IntelIOMMUState *s, IOMMUFDDevice >> *idev, >>>>> + Error **errp) >>>>> +{ >>>>> + struct iommu_hw_info_vtd vtd; >>>>> + enum iommu_hw_info_type type = >>>> IOMMU_HW_INFO_TYPE_INTEL_VTD; >>>>> + >>>>> + if (iommufd_device_get_info(idev, &type, sizeof(vtd), &vtd)) { >>>>> + error_setg(errp, "Failed to get IOMMU capability!!!"); >>>>> + return false; >>>>> + } >>>>> + >>>>> + if (type != IOMMU_HW_INFO_TYPE_INTEL_VTD) { >>>>> + error_setg(errp, "IOMMU hardware is not compatible!!!"); >>>>> + return false; >>>>> + } >>>>> + >>>>> + return vtd_sync_hw_info(s, &vtd, errp); >>>>> +} >>>>> + >>>>> static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, >> int32_t >>>> devfn, >>>>> IOMMUFDDevice *idev, Error **errp) >>>>> { >>>>> @@ -3837,6 +3878,10 @@ static int >> vtd_dev_set_iommu_device(PCIBus >>>> *bus, void *opaque, int32_t devfn, >>>>> return 0; >>>>> } >>>>> >>>>> + if (!vtd_check_idev(s, idev, errp)) {In >>>> In >>>> [RFC 0/7] VIRTIO-IOMMU/VFIO: Fix host iommu geometry handling for >>>> hotplugged devices >>>> https://lore.kernel.org/all/20240117080414.316890-1- >>>> eric.auger@redhat.com/ >>>> >>>> I also attempt to pass host iommu info to the virtio-iommu but with >>>> legacy BE. >>> I think your patch works with iommufd BE too😊 Because iommufd BE >>> also fills bcontainer->iova_ranges in iommufd_cdev_get_info_iova_range(). >> correct. I wanted to emphasize that we also have the need to pass host >> iommu info in legacy mode for instance. In this series you introduce an >> object that works with the iommufd backed but I think if we go this way >> we would need another one for the legacy device. So maybe introducing a >> base object derived into 2 ones may be the most appropriate? Maybe, >> given the assumption that we will use iommufd for new use cases this >> legacy object will implement much fewer interfaces but still. > How about this: > > enum IOMMU_LEGACY_DEVICE_TYPE { > IOMMU_LEGACY_VFIO_DEVICE, > IOMMU_LEGACY_VDPA_DEVICE, > } > > typedef struct IOMMULegacyDevice { > enum IOMMU_LEGACY_DEVICE_TYPE type; > > /* common field */ > > union { > .... > } > > } IOMMULegacyDevice; > > typedef struct IOMMUFDDevice { > IOMMUFDBackend *iommufd; > uint32_t dev_id; > uint32_t ioas_id; > } IOMMUFDDevice; > > enum IOMMUDEVICE_TYPE { > IOMMUFD_DEVICE, > IOMMU_LEGACY_DEVICE, > } > > struct IOMMUDevice { > enum IOMMU_DEVICE_TYPE type; > > /* common field */ > GList *iova_ranges; > > union { > IOMMULegacyDevice legacy_dev; yeah but that's not very nice to have this LegacyDevice def in an iommufd.c file Either we define an abstract HostAssignedDevice and derived objects for both legacy and IOMMUFD or we consider using a different API for legacy use cases (retrieving resv regions, page size mask, ...). Eric +int iommufd_device_get_info(IOMMUFDDevice *idev, + enum iommu_hw_info_type *type, + uint32_t len, void *data); +void iommufd_device_init(void *_idev, size_t instance_size, + IOMMUFDBackend *iommufd, uint32_t dev_id); > IOMMUFDDevice idev; > } > } > >>>> In my case I want to pass the reserved memory regions which >>>> also model the aw. >>>> So this is a pretty similar use case. >>> Yes. >>> >>>> Why don't we pass the pointer to an opaque iommu_hw_info instead, >>>> through the PCIIOMMUOps? >>> Passing iommu_hw_info is ok for this series, but we want more from >>> IOMMUFDDevice in nesting series. I.e., allocate/free ioas/hwpt, >>> attach/detach from hwpt, get dirty bitmap, etc. It's more flexible to >>> let vIOMMU get what it want itself. >> OK, would be interesting to define the class for this object. Worth to >> be introduced either in the cover letter or in the 1st patch > Not a QOM class because we don't want it showed out through > query-qmp-schema. > > Thanks > Zhenzhong > >> Eric >>>> >>>>> + return -1; >>>>> + } >>>>> + >>>>> vtd_iommu_lock(s); >>>>> >>>>> vtd_idev = g_hash_table_lookup(s->vtd_iommufd_dev, &key); >>>>> @@ -4071,10 +4116,11 @@ static void vtd_init(IntelIOMMUState *s) >>>>> { >>>>> X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); >>>>> >>>>> - memset(s->csr, 0, DMAR_REG_SIZE); >>>>> - memset(s->wmask, 0, DMAR_REG_SIZE); >>>>> - memset(s->w1cmask, 0, DMAR_REG_SIZE); >>>>> - memset(s->womask, 0, DMAR_REG_SIZE); >>>>> + /* CAP/ECAP are initialized in machine create done stage */ >>>>> + memset(s->csr + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>>> DMAR_GCMD_REG); >>>>> + memset(s->wmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>>> DMAR_GCMD_REG); >>>>> + memset(s->w1cmask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>>> DMAR_GCMD_REG); >>>>> + memset(s->womask + DMAR_GCMD_REG, 0, DMAR_REG_SIZE - >>>> DMAR_GCMD_REG); >>>> This change is not documented in the commit msg. >>>> Sorry I don't get why this is needed? >>> I'll doc it. Above we have one line to explain when cap/ecap are initialized. >>> vtd_init() is called in qemu init and guest reset. In qemu init, >>> Cap/ecap is finalized, after that we don't want cap/ecap to be changed. >>> So we bypass change to cap/ecap here. >>> >>>>> s->root = 0; >>>>> s->root_scalable = false; >>>>> @@ -4110,13 +4156,16 @@ static void vtd_init(IntelIOMMUState *s) >>>>> vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; >>>>> } >>>>> >>>>> - vtd_cap_init(s); >>>>> + if (!s->cap_finalized) { >>>>> + vtd_cap_init(s); >>>>> + s->host_cap = s->cap; >>>>> + s->host_ecap = s->ecap; >>>>> + } >>>>> + >>>>> vtd_reset_caches(s); >>>>> >>>>> /* Define registers with default values and bit semantics */ >>>>> vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); >>>>> - vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >>>>> - vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >>>>> vtd_define_long(s, DMAR_GCMD_REG, 0, 0xff800000UL, 0); >>>>> vtd_define_long_wo(s, DMAR_GCMD_REG, 0xff800000UL); >>>>> vtd_define_long(s, DMAR_GSTS_REG, 0, 0, 0); >>>>> @@ -4241,6 +4290,12 @@ static bool >>>> vtd_decide_config(IntelIOMMUState *s, Error **errp) >>>>> return true; >>>>> } >>>>> >>>>> +static void vtd_setup_capability_reg(IntelIOMMUState *s) >>>>> +{ >>>>> + vtd_define_quad(s, DMAR_CAP_REG, s->cap, 0, 0); >>>>> + vtd_define_quad(s, DMAR_ECAP_REG, s->ecap, 0, 0); >>>>> +} >>>>> + >>>>> static int vtd_machine_done_notify_one(Object *child, void *unused) >>>>> { >>>>> IntelIOMMUState *iommu = >>>> INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >>>>> @@ -4259,6 +4314,14 @@ static int >>>> vtd_machine_done_notify_one(Object *child, void *unused) >>>>> static void vtd_machine_done_hook(Notifier *notifier, void *unused) >>>>> { >>>>> + IntelIOMMUState *iommu = >>>> INTEL_IOMMU_DEVICE(x86_iommu_get_default()); >>>>> + >>>>> + iommu->cap = iommu->host_cap; >>>>> + iommu->ecap = iommu->host_ecap; >>>>> + iommu->cap_finalized = true; >>>> I don't think you can change the defaults like this without taking care >>>> of compats (migration). >>> Will bump vtd_vmstate .version_id works? >>> >>> Thanks >>> Zhenzhong >>> >>>> Thanks >>>> >>>> Eric >>>>> + >>>>> + vtd_setup_capability_reg(iommu); >>>>> + >>>>> object_child_foreach_recursive(object_get_root(), >>>>> vtd_machine_done_notify_one, NULL); >>>>> } >>>>> @@ -4292,6 +4355,7 @@ static void vtd_realize(DeviceState *dev, >> Error >>>> **errp) >>>>> QLIST_INIT(&s->vtd_as_with_notifiers); >>>>> qemu_mutex_init(&s->iommu_lock); >>>>> + s->cap_finalized = false; >>>>> memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, >>>>> "intel_iommu", DMAR_REG_SIZE); >>>>> memory_region_add_subregion(get_system_memory(),
© 2016 - 2024 Red Hat, Inc.