A 5-level paging capable VM may choose to use 57-bit IOVA address width.
E.g. guest applications like DPDK prefer to use its VA as IOVA when
performing VFIO map/unmap operations, to avoid the burden of managing the
IOVA space.
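For illustration only (this snippet is not part of the patch), the VA-as-IOVA
usage boils down to passing the buffer's virtual address as the IOVA in the
VFIO type1 map ioctl; the container fd and the buffer here are placeholders:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /* Map 'len' bytes at 'vaddr' so the device sees them at IOVA == vaddr. */
    static int map_va_as_iova(int container_fd, void *vaddr, size_t len)
    {
        struct vfio_iommu_type1_dma_map map;

        memset(&map, 0, sizeof(map));
        map.argsz = sizeof(map);
        map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
        map.vaddr = (uintptr_t)vaddr;
        map.iova  = (uintptr_t)vaddr;   /* the VA doubles as the IOVA */
        map.size  = len;

        return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
    }

With 5-level paging enabled in the guest, such a VA (and therefore the IOVA)
may exceed 48 bits, which is what the 57-bit vIOMMU address width is for.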
This patch extends the current vIOMMU logic to cover the extended address
width. When creating a VM with the 5-level paging feature, one can choose to
create a virtual VT-d with 5-level paging capability, using a configuration
like "-device intel-iommu,x-aw-bits=57".
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
---
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
---
hw/i386/intel_iommu.c | 54 ++++++++++++++++++++++++++++++++----------
hw/i386/intel_iommu_internal.h | 6 +++++
include/hw/i386/intel_iommu.h | 1 +
3 files changed, 49 insertions(+), 12 deletions(-)
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e772fca..9cdf755 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -664,16 +664,16 @@ static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
/*
* Rsvd field masks for spte:
- * Index [1] to [4] 4k pages
- * Index [5] to [8] large pages
+ * Index [1] to [5] 4k pages
+ * Index [6] to [10] large pages
*/
-static uint64_t vtd_paging_entry_rsvd_field[9];
+static uint64_t vtd_paging_entry_rsvd_field[11];
static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
{
if (slpte & VTD_SL_PT_PAGE_SIZE_MASK) {
/* Maybe large page */
- return slpte & vtd_paging_entry_rsvd_field[level + 4];
+ return slpte & vtd_paging_entry_rsvd_field[level + 5];
} else {
return slpte & vtd_paging_entry_rsvd_field[level];
}
@@ -3125,6 +3125,9 @@ static void vtd_init(IntelIOMMUState *s)
if (s->aw_bits == VTD_AW_48BIT) {
s->cap |= VTD_CAP_SAGAW_48bit;
}
+ else if (s->aw_bits == VTD_AW_57BIT) {
+ s->cap |= VTD_CAP_SAGAW_57bit | VTD_CAP_SAGAW_48bit;
+ }
s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
s->haw_bits = cpu->phys_bits;
@@ -3136,10 +3139,12 @@ static void vtd_init(IntelIOMMUState *s)
vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->haw_bits);
vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->haw_bits);
vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->haw_bits);
- vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->haw_bits);
- vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->haw_bits);
- vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->haw_bits);
- vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->haw_bits);
+ vtd_paging_entry_rsvd_field[5] = VTD_SPTE_PAGE_L5_RSVD_MASK(s->haw_bits);
+ vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->haw_bits);
+ vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->haw_bits);
+ vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->haw_bits);
+ vtd_paging_entry_rsvd_field[9] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->haw_bits);
+ vtd_paging_entry_rsvd_field[10] = VTD_SPTE_LPAGE_L5_RSVD_MASK(s->haw_bits);
if (x86_iommu->intr_supported) {
s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
@@ -3238,6 +3243,23 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
return &vtd_as->as;
}
+static bool host_has_la57(void)
+{
+ uint32_t ecx, unused;
+
+ host_cpuid(7, 0, &unused, &unused, &ecx, &unused);
+ return ecx & CPUID_7_0_ECX_LA57;
+}
+
+static bool guest_has_la57(void)
+{
+ CPUState *cs = first_cpu;
+ X86CPU *cpu = X86_CPU(cs);
+ CPUX86State *env = &cpu->env;
+
+ return env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_LA57;
+}
+
static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
{
X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
@@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
}
}
- /* Currently only address widths supported are 39 and 48 bits */
+ /* Currently address widths supported are 39, 48, and 57 bits */
if ((s->aw_bits != VTD_AW_39BIT) &&
- (s->aw_bits != VTD_AW_48BIT)) {
- error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
- VTD_AW_39BIT, VTD_AW_48BIT);
+ (s->aw_bits != VTD_AW_48BIT) &&
+ (s->aw_bits != VTD_AW_57BIT)) {
+ error_setg(errp, "Supported values for x-aw-bits are: %d, %d, %d",
+ VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
+ return false;
+ }
+
+ if ((s->aw_bits == VTD_AW_57BIT) &&
+ !(host_has_la57() && guest_has_la57())) {
+ error_setg(errp, "Do not support 57-bit DMA address, unless both "
+ "host and guest are capable of 5-level paging.\n");
return false;
}
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index d084099..a7ef24b 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -212,6 +212,8 @@
#define VTD_CAP_SAGAW_39bit (0x2ULL << VTD_CAP_SAGAW_SHIFT)
/* 48-bit AGAW, 4-level page-table */
#define VTD_CAP_SAGAW_48bit (0x4ULL << VTD_CAP_SAGAW_SHIFT)
+ /* 57-bit AGAW, 5-level page-table */
+#define VTD_CAP_SAGAW_57bit (0x8ULL << VTD_CAP_SAGAW_SHIFT)
/* IQT_REG */
#define VTD_IQT_QT(val) (((val) >> 4) & 0x7fffULL)
@@ -379,6 +381,8 @@ typedef union VTDInvDesc VTDInvDesc;
(0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
#define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \
(0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L5_RSVD_MASK(aw) \
+ (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
#define VTD_SPTE_LPAGE_L1_RSVD_MASK(aw) \
(0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw) \
@@ -387,6 +391,8 @@ typedef union VTDInvDesc VTDInvDesc;
(0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
#define VTD_SPTE_LPAGE_L4_RSVD_MASK(aw) \
(0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L5_RSVD_MASK(aw) \
+ (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
/* Information about page-selective IOTLB invalidate */
struct VTDIOTLBPageInvInfo {
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 820451c..7474c4f 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -49,6 +49,7 @@
#define DMAR_REG_SIZE 0x230
#define VTD_AW_39BIT 39
#define VTD_AW_48BIT 48
+#define VTD_AW_57BIT 57
#define VTD_ADDRESS_WIDTH VTD_AW_39BIT
#define VTD_HAW_MASK(aw) ((1ULL << (aw)) - 1)
--
1.9.1
On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> E.g. guest applications like DPDK prefer to use its VA as IOVA when
> performing VFIO map/unmap operations, to avoid the burden of managing the
> IOVA space.
Since you mentioned about DPDK... I'm just curious that whether have
you tested the patchset with the 57bit-enabled machines with DPDK VA
mode running in the guest? That would be something nice to mention in
the cover letter if you have.
[...]
> @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
> }
> }
>
> - /* Currently only address widths supported are 39 and 48 bits */
> + /* Currently address widths supported are 39, 48, and 57 bits */
> if ((s->aw_bits != VTD_AW_39BIT) &&
> - (s->aw_bits != VTD_AW_48BIT)) {
> - error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> - VTD_AW_39BIT, VTD_AW_48BIT);
> + (s->aw_bits != VTD_AW_48BIT) &&
> + (s->aw_bits != VTD_AW_57BIT)) {
> + error_setg(errp, "Supported values for x-aw-bits are: %d, %d, %d",
> + VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> + return false;
> + }
> +
> + if ((s->aw_bits == VTD_AW_57BIT) &&
> + !(host_has_la57() && guest_has_la57())) {
> + error_setg(errp, "Do not support 57-bit DMA address, unless both "
> + "host and guest are capable of 5-level paging.\n");
Is there any context (or pointer to previous discussions would work
too) on explaining why we don't support some scenarios like
host_paw=48,guest_paw=48,guest_gaw=57?
Thanks,
--
Peter Xu
On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > performing VFIO map/unmap operations, to avoid the burden of managing the
> > IOVA space.
>
> Since you mentioned about DPDK... I'm just curious that whether have
> you tested the patchset with the 57bit-enabled machines with DPDK VA
> mode running in the guest? That would be something nice to mention in
> the cover letter if you have.
>
Hah. Maybe I shall not mention DPDK here.
The story is that we heard the requirement, saying applications like DPDK
would need 5-level paging in IOMMU side. And I was convinced after checked
DPDK code, seeing it may use VA as IOVA directly. But I did not test this
patch with DPDK.
Instead, I used kvm-unit-test to verify this patch series. And of course, I
also did some modification to the test case. Patch for the test also sent out
at https://www.spinics.net/lists/kvm/msg177425.html.
> [...]
>
> > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
> > }
> > }
> >
> > - /* Currently only address widths supported are 39 and 48 bits */
> > + /* Currently address widths supported are 39, 48, and 57 bits */
> > if ((s->aw_bits != VTD_AW_39BIT) &&
> > - (s->aw_bits != VTD_AW_48BIT)) {
> > - error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> > - VTD_AW_39BIT, VTD_AW_48BIT);
> > + (s->aw_bits != VTD_AW_48BIT) &&
> > + (s->aw_bits != VTD_AW_57BIT)) {
> > + error_setg(errp, "Supported values for x-aw-bits are: %d, %d, %d",
> > + VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > + return false;
> > + }
> > +
> > + if ((s->aw_bits == VTD_AW_57BIT) &&
> > + !(host_has_la57() && guest_has_la57())) {
> > + error_setg(errp, "Do not support 57-bit DMA address, unless both "
> > + "host and guest are capable of 5-level paging.\n");
>
> Is there any context (or pointer to previous discussions would work
> too) on explaining why we don't support some scenarios like
> host_paw=48,guest_paw=48,guest_gaw=57?
>
Well, above check is only to make sure both the host and the guest can
use 57bit linear address, which requires 5-level paging. So I believe
we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
The guest_has_la57() means the guest can use 57-bit linear address,
regardless of its physical address width.
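Just to illustrate the distinction (a stand-alone sketch, not what the patch
uses -- the patch reads host_cpuid() and the guest's FEAT_7_0_ECX): the
57-bit linear address capability and the physical address width are reported
by different CPUID leaves:

    #include <stdbool.h>
    #include <cpuid.h>

    /* CPUID.(EAX=7,ECX=0):ECX bit 16 -- 57-bit linear addresses (LA57). */
    static bool cpu_has_la57(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            return false;
        }
        return ecx & (1u << 16);
    }

    /* CPUID.80000008H:EAX[7:0] -- physical address width, a separate field. */
    static unsigned int cpu_phys_bits(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
            return 36;  /* fallback when the leaf is absent */
        }
        return eax & 0xff;
    }

I.e. a vCPU can report a 48-bit physical address width and still advertise
LA57, which is all the check above cares about.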
> Thanks,
>
> --
> Peter Xu
>
B.R.
Yu
On Mon, Nov 12, 2018 at 05:42:01PM +0800, Yu Zhang wrote:
> On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> > On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > > A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> > > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > > performing VFIO map/unmap operations, to avoid the burden of managing the
> > > IOVA space.
> >
> > Since you mentioned about DPDK... I'm just curious that whether have
> > you tested the patchset with the 57bit-enabled machines with DPDK VA
> > mode running in the guest? That would be something nice to mention in
> > the cover letter if you have.
> >
>
> Hah. Maybe I shall not mention DPDK here.
>
> The story is that we heard the requirement, saying applications like DPDK
> would need 5-level paging in IOMMU side. And I was convinced after checked
> DPDK code, seeing it may use VA as IOVA directly. But I did not test this
> patch with DPDK.
>
> Instead, I used kvm-unit-test to verify this patch series. And of course, I
> also did some modification to the test case. Patch for the test also sent out
> at https://www.spinics.net/lists/kvm/msg177425.html.
Yeah that's perfectly fine for me. So instead maybe you can also
mention the kvm-unit-test in the cover letter if you gonna repost.
>
> > [...]
> >
> > > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
> > > }
> > > }
> > >
> > > - /* Currently only address widths supported are 39 and 48 bits */
> > > + /* Currently address widths supported are 39, 48, and 57 bits */
> > > if ((s->aw_bits != VTD_AW_39BIT) &&
> > > - (s->aw_bits != VTD_AW_48BIT)) {
> > > - error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> > > - VTD_AW_39BIT, VTD_AW_48BIT);
> > > + (s->aw_bits != VTD_AW_48BIT) &&
> > > + (s->aw_bits != VTD_AW_57BIT)) {
> > > + error_setg(errp, "Supported values for x-aw-bits are: %d, %d, %d",
> > > + VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > + return false;
> > > + }
> > > +
> > > + if ((s->aw_bits == VTD_AW_57BIT) &&
> > > + !(host_has_la57() && guest_has_la57())) {
> > > + error_setg(errp, "Do not support 57-bit DMA address, unless both "
> > > + "host and guest are capable of 5-level paging.\n");
> >
> > Is there any context (or pointer to previous discussions would work
> > too) on explaining why we don't support some scenarios like
> > host_paw=48,guest_paw=48,guest_gaw=57?
> >
>
> Well, above check is only to make sure both the host and the guest can
> use 57bit linear address, which requires 5-level paging. So I believe
> we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
> The guest_has_la57() means the guest can use 57-bit linear address,
> regardless of its physical address width.
Sorry for my incorrect wording. I mean when host/guest CPU only
support 4-level LA then would/should we allow the guest IOMMU to
support 5-level IOVA? Asked since I'm thinking whether I can run the
series a bit with my laptop/servers.
Since at it, another thing I thought about is making sure the IOMMU
capabilities will match between host and guest IOMMU, which I think
this series has ignorred so far. E.g., when we're having assigned
devices in the guest and with 5-level IOVA, we should make sure the
host IOMMU supports 5-level as well before the guest starts since
otherwise the shadow page synchronization could potentially fail when
the requested IOVA address goes beyond 4-level. One simple solution
is just to disable device assignment for now when we're with 57bits
vIOMMU but I'm not sure whether that's what you want, especially you
mentioned the DPDK case (who may use assigned devices).
(sorry to have mentioned the dpdk case again :)
Regards,
--
Peter Xu
On Tue, Nov 13, 2018 at 11:37:07AM +0800, Peter Xu wrote:
> On Mon, Nov 12, 2018 at 05:42:01PM +0800, Yu Zhang wrote:
> > On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> > > On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > > > A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> > > > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > > > performing VFIO map/unmap operations, to avoid the burden of managing the
> > > > IOVA space.
> > >
> > > Since you mentioned about DPDK... I'm just curious that whether have
> > > you tested the patchset with the 57bit-enabled machines with DPDK VA
> > > mode running in the guest? That would be something nice to mention in
> > > the cover letter if you have.
> > >
> >
> > Hah. Maybe I shall not mention DPDK here.
> >
> > The story is that we heard the requirement, saying applications like DPDK
> > would need 5-level paging in IOMMU side. And I was convinced after checked
> > DPDK code, seeing it may use VA as IOVA directly. But I did not test this
> > patch with DPDK.
> >
> > Instead, I used kvm-unit-test to verify this patch series. And of course, I
> > also did some modification to the test case. Patch for the test also sent out
> > at https://www.spinics.net/lists/kvm/msg177425.html.
>
> Yeah that's perfectly fine for me. So instead maybe you can also
> mention the kvm-unit-test in the cover letter if you gonna repost.
>
> >
> > > [...]
> > >
> > > > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
> > > > }
> > > > }
> > > >
> > > > - /* Currently only address widths supported are 39 and 48 bits */
> > > > + /* Currently address widths supported are 39, 48, and 57 bits */
> > > > if ((s->aw_bits != VTD_AW_39BIT) &&
> > > > - (s->aw_bits != VTD_AW_48BIT)) {
> > > > - error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> > > > - VTD_AW_39BIT, VTD_AW_48BIT);
> > > > + (s->aw_bits != VTD_AW_48BIT) &&
> > > > + (s->aw_bits != VTD_AW_57BIT)) {
> > > > + error_setg(errp, "Supported values for x-aw-bits are: %d, %d, %d",
> > > > + VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > > + return false;
> > > > + }
> > > > +
> > > > + if ((s->aw_bits == VTD_AW_57BIT) &&
> > > > + !(host_has_la57() && guest_has_la57())) {
> > > > + error_setg(errp, "Do not support 57-bit DMA address, unless both "
> > > > + "host and guest are capable of 5-level paging.\n");
> > >
> > > Is there any context (or pointer to previous discussions would work
> > > too) on explaining why we don't support some scenarios like
> > > host_paw=48,guest_paw=48,guest_gaw=57?
> > >
> >
> > Well, above check is only to make sure both the host and the guest can
> > use 57bit linear address, which requires 5-level paging. So I believe
> > we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
> > The guest_has_la57() means the guest can use 57-bit linear address,
> > regardless of its physical address width.
>
> Sorry for my incorrect wording. I mean when host/guest CPU only
> support 4-level LA then would/should we allow the guest IOMMU to
> support 5-level IOVA? Asked since I'm thinking whether I can run the
> series a bit with my laptop/servers.
[...]
>
> Since at it, another thing I thought about is making sure the IOMMU
> capabilities will match between host and guest IOMMU, which I think
> this series has ignorred so far. E.g., when we're having assigned
> devices in the guest and with 5-level IOVA, we should make sure the
> host IOMMU supports 5-level as well before the guest starts since
> otherwise the shadow page synchronization could potentially fail when
> the requested IOVA address goes beyond 4-level. One simple solution
> is just to disable device assignment for now when we're with 57bits
> vIOMMU but I'm not sure whether that's what you want, especially you
> mentioned the DPDK case (who may use assigned devices).
Ok I totally forgot that we don't even support any kind of check like
this before... So feel free to skip this comment if you want, or it
would be even nicer if you want to fix it as a whole. :)
Regards,
--
Peter Xu
On Tue, Nov 13, 2018 at 01:04:51PM +0800, Peter Xu wrote:
> On Tue, Nov 13, 2018 at 11:37:07AM +0800, Peter Xu wrote:
> > On Mon, Nov 12, 2018 at 05:42:01PM +0800, Yu Zhang wrote:
> > > On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> > > > On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > > > > A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> > > > > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > > > > performing VFIO map/unmap operations, to avoid the burden of managing the
> > > > > IOVA space.
> > > >
> > > > Since you mentioned about DPDK... I'm just curious that whether have
> > > > you tested the patchset with the 57bit-enabled machines with DPDK VA
> > > > mode running in the guest? That would be something nice to mention in
> > > > the cover letter if you have.
> > > >
> > >
> > > Hah. Maybe I shall not mention DPDK here.
> > >
> > > The story is that we heard the requirement, saying applications like DPDK
> > > would need 5-level paging in IOMMU side. And I was convinced after checked
> > > DPDK code, seeing it may use VA as IOVA directly. But I did not test this
> > > patch with DPDK.
> > >
> > > Instead, I used kvm-unit-test to verify this patch series. And of course, I
> > > also did some modification to the test case. Patch for the test also sent out
> > > at https://www.spinics.net/lists/kvm/msg177425.html.
> >
> > Yeah that's perfectly fine for me. So instead maybe you can also
> > mention the kvm-unit-test in the cover letter if you gonna repost.
> >
> > >
> > > > [...]
> > > >
> > > > > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
> > > > > }
> > > > > }
> > > > >
> > > > > - /* Currently only address widths supported are 39 and 48 bits */
> > > > > + /* Currently address widths supported are 39, 48, and 57 bits */
> > > > > if ((s->aw_bits != VTD_AW_39BIT) &&
> > > > > - (s->aw_bits != VTD_AW_48BIT)) {
> > > > > - error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> > > > > - VTD_AW_39BIT, VTD_AW_48BIT);
> > > > > + (s->aw_bits != VTD_AW_48BIT) &&
> > > > > + (s->aw_bits != VTD_AW_57BIT)) {
> > > > > + error_setg(errp, "Supported values for x-aw-bits are: %d, %d, %d",
> > > > > + VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > > > + return false;
> > > > > + }
> > > > > +
> > > > > + if ((s->aw_bits == VTD_AW_57BIT) &&
> > > > > + !(host_has_la57() && guest_has_la57())) {
> > > > > + error_setg(errp, "Do not support 57-bit DMA address, unless both "
> > > > > + "host and guest are capable of 5-level paging.\n");
> > > >
> > > > Is there any context (or pointer to previous discussions would work
> > > > too) on explaining why we don't support some scenarios like
> > > > host_paw=48,guest_paw=48,guest_gaw=57?
> > > >
> > >
> > > Well, above check is only to make sure both the host and the guest can
> > > use 57bit linear address, which requires 5-level paging. So I believe
> > > we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
> > > The guest_has_la57() means the guest can use 57-bit linear address,
> > > regardless of its physical address width.
> >
> > Sorry for my incorrect wording. I mean when host/guest CPU only
> > support 4-level LA then would/should we allow the guest IOMMU to
> > support 5-level IOVA? Asked since I'm thinking whether I can run the
> > series a bit with my laptop/servers.
>
> [...]
>
> >
> > Since at it, another thing I thought about is making sure the IOMMU
> > capabilities will match between host and guest IOMMU, which I think
> > this series has ignorred so far. E.g., when we're having assigned
> > devices in the guest and with 5-level IOVA, we should make sure the
> > host IOMMU supports 5-level as well before the guest starts since
> > otherwise the shadow page synchronization could potentially fail when
> > the requested IOVA address goes beyond 4-level. One simple solution
> > is just to disable device assignment for now when we're with 57bits
> > vIOMMU but I'm not sure whether that's what you want, especially you
> > mentioned the DPDK case (who may use assigned devices).
>
> Ok I totally forgot that we don't even support any kind of check like
> this before... So feel free to skip this comment if you want, or it
> would be even nicer if you want to fix it as a whole. :)
>
Indeed. We have talked about this before. How about we focus on the 5-level
extension for now, and solve the check issue in the future? I still do not
have any clean solutions in mind. BTW, any suggestions on this issue? :)
> Regards,
>
> --
> Peter Xu
>
B.R.
Yu
On Tue, Nov 13, 2018 at 01:45:44PM +0800, Yu Zhang wrote:
[...]
> > > Since at it, another thing I thought about is making sure the IOMMU
> > > capabilities will match between host and guest IOMMU, which I think
> > > this series has ignorred so far. E.g., when we're having assigned
> > > devices in the guest and with 5-level IOVA, we should make sure the
> > > host IOMMU supports 5-level as well before the guest starts since
> > > otherwise the shadow page synchronization could potentially fail when
> > > the requested IOVA address goes beyond 4-level. One simple solution
> > > is just to disable device assignment for now when we're with 57bits
> > > vIOMMU but I'm not sure whether that's what you want, especially you
> > > mentioned the DPDK case (who may use assigned devices).
> >
> > Ok I totally forgot that we don't even support any kind of check like
> > this before... So feel free to skip this comment if you want, or it
> > would be even nicer if you want to fix it as a whole. :)
> >
>
> Indeed. We have talked about this before. How about we focus on the 5-level
> extension for now, and solve the check issue in the future? I still do not
> have any clean solutions in mind. BTW, any suggestions on this issue? :)
I started to remember our discussions, sorry I should remember them
earlier... :)
The only thing in my mind (I think I also suggested the same thing
during that discussion, but I don't trust my memory any more...) is to
use sysfs. Say:
1. Scan /sys/class/iommu/dmarN for all the host IOMMUs, read cap of
each IOMMU from /sys/class/iommu/dmar0/intel-iommu/cap,
2. For each host iommu, scan /sys/class/iommu/dmarN/devices for all
the devices under each host IOMMU, then we can know which IOMMU
owns which device,
3. For each assigned device to the guest, we lookup the previous
information to know the mgaw for each host device, raise error
and stop QEMU from booting if any of the host device has less
level supported than the guest vIOMMU (possibly some more checks
in vtd_iommu_notify_flag_changed)
(we still have some issue on vtd_iommu_notify_flag_changed since it's
only run until the first enablement of vIOMMU, so we'll only raise
the error during guest Linux boots with vIOMMU on. But that's another
issue)
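A rough, untested sketch of steps 1 and 3 (the helper name is made up and the
error handling is minimal) that reads the "cap" attribute above and compares
the host MGAW against the guest address width:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <inttypes.h>

    /*
     * Read /sys/class/iommu/<dmar>/intel-iommu/cap and check whether this
     * host DRHD unit can handle at least 'guest_aw_bits' of IOVA.  Per the
     * VT-d spec, MGAW is bits 21:16 of the capability register and the
     * supported width is MGAW + 1.
     */
    static bool host_dmar_supports_aw(const char *dmar, int guest_aw_bits)
    {
        char path[128];
        uint64_t cap = 0;
        FILE *f;
        int mgaw;

        snprintf(path, sizeof(path),
                 "/sys/class/iommu/%s/intel-iommu/cap", dmar);
        f = fopen(path, "r");
        if (!f) {
            return false;                 /* no such DRHD unit exposed */
        }
        if (fscanf(f, "%" SCNx64, &cap) != 1) {
            fclose(f);
            return false;
        }
        fclose(f);

        mgaw = (int)((cap >> 16) & 0x3f) + 1;
        return mgaw >= guest_aw_bits;
    }

Step 2 would then walk /sys/class/iommu/dmarN/devices to find which dmar unit
owns each assigned device, and QEMU would call something like
host_dmar_supports_aw("dmar0", s->aw_bits) for that unit before booting.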
Regards,
--
Peter Xu
On Tue, Nov 13, 2018 at 02:12:17PM +0800, Peter Xu wrote:
> On Tue, Nov 13, 2018 at 01:45:44PM +0800, Yu Zhang wrote:
>
> [...]
>
> > > > Since at it, another thing I thought about is making sure the IOMMU
> > > > capabilities will match between host and guest IOMMU, which I think
> > > > this series has ignorred so far. E.g., when we're having assigned
> > > > devices in the guest and with 5-level IOVA, we should make sure the
> > > > host IOMMU supports 5-level as well before the guest starts since
> > > > otherwise the shadow page synchronization could potentially fail when
> > > > the requested IOVA address goes beyond 4-level. One simple solution
> > > > is just to disable device assignment for now when we're with 57bits
> > > > vIOMMU but I'm not sure whether that's what you want, especially you
> > > > mentioned the DPDK case (who may use assigned devices).
> > >
> > > Ok I totally forgot that we don't even support any kind of check like
> > > this before... So feel free to skip this comment if you want, or it
> > > would be even nicer if you want to fix it as a whole. :)
> > >
> >
> > Indeed. We have talked about this before. How about we focus on the 5-level
> > extension for now, and solve the check issue in the future? I still do not
> > have any clean solutions in mind. BTW, any suggestions on this issue? :)
>
> I started to remember our discussions, sorry I should remember them
> earlier... :)
>
> The only thing in my mind (I think I also suggested the same thing
> during that discussion, but I don't trust my memory any more...) is to
> use sysfs. Say:
>
> 1. Scan /sys/class/iommu/dmarN for all the host IOMMUs, read cap of
> each IOMMU from /sys/class/iommu/dmar0/intel-iommu/cap,
>
> 2. For each host iommu, scan /sys/class/iommu/dmarN/devices for all
> the devices under each host IOMMU, then we can know which IOMMU
> owns which device,
>
> 3. For each assigned device to the guest, we lookup the previous
> information to know the mgaw for each host device, raise error
> and stop QEMU from booting if any of the host device has less
> level supported than the guest vIOMMU (possibly some more checks
> in vtd_iommu_notify_flag_changed)
>
> (we still have some issue on vtd_iommu_notify_flag_changed since it's
> only run until the first enablement of vIOMMU, so we'll only raise
> the error during guest Linux boots with vIOMMU on. But that's another
> issue)
Thanks for the explanation, Peter. You do have a better memory than I am.:)
>
> Regards,
>
> --
> Peter Xu
>
B.R.
Yu
On Tue, Nov 13, 2018 at 11:37:07AM +0800, Peter Xu wrote:
> On Mon, Nov 12, 2018 at 05:42:01PM +0800, Yu Zhang wrote:
> > On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> > > On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > > > A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> > > > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > > > performing VFIO map/unmap operations, to avoid the burden of managing the
> > > > IOVA space.
> > >
> > > Since you mentioned about DPDK... I'm just curious that whether have
> > > you tested the patchset with the 57bit-enabled machines with DPDK VA
> > > mode running in the guest? That would be something nice to mention in
> > > the cover letter if you have.
> > >
> >
> > Hah. Maybe I shall not mention DPDK here.
> >
> > The story is that we heard the requirement, saying applications like DPDK
> > would need 5-level paging in IOMMU side. And I was convinced after checked
> > DPDK code, seeing it may use VA as IOVA directly. But I did not test this
> > patch with DPDK.
> >
> > Instead, I used kvm-unit-test to verify this patch series. And of course, I
> > also did some modification to the test case. Patch for the test also sent out
> > at https://www.spinics.net/lists/kvm/msg177425.html.
>
> Yeah that's perfectly fine for me. So instead maybe you can also
> mention the kvm-unit-test in the cover letter if you gonna repost.
Got it. Thanks!
>
> >
> > > [...]
> > >
> > > > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
> > > > }
> > > > }
> > > >
> > > > - /* Currently only address widths supported are 39 and 48 bits */
> > > > + /* Currently address widths supported are 39, 48, and 57 bits */
> > > > if ((s->aw_bits != VTD_AW_39BIT) &&
> > > > - (s->aw_bits != VTD_AW_48BIT)) {
> > > > - error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> > > > - VTD_AW_39BIT, VTD_AW_48BIT);
> > > > + (s->aw_bits != VTD_AW_48BIT) &&
> > > > + (s->aw_bits != VTD_AW_57BIT)) {
> > > > + error_setg(errp, "Supported values for x-aw-bits are: %d, %d, %d",
> > > > + VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > > + return false;
> > > > + }
> > > > +
> > > > + if ((s->aw_bits == VTD_AW_57BIT) &&
> > > > + !(host_has_la57() && guest_has_la57())) {
> > > > + error_setg(errp, "Do not support 57-bit DMA address, unless both "
> > > > + "host and guest are capable of 5-level paging.\n");
> > >
> > > Is there any context (or pointer to previous discussions would work
> > > too) on explaining why we don't support some scenarios like
> > > host_paw=48,guest_paw=48,guest_gaw=57?
> > >
> >
> > Well, above check is only to make sure both the host and the guest can
> > use 57bit linear address, which requires 5-level paging. So I believe
> > we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
> > The guest_has_la57() means the guest can use 57-bit linear address,
> > regardless of its physical address width.
>
> Sorry for my incorrect wording. I mean when host/guest CPU only
> support 4-level LA then would/should we allow the guest IOMMU to
> support 5-level IOVA? Asked since I'm thinking whether I can run the
> series a bit with my laptop/servers.
Well, by "only support", I guess you mean the hardware capability, instead
of its paging mode. So I do not think hardware will support 5-level IOVA for
platforms without 5-level VA. Therefore a 5-level vIOMMU is disallowed here. :)
>
> Since at it, another thing I thought about is making sure the IOMMU
> capabilities will match between host and guest IOMMU, which I think
> this series has ignorred so far. E.g., when we're having assigned
> devices in the guest and with 5-level IOVA, we should make sure the
> host IOMMU supports 5-level as well before the guest starts since
> otherwise the shadow page synchronization could potentially fail when
> the requested IOVA address goes beyond 4-level. One simple solution
> is just to disable device assignment for now when we're with 57bits
> vIOMMU but I'm not sure whether that's what you want, especially you
> mentioned the DPDK case (who may use assigned devices).
>
Thanks, Peter. Replied in the follow-up mail. :)
> (sorry to have mentioned the dpdk case again :)
>
> Regards,
>
> --
> Peter Xu
>
B.R.
Yu