[PATCH v7 1/2] qemu: Add support for HW-accelerated nested SMMUv3

Nathan Chen via Devel posted 2 patches 2 days, 11 hours ago
[PATCH v7 1/2] qemu: Add support for HW-accelerated nested SMMUv3
Posted by Nathan Chen via Devel 2 days, 11 hours ago
From: Nathan Chen <nathanc@nvidia.com>

Add support for enabling HW-accelerated nested SMMUv3 via the
"accel" attribute of the IOMMU <driver> element, along with
additional attributes for ATS, RIL, SSID size, and OAS configuration.
Also add a <vpasidCapOffset> element for specifying the PASID
capability offset of a PCI hostdev.

Signed-off-by: Nathan Chen <nathanc@nvidia.com>
---
 docs/formatdomain.rst             |  36 +++++++++++
 src/conf/domain_conf.c            | 101 +++++++++++++++++++++++++++++-
 src/conf/domain_conf.h            |   6 ++
 src/conf/domain_validate.c        |  36 ++++++++++-
 src/conf/schemas/domaincommon.rng |  32 ++++++++++
 src/qemu/qemu_command.c           |  23 +++++++
 src/util/virpci.h                 |   4 ++
 7 files changed, 233 insertions(+), 5 deletions(-)

diff --git a/docs/formatdomain.rst b/docs/formatdomain.rst
index 04ef319a73..998d289ebf 100644
--- a/docs/formatdomain.rst
+++ b/docs/formatdomain.rst
@@ -4880,6 +4880,13 @@ or:
    device; if PCI ROM loading is disabled through this attribute, attempts to
    tweak the loading process further using the ``bar`` or ``file`` attributes
    will be rejected. :since:`Since 4.3.0 (QEMU and KVM only)`.
+``vpasidCapOffset``
+   The ``vpasidCapOffset`` element is used to change the offset at which a
+   PASID PCIe extended capability is placed in a vfio-pci device's PCIe
+   extended configuration space. If not specified or set to 0, the capability
+   is placed at the end of the extended configuration space when PASID is
+   supported. The offset must be 4-byte aligned and within the PCIe extended
+   configuration space.
 ``address``
    The ``address`` element for USB devices has a ``bus`` and ``device``
    attribute to specify the USB bus and device number the device appears at on
@@ -9264,6 +9271,35 @@ Example:
       The ``pciBus`` attribute notes the index of the controller that an
       IOMMU device is attached to. (QEMU/KVM and ``smmuv3`` model only)
 
+   ``accel``
+      The ``accel`` attribute with possible values ``on`` and ``off`` can be used
+      to enable hardware acceleration support for smmuv3Dev IOMMU devices.
+      (QEMU/KVM and ``smmuv3`` model only)
+
+   ``ats``
+      The ``ats`` attribute with possible values ``on`` and ``off`` can be used
+      to enable reporting Address Translation Services capability to the guest
+      for smmuv3Dev IOMMU devices with ``accel`` set to ``on``, if the host
+      SMMUv3 supports ATS and the associated passthrough device supports ATS.
+      (QEMU/KVM and ``smmuv3`` model only)
+
+   ``ril``
+      The ``ril`` attribute with possible values ``on`` and ``off`` can be used
+      to report whether Range Invalidation for smmuv3Dev IOMMU devices with
+      ``accel`` set to ``on`` is compatible with host SMMUv3 support.
+      (QEMU/KVM and ``smmuv3`` model only)
+
+   ``ssidSize``
+      The ``ssidSize`` attribute sets the number of bits used to represent
+      SubstreamIDs. A value of N allows SSIDs in the range [0 .. 2^N - 1].
+      The valid range is 0-20, and a value greater than 0 is required for
+      enabling PASID support, as doing so advertises PASID capability to
+      the vIOMMU. (QEMU/KVM and ``smmuv3`` model only)
+
+   ``oas``
+      The ``oas`` attribute sets the output address size in units of bits.
+      (QEMU/KVM and ``smmuv3`` model only)
+
 The ``virtio`` IOMMU devices can further have ``address`` element as described
 in `Device addresses`_ (address has to by type of ``pci``).
 
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 9672168df9..e64484a60d 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -13678,6 +13678,7 @@ virDomainHostdevDefParseXML(virDomainXMLOption *xmlopt,
     virDomainHostdevDef *def;
     VIR_XPATH_NODE_AUTORESTORE(ctxt)
     unsigned int type;
+    int rc;
 
     ctxt->node = node;
 
@@ -13731,8 +13732,16 @@ virDomainHostdevDefParseXML(virDomainXMLOption *xmlopt,
                 def->shareable = true;
             break;
 
-        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
         case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
+            rc = virXPathUIntBase("string(./vpasidCapOffset)", ctxt,
+                                  0, &def->vpasidCapOffset);
+            if (rc == -2) {
+                virReportError(VIR_ERR_XML_ERROR, "%s",
+                               _("Invalid format for vpasidCapOffset"));
+                goto error;
+            }
+            break;
+        case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_USB:
         case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI_HOST:
         case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_MDEV:
         case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_LAST:
@@ -14514,6 +14523,26 @@ virDomainIOMMUDefParseXML(virDomainXMLOption *xmlopt,
         if (virXMLPropInt(driver, "pciBus", 10, VIR_XML_PROP_NONE,
                           &iommu->pci_bus, -1) < 0)
             return NULL;
+
+        if (virXMLPropTristateSwitch(driver, "accel", VIR_XML_PROP_NONE,
+                                     &iommu->accel) < 0)
+            return NULL;
+
+        if (virXMLPropTristateSwitch(driver, "ats", VIR_XML_PROP_NONE,
+                                     &iommu->ats) < 0)
+            return NULL;
+
+        if (virXMLPropTristateSwitch(driver, "ril", VIR_XML_PROP_NONE,
+                                     &iommu->ril) < 0)
+            return NULL;
+
+        if (virXMLPropUInt(driver, "ssidSize", 10, VIR_XML_PROP_NONE,
+                           &iommu->ssid_size) < 0)
+            return NULL;
+
+        if (virXMLPropUInt(driver, "oas", 10, VIR_XML_PROP_NONE,
+                           &iommu->oas) < 0)
+            return NULL;
     }
 
     if (virDomainDeviceInfoParseXML(xmlopt, node, ctxt,
@@ -16577,7 +16606,13 @@ virDomainIOMMUDefEquals(const virDomainIOMMUDef *a,
         a->eim != b->eim ||
         a->iotlb != b->iotlb ||
         a->aw_bits != b->aw_bits ||
-        a->dma_translation != b->dma_translation)
+        a->dma_translation != b->dma_translation ||
+        a->pci_bus != b->pci_bus ||
+        a->accel != b->accel ||
+        a->ats != b->ats ||
+        a->ril != b->ril ||
+        a->ssid_size != b->ssid_size ||
+        a->oas != b->oas)
         return false;
 
     if (a->info.type != VIR_DOMAIN_DEVICE_ADDRESS_TYPE_NONE &&
@@ -21349,6 +21384,14 @@ virDomainHostdevDefCheckABIStability(virDomainHostdevDef *src,
         }
     }
 
+    /* The offset is a plain integer, so report both values numerically. */
+    if (src->vpasidCapOffset != dst->vpasidCapOffset) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Target host device vPASID capability offset 0x%1$x does not match source 0x%2$x"),
+                       dst->vpasidCapOffset, src->vpasidCapOffset);
+        return false;
+    }
+
     if (!virDomainDeviceInfoCheckABIStability(src->info, dst->info))
         return false;
 
@@ -22311,6 +22354,36 @@ virDomainIOMMUDefCheckABIStability(virDomainIOMMUDef *src,
                        dst->pci_bus, src->pci_bus);
         return false;
     }
+    if (src->accel != dst->accel) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Target domain IOMMU device accel value '%1$s' does not match source '%2$s'"),
+                       virTristateSwitchTypeToString(dst->accel), virTristateSwitchTypeToString(src->accel));
+        return false;
+    }
+    if (src->ats != dst->ats) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Target domain IOMMU device ATS value '%1$s' does not match source '%2$s'"),
+                       virTristateSwitchTypeToString(dst->ats), virTristateSwitchTypeToString(src->ats));
+        return false;
+    }
+    if (src->ril != dst->ril) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Target domain IOMMU device ril value '%1$s' does not match source '%2$s'"),
+                       virTristateSwitchTypeToString(dst->ril), virTristateSwitchTypeToString(src->ril));
+        return false;
+    }
+    if (src->ssid_size != dst->ssid_size) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Target domain IOMMU device ssid_size value '%1$u' does not match source '%2$u'"),
+                       dst->ssid_size, src->ssid_size);
+        return false;
+    }
+    if (src->oas != dst->oas) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("Target domain IOMMU device oas value '%1$u' does not match source '%2$u'"),
+                       dst->oas, src->oas);
+        return false;
+    }
     if (src->dma_translation != dst->dma_translation) {
         virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                        _("Target domain IOMMU device dma translation '%1$s' does not match source '%2$s'"),
@@ -27732,6 +27805,10 @@ virDomainHostdevDefFormat(virBuffer *buf,
     if (def->shareable)
         virBufferAddLit(buf, "<shareable/>\n");
 
+    if (def->vpasidCapOffset)
+        virBufferAsprintf(buf, "<vpasidCapOffset>0x%x</vpasidCapOffset>\n",
+                          def->vpasidCapOffset);
+
     virDomainDeviceInfoFormat(buf, def->info, flags | VIR_DOMAIN_DEF_FORMAT_ALLOW_BOOT
                                                     | VIR_DOMAIN_DEF_FORMAT_ALLOW_ROM);
 
@@ -28657,6 +28734,26 @@ virDomainIOMMUDefFormat(virBuffer *buf,
         virBufferAsprintf(&driverAttrBuf, " pciBus='%d'",
                           iommu->pci_bus);
     }
+    if (iommu->accel != VIR_TRISTATE_SWITCH_ABSENT) {
+        virBufferAsprintf(&driverAttrBuf, " accel='%s'",
+                          virTristateSwitchTypeToString(iommu->accel));
+    }
+    if (iommu->ats != VIR_TRISTATE_SWITCH_ABSENT) {
+        virBufferAsprintf(&driverAttrBuf, " ats='%s'",
+                          virTristateSwitchTypeToString(iommu->ats));
+    }
+    if (iommu->ril != VIR_TRISTATE_SWITCH_ABSENT) {
+        virBufferAsprintf(&driverAttrBuf, " ril='%s'",
+                          virTristateSwitchTypeToString(iommu->ril));
+    }
+    /* ssid_size and oas are unsigned int, so format them with %u;
+     * a value of 0 is omitted from the XML. */
+    if (iommu->ssid_size > 0) {
+        virBufferAsprintf(&driverAttrBuf, " ssidSize='%u'", iommu->ssid_size);
+    }
+    if (iommu->oas > 0) {
+        virBufferAsprintf(&driverAttrBuf, " oas='%u'", iommu->oas);
+    }
 
     virXMLFormatElement(&childBuf, "driver", &driverAttrBuf, NULL);
 
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 83d49969d3..1396073678 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -370,6 +370,7 @@ struct _virDomainHostdevDef {
     bool missing;
     bool readonly;
     bool shareable;
+    unsigned int vpasidCapOffset;
     virTristateBool writeFiltering;
     union {
         virDomainHostdevSubsys subsys;
@@ -3062,6 +3063,11 @@ struct _virDomainIOMMUDef {
     virTristateSwitch dma_translation;
     virTristateSwitch xtsup;
     virTristateSwitch pt;
+    virTristateSwitch accel;
+    virTristateSwitch ats;
+    virTristateSwitch ril;
+    unsigned int ssid_size;
+    unsigned int oas;
 };
 
 typedef enum {
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
index 4482203087..7b2c32395d 100644
--- a/src/conf/domain_validate.c
+++ b/src/conf/domain_validate.c
@@ -29,6 +29,7 @@
 #include "virutil.h"
 #include "virstring.h"
 #include "virhostmem.h"
+#include "virpci.h"
 
 #define VIR_FROM_THIS VIR_FROM_DOMAIN
 
@@ -2432,6 +2433,20 @@ virDomainHostdevDefValidate(const virDomainHostdevDef *hostdev)
                                _("PCI host devices must use 'pci' or 'unassigned' address type"));
                 return -1;
             }
+            if (hostdev->vpasidCapOffset) {
+                if (hostdev->vpasidCapOffset & 0x3) {
+                    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                                   _("vpasidCapOffset must be 4-byte aligned"));
+                    return -1;
+                }
+                /* The 0x8-byte PASID ECAP must fit, so the last valid offset is 0xFF8 */
+                if (hostdev->vpasidCapOffset < VIR_DOMAIN_PCI_CONFIG_SPACE_SIZE ||
+                    hostdev->vpasidCapOffset > VIR_DOMAIN_PCIE_CONFIG_SPACE_SIZE - 0x8) {
+                    virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                                   _("vpasidCapOffset must be within PCIe extended configuration space (0x100-0xFF8)"));
+                    return -1;
+                }
+            }
             break;
         case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_SCSI:
             if (hostdev->info->type != VIR_DOMAIN_DEVICE_ADDRESS_TYPE_NONE &&
@@ -3208,7 +3223,12 @@ virDomainIOMMUDefValidate(const virDomainIOMMUDef *iommu)
             iommu->iotlb != VIR_TRISTATE_SWITCH_ABSENT ||
             iommu->aw_bits != 0 ||
             iommu->dma_translation != VIR_TRISTATE_SWITCH_ABSENT ||
-            iommu->pci_bus >= 0) {
+            iommu->pci_bus >= 0 ||
+            iommu->accel != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ats != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ril != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ssid_size != 0 ||
+            iommu->oas != 0) {
             virReportError(VIR_ERR_XML_ERROR,
                            _("iommu model '%1$s' doesn't support additional attributes"),
                            virDomainIOMMUModelTypeToString(iommu->model));
@@ -3221,7 +3241,12 @@ virDomainIOMMUDefValidate(const virDomainIOMMUDef *iommu)
             iommu->eim != VIR_TRISTATE_SWITCH_ABSENT ||
             iommu->aw_bits != 0 ||
             iommu->dma_translation != VIR_TRISTATE_SWITCH_ABSENT ||
-            iommu->pci_bus >= 0) {
+            iommu->pci_bus >= 0 ||
+            iommu->accel != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ats != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ril != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ssid_size != 0 ||
+            iommu->oas != 0) {
             virReportError(VIR_ERR_XML_ERROR,
                            _("iommu model '%1$s' doesn't support some additional attributes"),
                            virDomainIOMMUModelTypeToString(iommu->model));
@@ -3232,7 +3257,12 @@ virDomainIOMMUDefValidate(const virDomainIOMMUDef *iommu)
     case VIR_DOMAIN_IOMMU_MODEL_INTEL:
         if (iommu->pt != VIR_TRISTATE_SWITCH_ABSENT ||
             iommu->xtsup != VIR_TRISTATE_SWITCH_ABSENT ||
-            iommu->pci_bus >= 0) {
+            iommu->pci_bus >= 0 ||
+            iommu->accel != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ats != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ril != VIR_TRISTATE_SWITCH_ABSENT ||
+            iommu->ssid_size != 0 ||
+            iommu->oas != 0) {
             virReportError(VIR_ERR_XML_ERROR,
                            _("iommu model '%1$s' doesn't support some additional attributes"),
                            virDomainIOMMUModelTypeToString(iommu->model));
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
index 114dd3f96f..234aae5459 100644
--- a/src/conf/schemas/domaincommon.rng
+++ b/src/conf/schemas/domaincommon.rng
@@ -6329,6 +6329,31 @@
                 <data type="unsignedInt"/>
               </attribute>
             </optional>
+            <optional>
+              <attribute name="accel">
+                <ref name="virOnOff"/>
+              </attribute>
+            </optional>
+            <optional>
+              <attribute name="ats">
+                <ref name="virOnOff"/>
+              </attribute>
+            </optional>
+            <optional>
+              <attribute name="ril">
+                <ref name="virOnOff"/>
+              </attribute>
+            </optional>
+            <optional>
+              <attribute name="ssidSize">
+                <data type="unsignedInt"/>
+              </attribute>
+            </optional>
+            <optional>
+              <attribute name="oas">
+                <data type="unsignedInt"/>
+              </attribute>
+            </optional>
           </element>
         </optional>
         <optional>
@@ -6610,6 +6635,13 @@
           <ref name="pciaddress"/>
         </element>
       </element>
+      <optional>
+        <element name="vpasidCapOffset">
+          <data type="string">
+            <param name="pattern">(0x[0-9a-fA-F]+|[0-9]+)</param>
+          </data>
+        </element>
+      </optional>
       <ref name="hostdevsubsysvfiodisplay"/>
     </interleave>
   </define>
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index e81efdfde7..aa5ee2a787 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4802,6 +4802,7 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
                               "S:failover_pair_id", failover_pair_id,
                               "S:display", qemuOnOffAuto(pcisrc->display),
                               "B:ramfb", ramfb,
+                              "p:x-vpasid-cap-offset", dev->vpasidCapOffset,
                               NULL) < 0)
         return NULL;
 
@@ -6267,9 +6268,31 @@ qemuBuildPCINestedSmmuv3DevProps(const virDomainDef *def,
                               "s:driver", "arm-smmuv3",
                               "s:primary-bus", bus,
                               "s:id", iommu->info.alias,
+                              "B:accel", (iommu->accel == VIR_TRISTATE_SWITCH_ON),
+                              "B:ats", (iommu->ats == VIR_TRISTATE_SWITCH_ON),
                               NULL) < 0)
         return NULL;
 
+    /* QEMU SMMUv3 has RIL support by default; only emit when explicitly disabling */
+    if (iommu->ril == VIR_TRISTATE_SWITCH_OFF) {
+        if (virJSONValueObjectAppendBoolean(props, "ril", false) < 0)
+            return NULL;
+    }
+
+    if (iommu->ssid_size > 0) {
+        if (virJSONValueObjectAdd(&props,
+                                  "p:ssidsize", iommu->ssid_size,
+                                  NULL) < 0)
+            return NULL;
+    }
+
+    if (iommu->oas > 0) {
+        if (virJSONValueObjectAdd(&props,
+                                  "p:oas", iommu->oas,
+                                  NULL) < 0)
+            return NULL;
+    }
+
     return g_steal_pointer(&props);
 }
 
diff --git a/src/util/virpci.h b/src/util/virpci.h
index fc538566e1..7e78cb267c 100644
--- a/src/util/virpci.h
+++ b/src/util/virpci.h
@@ -35,6 +35,10 @@ G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIDeviceList, virObjectUnref);
 
 #define VIR_DOMAIN_DEVICE_ZPCI_MAX_UID UINT16_MAX
 #define VIR_DOMAIN_DEVICE_ZPCI_MAX_FID UINT32_MAX
+/* Size of the standard PCI config space */
+#define VIR_DOMAIN_PCI_CONFIG_SPACE_SIZE 0x100
+/* Size of the PCIe config space (including the extended space): 4KB */
+#define VIR_DOMAIN_PCIE_CONFIG_SPACE_SIZE  0x1000
 
 typedef struct _virZPCIDeviceAddressID virZPCIDeviceAddressID;
 typedef struct _virZPCIDeviceAddress virZPCIDeviceAddress;
-- 
2.43.0