[RFC PATCH 3/5] qemu: Create PXBs and auto-assign VFIO devs and nested SMMUs

Nathan Chen via Devel posted 5 patches 1 year, 1 month ago
[RFC PATCH 3/5] qemu: Create PXBs and auto-assign VFIO devs and nested SMMUs
Posted by Nathan Chen via Devel 1 year, 1 month ago
Add a pcie-expander-bus controller to the VM definition for each "nestedSmmuv3"
device that is generated when the "nestedSmmuv3" IOMMU model is parsed from the
VM definition. Assign each "nestedSmmuv3" device to one PXB controller, and
route any unmanaged "hostdev" VFIO devices with associated host SMMU nodes to
their corresponding PXB controller based on the "name" attributes of
"nestedSmmuv3" devices attached to these PXB controllers.

Signed-off-by: Nathan Chen <nathanc@nvidia.com>
---
 src/conf/domain_addr.c         |  26 ++++++-
 src/conf/domain_addr.h         |   3 +-
 src/conf/domain_conf.c         |   1 +
 src/qemu/qemu_domain_address.c | 134 +++++++++++++++++++++++++++++++++
 4 files changed, 162 insertions(+), 2 deletions(-)

diff --git a/src/conf/domain_addr.c b/src/conf/domain_addr.c
index a53ff6df6c..6d8ca89025 100644
--- a/src/conf/domain_addr.c
+++ b/src/conf/domain_addr.c
@@ -386,6 +386,8 @@ virDomainPCIAddressFlagsCompatible(virPCIDeviceAddress *addr,
             connectStr = "pcie-expander-bus";
         } else if (devFlags & VIR_PCI_CONNECT_TYPE_PCI_BRIDGE) {
             connectStr = "pci-bridge";
+        } else if (devFlags & VIR_PCI_CONNECT_TYPE_NESTED_SMMUV3) {
+            connectStr = "nestedSmmuv3 device";
         } else {
             /* this should never happen. If it does, there is a
              * bug in the code that sets the flag bits for devices.
@@ -565,7 +567,8 @@ virDomainPCIAddressBusSetModel(virDomainPCIAddressBus *bus,
          * dmi-to-pci-bridge
          */
         bus->flags = (VIR_PCI_CONNECT_TYPE_PCIE_ROOT_PORT |
-                      VIR_PCI_CONNECT_TYPE_DMI_TO_PCI_BRIDGE);
+                      VIR_PCI_CONNECT_TYPE_DMI_TO_PCI_BRIDGE |
+                      VIR_PCI_CONNECT_TYPE_NESTED_SMMUV3);
         bus->minSlot = 0;
         bus->maxSlot = VIR_PCI_ADDRESS_SLOT_LAST;
         break;
@@ -690,6 +693,8 @@ virDomainPCIAddressSetGrow(virDomainPCIAddressSet *addrs,
     } else if (flags & (VIR_PCI_CONNECT_TYPE_PCIE_DEVICE |
                         VIR_PCI_CONNECT_TYPE_PCIE_SWITCH_UPSTREAM_PORT)) {
         model = VIR_DOMAIN_CONTROLLER_MODEL_PCIE_ROOT_PORT;
+    } else if (flags & VIR_PCI_CONNECT_TYPE_NESTED_SMMUV3) {
+        model = VIR_DOMAIN_CONTROLLER_MODEL_PCIE_EXPANDER_BUS;
     } else {
         /* The types of devices that we can't auto-add a controller for:
          *
@@ -1030,6 +1035,11 @@ virDomainPCIAddressFindUnusedFunctionOnBus(virDomainPCIAddressBus *bus,
                 break;
             }
 
+            if (flags == VIR_PCI_CONNECT_TYPE_NESTED_SMMUV3) {
+                *found = false;
+                break;
+            }
+
             if (flags & VIR_PCI_CONNECT_AGGREGATE_SLOT &&
                 bus->slot[searchAddr->slot].aggregate) {
                 /* slot and device are okay with aggregating devices */
@@ -1087,6 +1097,20 @@ virDomainPCIAddressGetNextAddr(virDomainPCIAddressSet *addrs,
     else
         a.function = function;
 
+    if (flags == VIR_PCI_CONNECT_TYPE_NESTED_SMMUV3) {
+        if (addrs->dryRun) {
+            virDomainPCIAddressBus *bus = &addrs->buses[addrs->nbuses - 1];
+            /* a is already set to the first new bus */
+            a.bus = addrs->nbuses;
+            a.slot = bus->minSlot;
+            if (virDomainPCIAddressSetGrow(addrs, &a, flags) < 0)
+                return -1;
+            /* this device will use the first slot of the new bus */
+            a.slot = addrs->buses[a.bus].minSlot;
+            goto success;
+        }
+    }
+
     /* When looking for a suitable bus for the device, start by being
      * very strict and ignoring all those where the isolation groups
      * don't match. This ensures all devices sharing the same isolation
diff --git a/src/conf/domain_addr.h b/src/conf/domain_addr.h
index 9781685903..2881f2dadb 100644
--- a/src/conf/domain_addr.h
+++ b/src/conf/domain_addr.h
@@ -72,7 +72,8 @@ typedef enum {
     VIR_PCI_CONNECT_TYPE_PCI_EXPANDER_BUS | \
     VIR_PCI_CONNECT_TYPE_PCIE_EXPANDER_BUS | \
     VIR_PCI_CONNECT_TYPE_PCI_BRIDGE | \
-    VIR_PCI_CONNECT_TYPE_PCIE_TO_PCI_BRIDGE)
+    VIR_PCI_CONNECT_TYPE_PCIE_TO_PCI_BRIDGE | \
+    VIR_PCI_CONNECT_TYPE_NESTED_SMMUV3)
 
 /* combination of all bits that could be used to connect a normal
  * endpoint device (i.e. excluding the connection possible between an
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 24aff1cfbe..46f9b9b0cf 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -25,6 +25,7 @@
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include <dirent.h>
 
 #include "configmake.h"
 #include "internal.h"
diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c
index 31004bfc7e..dee198a7d2 100644
--- a/src/qemu/qemu_domain_address.c
+++ b/src/qemu/qemu_domain_address.c
@@ -1627,6 +1627,19 @@ qemuDomainPCIAddressSetCreate(virDomainDef *def,
 
     addrs->dryRun = dryRun;
 
+    /* PXB indices must come before pcie-root-port indices in qemu,
+     * so add PXB buses to addrs before the pcie-root-ports. */
+
+    if (addrs->dryRun) {
+        for (i = 0; i < def->nnestedsmmus; i++) {
+            if (!virDeviceInfoPCIAddressIsWanted(def->nestedsmmus[i]->info))
+                continue;
+            if (qemuDomainPCIAddressReserveNextAddr(addrs,
+                                                    def->nestedsmmus[i]->info) < 0)
+                return NULL;
+        }
+    }
+
     /* pSeries domains support multiple pci-root controllers */
     if (qemuDomainIsPSeries(def))
         addrs->areMultipleRootsSupported = true;
@@ -2030,6 +2043,109 @@ qemuDomainValidateDevicePCISlotsChipsets(virDomainDef *def,
 }
 
 
+static char*
+retrieveSysfsDevPath(virPCIDeviceAddress* addr, const char* path)
+{
+    return g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x%s",
+                addr->domain,
+                addr->bus,
+                addr->slot,
+                addr->function,
+                path ? path : "");
+}
+
+
+static char *
+nestedSmmuVfioHostdevFound(virDomainHostdevDef *hostdev, bool dryRun)
+{
+    char* devPath = NULL;
+    char* devSmmuPath = NULL;
+    char* devVFIOPath = NULL;
+    g_autoptr(DIR) dir = NULL;
+    g_autoptr(DIR) smmuDir = NULL;
+    g_autoptr(DIR) VFIODir = NULL;
+    char* dir_iommu = NULL;
+    char* smmu_node = NULL;
+    devPath = retrieveSysfsDevPath(&hostdev->source.subsys.u.pci.addr, "");
+    if (virDirOpenIfExists(&dir, devPath) < 1)
+        return NULL;
+    devSmmuPath = retrieveSysfsDevPath(&hostdev->source.subsys.u.pci.addr, "/iommu");
+    if (virDirOpenIfExists(&smmuDir, devSmmuPath) < 1)
+        return NULL;
+    devVFIOPath = retrieveSysfsDevPath(&hostdev->source.subsys.u.pci.addr, "/vfio-dev");
+    if (hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
+        (hostdev->info->addr.pci.bus != 0 || dryRun)) {
+        // We only want to route vfio hostdevs
+        if (hostdev->managed ||
+            (virDirOpenIfExists(&VFIODir, devVFIOPath) == 1)) {
+            // Get the hostdev's associated SMMU node name
+            dir_iommu = realpath(devSmmuPath, NULL);
+            if (!dir_iommu)
+                return NULL;
+            smmu_node = g_path_get_basename(dir_iommu);
+            if (!smmu_node)
+                return NULL;
+        }
+    }
+    return smmu_node;
+}
+
+
+static virDomainControllerDef *
+qemuDomainGetUpstreamCont(virDomainDef *def,
+                          virDomainDeviceInfo *downstreamInfo,
+                          int model)
+{
+    size_t i;
+    for (i = 0; i < def->ncontrollers; i++) {
+        if (def->controllers[i]->idx == downstreamInfo->addr.pci.bus &&
+            def->controllers[i]->model == model)
+            return def->controllers[i];
+    }
+    return NULL;
+}
+
+
+static int
+qemuDomainAssignNestedSmmuv3HostdevSlots(virDomainDef *def,
+                                         virDomainPCIAddressSet *addrs)
+{
+    size_t i, j;
+    char* smmu_node = NULL;
+    virDomainControllerDef *rootPort;
+    virDomainPCIAddressSet *set = NULL;
+    if (def->iommu != NULL && def->iommu->model == VIR_DOMAIN_IOMMU_MODEL_NESTED_SMMUV3 &&
+        def->nnestedsmmus > 0) {
+        for (i = 0; i < def->nhostdevs; i++) {
+            if (!(smmu_node = nestedSmmuVfioHostdevFound(def->hostdevs[i], addrs->dryRun)))
+                continue;
+            /* Find a hostdev and nested SMMU pair */
+            for (j = 0; j < def->nnestedsmmus; j++) {
+                unsigned int nestedSmmuBus = def->nestedsmmus[j]->info->addr.pci.bus;
+                virDomainControllerDef *pxb;
+                if (!STREQLEN(def->nestedsmmus[j]->name, smmu_node, strlen(smmu_node)))
+                    continue;
+                /* Get the hostdev's pcie-root-port controller */
+                rootPort = qemuDomainGetUpstreamCont(def, def->hostdevs[i]->info,
+                                                     VIR_DOMAIN_CONTROLLER_MODEL_PCIE_ROOT_PORT);
+                /* Skip if already assigned */
+                pxb = qemuDomainGetUpstreamCont(def, &rootPort->info,
+                                                VIR_DOMAIN_CONTROLLER_MODEL_PCIE_EXPANDER_BUS);
+                if (pxb)
+                    break;
+                /* Assign the controller to the next available slot/func on
+                 * the corresponding PXB */
+                set = virDomainPCIAddressSetAlloc(nestedSmmuBus + 1,
+                                                  VIR_PCI_ADDRESS_EXTENSION_NONE);
+                set->buses[nestedSmmuBus] = addrs->buses[nestedSmmuBus];
+                qemuDomainPCIAddressReserveNextAddr(set, &rootPort->info);
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
 /*
  * This assigns static PCI slots to all configured devices.
  * The ordering here is chosen to match the ordering used
@@ -2262,6 +2378,18 @@ qemuDomainAssignDevicePCISlots(virDomainDef *def,
             return -1;
     }
 
+    /* Nested SMMUs */
+    if (!addrs->dryRun) {
+        for (i = 0; i < def->nnestedsmmus; i++) {
+            if (!virDeviceInfoPCIAddressIsWanted(def->nestedsmmus[i]->info))
+                continue;
+
+            if (qemuDomainPCIAddressReserveNextAddr(addrs,
+                                                    def->nestedsmmus[i]->info) < 0)
+                return -1;
+        }
+    }
+
     /* Host PCI devices */
     for (i = 0; i < def->nhostdevs; i++) {
         virDomainHostdevSubsys *subsys = &def->hostdevs[i]->source.subsys;
@@ -2286,6 +2414,12 @@ qemuDomainAssignDevicePCISlots(virDomainDef *def,
             return -1;
     }
 
+    // Route hostdevs to nested SMMUs
+    if (!addrs->dryRun) {
+        if (qemuDomainAssignNestedSmmuv3HostdevSlots(def, addrs) < 0)
+            return -1;
+    }
+
     /* memballoon. the qemu driver only accepts virtio memballoon devices */
     if (virDomainDefHasMemballoon(def) &&
         virDeviceInfoPCIAddressIsWanted(&def->memballoon->info)) {
-- 
2.34.1
Re: [RFC PATCH 3/5] qemu: Create PXBs and auto-assign VFIO devs and nested SMMUs
Posted by Daniel P. Berrangé 1 year ago
On Wed, Dec 11, 2024 at 04:24:21PM -0800, Nathan Chen via Devel wrote:
> Add a pcie-expander-bus controller to the VM definition for each "nestedSmmuv3"
> device that is generated when the "nestedSmmuv3" IOMMU model is parsed from the
> VM definition. Assign each "nestedSmmuv3" device to one PXB controller, and
> route any unmanaged "hostdev" VFIO devices with associated host SMMU nodes to
> their corresponding PXB controller based on the "name" attributes of
> "nestedSmmuv3" devices attached to these PXB controllers.
> 
> Signed-off-by: Nathan Chen <nathanc@nvidia.com>
> ---
>  src/conf/domain_addr.c         |  26 ++++++-
>  src/conf/domain_addr.h         |   3 +-
>  src/conf/domain_conf.c         |   1 +
>  src/qemu/qemu_domain_address.c | 134 +++++++++++++++++++++++++++++++++
>  4 files changed, 162 insertions(+), 2 deletions(-)

> diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c
> index 31004bfc7e..dee198a7d2 100644
> --- a/src/qemu/qemu_domain_address.c
> +++ b/src/qemu/qemu_domain_address.c
> @@ -1627,6 +1627,19 @@ qemuDomainPCIAddressSetCreate(virDomainDef *def,
>  
>      addrs->dryRun = dryRun;
>  
> +    /* PXB indices must come before pcie-root-port indices in qemu,
> +     * so add PXB buses to addrs before the pcie-root-ports. */

Can you elaborate on that requirement - I don't get why QEMU cares
which order slots are chosen in for PXB vs PCIe-Root-Port devices.

> +
> +    if (addrs->dryRun) {
> +        for (i = 0; i < def->nnestedsmmus; i++) {
> +            if (!virDeviceInfoPCIAddressIsWanted(def->nestedsmmus[i]->info))
> +                continue;
> +            if (qemuDomainPCIAddressReserveNextAddr(addrs,
> +                                                    def->nestedsmmus[i]->info) < 0)
> +                return NULL;
> +        }
> +    }
> +

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
Re: [RFC PATCH 3/5] qemu: Create PXBs and auto-assign VFIO devs and nested SMMUs
Posted by Nathan Chen via Devel 1 year ago
>> Add a pcie-expander-bus controller to the VM definition for each "nestedSmmuv3"
>> device that is generated when the "nestedSmmuv3" IOMMU model is parsed from the
>> VM definition. Assign each "nestedSmmuv3" device to one PXB controller, and
>> route any unmanaged "hostdev" VFIO devices with associated host SMMU nodes to
>> their corresponding PXB controller based on the "name" attributes of
>> "nestedSmmuv3" devices attached to these PXB controllers.
>>
>> Signed-off-by: Nathan Chen<nathanc@nvidia.com>
>> ---
>>   src/conf/domain_addr.c         |  26 ++++++-
>>   src/conf/domain_addr.h         |   3 +-
>>   src/conf/domain_conf.c         |   1 +
>>   src/qemu/qemu_domain_address.c | 134 +++++++++++++++++++++++++++++++++
>>   4 files changed, 162 insertions(+), 2 deletions(-)
>> diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c
>> index 31004bfc7e..dee198a7d2 100644
>> --- a/src/qemu/qemu_domain_address.c
>> +++ b/src/qemu/qemu_domain_address.c
>> @@ -1627,6 +1627,19 @@ qemuDomainPCIAddressSetCreate(virDomainDef *def,
>>   
>>       addrs->dryRun = dryRun;
>>   
>> +    /* PXB indices must come before pcie-root-port indices in qemu,
>> +     * so add PXB buses to addrs before the pcie-root-ports. */
> Can you elaborate on that requirement - I don't get why QEMU cares
> which order slots are chosen in for PXB vs PCIe-Root-Port devices.

The libvirt controller index describes the order in which bus 
controllers are encountered; if the pcie-root-port index is less than 
the PXB index, the pcie-root-port will not find a PXB to attach to and 
VM creation will fail. To address this, PXBs must be added to the VM 
definition first and get assigned a lower index than the pcie-root-ports.
Re: [RFC PATCH 3/5] qemu: Create PXBs and auto-assign VFIO devs and nested SMMUs
Posted by Daniel P. Berrangé 1 year ago
On Thu, Jan 30, 2025 at 10:00:21AM -0800, Nathan Chen wrote:
> 
> > > Add a pcie-expander-bus controller to the VM definition for each "nestedSmmuv3"
> > > device that is generated when the "nestedSmmuv3" IOMMU model is parsed from the
> > > VM definition. Assign each "nestedSmmuv3" device to one PXB controller, and
> > > route any unmanaged "hostdev" VFIO devices with associated host SMMU nodes to
> > > their corresponding PXB controller based on the "name" attributes of
> > > "nestedSmmuv3" devices attached to these PXB controllers.
> > > 
> > > Signed-off-by: Nathan Chen<nathanc@nvidia.com>
> > > ---
> > >   src/conf/domain_addr.c         |  26 ++++++-
> > >   src/conf/domain_addr.h         |   3 +-
> > >   src/conf/domain_conf.c         |   1 +
> > >   src/qemu/qemu_domain_address.c | 134 +++++++++++++++++++++++++++++++++
> > >   4 files changed, 162 insertions(+), 2 deletions(-)
> > > diff --git a/src/qemu/qemu_domain_address.c b/src/qemu/qemu_domain_address.c
> > > index 31004bfc7e..dee198a7d2 100644
> > > --- a/src/qemu/qemu_domain_address.c
> > > +++ b/src/qemu/qemu_domain_address.c
> > > @@ -1627,6 +1627,19 @@ qemuDomainPCIAddressSetCreate(virDomainDef *def,
> > >       addrs->dryRun = dryRun;
> > > +    /* PXB indices must come before pcie-root-port indices in qemu,
> > > +     * so add PXB buses to addrs before the pcie-root-ports. */
> > Can you elaborate on that requirement - I don't get why QEMU cares
> > which order slots are chosen in for PXB vs PCIe-Root-Port devices.
> 
> The libvirt controller index describes the order in which bus controllers
> are encountered; if the pcie-root-port index is less than the PXB index, the
> pcie-root-port will not find a PXB to attach to and VM creation will fail.
> To address this, PXBs must be added to the VM definition first and get
> assigned a lower index than the pcie-root-ports.

Oh, so you're specifically refering to a case of a pcie-root-port that
is attached to a PXB that is listed 2nd in the XML, rather than relative
to all possible PXBs.

Assigning PXB indexes first fixes the problem in cases where libvirt is
the one assigning indexes, but if users manually assign any indexes the
problem could still hit.

Possibly libvirt needs to be more intelligent about the order in which
it specifies the controller devices on the QEMU command line, rather
than blindly following XML or index order 

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|