[PATCH v4 18/27] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding

Shameer Kolothum posted 27 patches 1 month, 2 weeks ago
There is a newer version of this series
[PATCH v4 18/27] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding
Posted by Shameer Kolothum 1 month, 2 weeks ago
From: Eric Auger <eric.auger@redhat.com>

To handle SMMUv3 nested stage support it is practical to expose the guest
with reserved memory regions (RMRs) covering the IOVAs used by the host
kernel to map physical MSI doorbells.

Those IOVAs belong to [0x8000000, 0x8100000] matching MSI_IOVA_BASE and
MSI_IOVA_LENGTH definitions in kernel arm-smmu-v3 driver. This is the
window used to allocate IOVAs matching physical MSI doorbells.

With those RMRs, the guest is forced to use a flat mapping for this range.
Hence the assigned device is programmed with one IOVA from this range.
Stage 1, owned by the guest has a flat mapping for this IOVA. Stage2,
owned by the VMM then enforces a mapping from this IOVA to the physical
MSI doorbell.

The creation of those RMR nodes is only relevant if nested stage SMMU is
in use, along with VFIO. As VFIO devices can be hotplugged, all RMRs need
to be created in advance.

Signed-off-by: Eric Auger <eric.auger@redhat.com>
Suggested-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
 hw/arm/virt-acpi-build.c | 75 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index fd03b7f6a9..d0c1e10019 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -264,7 +264,8 @@ static void acpi_dsdt_add_tpm(Aml *scope, VirtMachineState *vms)
  * Note that @id_count gets internally subtracted by one, following the spec.
  */
 static void build_iort_id_mapping(GArray *table_data, uint32_t input_base,
-                                  uint32_t id_count, uint32_t out_ref)
+                                  uint32_t id_count, uint32_t out_ref,
+                                  uint32_t flags)
 {
     build_append_int_noprefix(table_data, input_base, 4); /* Input base */
     /* Number of IDs - The number of IDs in the range minus one */
@@ -272,7 +273,7 @@ static void build_iort_id_mapping(GArray *table_data, uint32_t input_base,
     build_append_int_noprefix(table_data, input_base, 4); /* Output base */
     build_append_int_noprefix(table_data, out_ref, 4); /* Output Reference */
     /* Flags */
-    build_append_int_noprefix(table_data, 0 /* Single mapping (disabled) */, 4);
+    build_append_int_noprefix(table_data, flags, 4);
 }
 
 struct AcpiIortIdMapping {
@@ -320,6 +321,7 @@ typedef struct AcpiIortSMMUv3Dev {
     GArray *rc_smmu_idmaps;
     /* Offset of the SMMUv3 IORT Node relative to the start of the IORT */
     size_t offset;
+    bool accel;
 } AcpiIortSMMUv3Dev;
 
 /*
@@ -374,6 +376,7 @@ static int iort_smmuv3_devices(Object *obj, void *opaque)
     }
 
     bus = PCI_BUS(object_property_get_link(obj, "primary-bus", &error_abort));
+    sdev.accel = object_property_get_bool(obj, "accel", &error_abort);
     pbus = PLATFORM_BUS_DEVICE(vms->platform_bus_dev);
     sbdev = SYS_BUS_DEVICE(obj);
     sdev.base = platform_bus_get_mmio_addr(pbus, sbdev, 0);
@@ -447,6 +450,56 @@ static void create_rc_its_idmaps(GArray *its_idmaps, GArray *smmuv3_devs)
     }
 }
 
+static void
+build_iort_rmr_nodes(GArray *table_data, GArray *smmuv3_devices, uint32_t *id)
+{
+    AcpiIortSMMUv3Dev *sdev;
+    AcpiIortIdMapping *idmap;
+    int i;
+
+    for (i = 0; i < smmuv3_devices->len; i++) {
+        sdev = &g_array_index(smmuv3_devices, AcpiIortSMMUv3Dev, i);
+        idmap = &g_array_index(sdev->rc_smmu_idmaps, AcpiIortIdMapping, 0);
+        int bdf = idmap->input_base;
+
+        if (!sdev->accel) {
+            continue;
+        }
+
+        /* Table 18 Reserved Memory Range Node */
+        build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */
+        /* Length */
+        build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE + 20,
+                                  2);
+        build_append_int_noprefix(table_data, 3, 1); /* Revision */
+        build_append_int_noprefix(table_data, *id, 4); /* Identifier */
+        /* Number of ID mappings */
+        build_append_int_noprefix(table_data, 1, 4);
+        /* Reference to ID Array */
+        build_append_int_noprefix(table_data, 28, 4);
+
+        /* RMR specific data */
+
+        /* Flags */
+        build_append_int_noprefix(table_data, 0 /* Disallow remapping */, 4);
+        /* Number of Memory Range Descriptors */
+        build_append_int_noprefix(table_data, 1 , 4);
+        /* Reference to Memory Range Descriptors */
+        build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE, 4);
+        build_iort_id_mapping(table_data, bdf, idmap->id_count, sdev->offset,
+                              1);
+
+        /* Table 19 Memory Range Descriptor */
+
+        /* Physical Range offset */
+        build_append_int_noprefix(table_data, 0x8000000, 8);
+        /* Physical Range length */
+        build_append_int_noprefix(table_data, 0x100000, 8);
+        build_append_int_noprefix(table_data, 0, 4); /* Reserved */
+        *id += 1;
+    }
+}
+
 /*
  * Input Output Remapping Table (IORT)
  * Conforms to "IO Remapping Table System Software on ARM Platforms",
@@ -464,7 +517,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     GArray *smmuv3_devs = g_array_new(false, true, sizeof(AcpiIortSMMUv3Dev));
     GArray *rc_its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping));
 
-    AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id,
+    AcpiTable table = { .sig = "IORT", .rev = 5, .oem_id = vms->oem_id,
                         .oem_table_id = vms->oem_table_id };
     /* Table 2 The IORT */
     acpi_table_begin(&table, table_data);
@@ -490,6 +543,13 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
             nb_nodes++; /* ITS */
             rc_mapping_count += rc_its_idmaps->len;
         }
+        /* Calculate RMR nodes required. One per SMMUv3 with accelerated mode */
+        for (i = 0; i < num_smmus; i++) {
+            sdev = &g_array_index(smmuv3_devs, AcpiIortSMMUv3Dev, i);
+            if (sdev->accel) {
+                nb_nodes++;
+            }
+        }
     } else {
         if (vms->its) {
             nb_nodes = 2; /* RC and ITS */
@@ -562,7 +622,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
         /* Array of ID mappings */
         if (smmu_mapping_count) {
             /* Output IORT node is the ITS Group node (the first node). */
-            build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET);
+            build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0);
         }
     }
 
@@ -614,7 +674,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
                                        AcpiIortIdMapping, j);
                 /* Output IORT node is the SMMUv3 node. */
                 build_iort_id_mapping(table_data, range->input_base,
-                                      range->id_count, sdev->offset);
+                                      range->id_count, sdev->offset, 0);
             }
         }
 
@@ -627,7 +687,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
                 range = &g_array_index(rc_its_idmaps, AcpiIortIdMapping, i);
                 /* Output IORT node is the ITS Group node (the first node). */
                 build_iort_id_mapping(table_data, range->input_base,
-                                      range->id_count, IORT_NODE_OFFSET);
+                                      range->id_count, IORT_NODE_OFFSET, 0);
             }
         }
     } else {
@@ -636,9 +696,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
          * SMMU: RC -> ITS.
          * Output IORT node is the ITS Group node (the first node).
          */
-        build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET);
+        build_iort_id_mapping(table_data, 0, 0x10000, IORT_NODE_OFFSET, 0);
     }
 
+    build_iort_rmr_nodes(table_data, smmuv3_devs, &id);
     acpi_table_end(linker, &table);
     g_array_free(rc_its_idmaps, true);
     for (i = 0; i < num_smmus; i++) {
-- 
2.43.0
Re: [PATCH v4 18/27] hw/arm/virt-acpi-build: Add IORT RMR regions to handle MSI nested binding
Posted by Jonathan Cameron via 1 month, 2 weeks ago
On Mon, 29 Sep 2025 14:36:34 +0100
Shameer Kolothum <skolothumtho@nvidia.com> wrote:

> From: Eric Auger <eric.auger@redhat.com>
> 
> To handle SMMUv3 nested stage support it is practical to expose the guest
> with reserved memory regions (RMRs) covering the IOVAs used by the host
> kernel to map physical MSI doorbells.
> 
> Those IOVAs belong to [0x8000000, 0x8100000] matching MSI_IOVA_BASE and
> MSI_IOVA_LENGTH definitions in kernel arm-smmu-v3 driver. This is the
> window used to allocate IOVAs matching physical MSI doorbells.
> 
> With those RMRs, the guest is forced to use a flat mapping for this range.
> Hence the assigned device is programmed with one IOVA from this range.
> Stage 1, owned by the guest has a flat mapping for this IOVA. Stage2,
> owned by the VMM then enforces a mapping from this IOVA to the physical
> MSI doorbell.
> 
> The creation of those RMR nodes is only relevant if nested stage SMMU is
> in use, along with VFIO. As VFIO devices can be hotplugged, all RMRs need
> to be created in advance.
> 
> Signed-off-by: Eric Auger <eric.auger@redhat.com>
> Suggested-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
Various comments inline on references and making the code a little more resilient
and obvious wrt to the things that there happen to be 1 of but which the spec
allows for multiples of.

> ---
>  hw/arm/virt-acpi-build.c | 75 ++++++++++++++++++++++++++++++++++++----
>  1 file changed, 68 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> index fd03b7f6a9..d0c1e10019 100644
> --- a/hw/arm/virt-acpi-build.c
> +++ b/hw/arm/virt-acpi-build.c


> @@ -447,6 +450,56 @@ static void create_rc_its_idmaps(GArray *its_idmaps, GArray *smmuv3_devs)
>      }
>  }
>  
> +static void
> +build_iort_rmr_nodes(GArray *table_data, GArray *smmuv3_devices, uint32_t *id)
> +{
> +    AcpiIortSMMUv3Dev *sdev;
> +    AcpiIortIdMapping *idmap;
> +    int i;
> +
> +    for (i = 0; i < smmuv3_devices->len; i++) {
> +        sdev = &g_array_index(smmuv3_devices, AcpiIortSMMUv3Dev, i);
> +        idmap = &g_array_index(sdev->rc_smmu_idmaps, AcpiIortIdMapping, 0);
> +        int bdf = idmap->input_base;
> +
> +        if (!sdev->accel) {
> +            continue;
> +        }
> +
> +        /* Table 18 Reserved Memory Range Node */

Reference a spec version somewhere.  Table numbers tend to change over time.
Table 18 in E.g version of spec is Root Complex Node for example. This is now
table 19.


> +        build_append_int_noprefix(table_data, 6 /* RMR */, 1); /* Type */
> +        /* Length */
> +        build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE + 20,

Add something to justify that + 20 (which I think is size of the Memory Range descriptors?)
The +28 is to start of bit after RMR specific data so that is kind of fine. As below
I'd add a constant for the number of ID mapping entries.


> +                                  2);
> +        build_append_int_noprefix(table_data, 3, 1); /* Revision */
> +        build_append_int_noprefix(table_data, *id, 4); /* Identifier */
> +        /* Number of ID mappings */
> +        build_append_int_noprefix(table_data, 1, 4);

I'd define a constant for the number of these and use it to help build the size.
Sure it is 1, but even so that would make the logic of placement simpler I think.

> +        /* Reference to ID Array */
> +        build_append_int_noprefix(table_data, 28, 4);
> +
> +        /* RMR specific data */
> +
> +        /* Flags */
> +        build_append_int_noprefix(table_data, 0 /* Disallow remapping */, 4);

Whilst we are disallowing remapping as this says, we are also saying a few
other things as there are more things in this flags field. Such as that it's
nGnRnE

> +        /* Number of Memory Range Descriptors */
> +        build_append_int_noprefix(table_data, 1 , 4);

I'd use a constant for this as well so that can use it in the size generation
and here.

> +        /* Reference to Memory Range Descriptors */
> +        build_append_int_noprefix(table_data, 28 + ID_MAPPING_ENTRY_SIZE, 4);
> +        build_iort_id_mapping(table_data, bdf, idmap->id_count, sdev->offset,
> +                              1);
> +
> +        /* Table 19 Memory Range Descriptor */

As above numbers changed, so specific spec version in the references.
It's 20 in the spec I downloaded today.


> +
> +        /* Physical Range offset */
> +        build_append_int_noprefix(table_data, 0x8000000, 8);

Can we get these from any defines?  Feels like make these values match in a number
of places is necessary so we shouldn't really hard code them here.

> +        /* Physical Range length */
> +        build_append_int_noprefix(table_data, 0x100000, 8);
> +        build_append_int_noprefix(table_data, 0, 4); /* Reserved */
> +        *id += 1;

Trivial but why not 
	   *id++;
better yet, do it where it's used rather than leaving it down here.
That way if in future multiple IDs are added for some reason then the increments
will go with them calls to add them.


> +    }
> +}
> +
>  /*
>   * Input Output Remapping Table (IORT)
>   * Conforms to "IO Remapping Table System Software on ARM Platforms",
> @@ -464,7 +517,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>      GArray *smmuv3_devs = g_array_new(false, true, sizeof(AcpiIortSMMUv3Dev));
>      GArray *rc_its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping));
>  
> -    AcpiTable table = { .sig = "IORT", .rev = 3, .oem_id = vms->oem_id,
> +    AcpiTable table = { .sig = "IORT", .rev = 5, .oem_id = vms->oem_id,
>                          .oem_table_id = vms->oem_table_id };

Seem to be missing the bios table test updates that will break with that version uptick.
Probably add a doc reference here so we can keep aligned.  The spec E.g has reached revision 7
whilst this work has been ongoing.


Jonathan