[RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes

Shameer Kolothum via posted 5 patches 1 year, 3 months ago
[RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Shameer Kolothum via 1 year, 3 months ago
From: Nicolin Chen <nicolinc@nvidia.com>

Now that we can have multiple user-creatable smmuv3-nested
devices, each associated with different pci buses, update
IORT ID mappings accordingly.

Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 hw/arm/virt-acpi-build.c | 34 ++++++++++++++++++++++++----------
 include/hw/arm/virt.h    |  6 ++++++
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index e10cad86dd..ec4cdfb2d7 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -276,8 +276,10 @@ static void
 build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
 {
     int i, nb_nodes, rc_mapping_count;
-    size_t node_size, smmu_offset = 0;
+    size_t node_size, *smmu_offset;
     AcpiIortIdMapping *idmap;
+    hwaddr base;
+    int irq, num_smmus = 0;
     uint32_t id = 0;
     GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping));
     GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping));
@@ -287,7 +289,21 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     /* Table 2 The IORT */
     acpi_table_begin(&table, table_data);
 
-    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
+    if (vms->smmu_nested_count) {
+        irq = vms->irqmap[VIRT_SMMU_NESTED] + ARM_SPI_BASE;
+        base = vms->memmap[VIRT_SMMU_NESTED].base;
+        num_smmus = vms->smmu_nested_count;
+    } else if (virt_has_smmuv3(vms)) {
+        irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE;
+        base = vms->memmap[VIRT_SMMU].base;
+        num_smmus = 1;
+    }
+
+    smmu_offset = g_new0(size_t, num_smmus);
+    nb_nodes = 2; /* RC, ITS */
+    nb_nodes += num_smmus; /* SMMU nodes */
+
+    if (virt_has_smmuv3(vms)) {
         AcpiIortIdMapping next_range = {0};
 
         object_child_foreach_recursive(object_get_root(),
@@ -317,10 +333,8 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
             g_array_append_val(its_idmaps, next_range);
         }
 
-        nb_nodes = 3; /* RC, ITS, SMMUv3 */
         rc_mapping_count = smmu_idmaps->len + its_idmaps->len;
     } else {
-        nb_nodes = 2; /* RC, ITS */
         rc_mapping_count = 1;
     }
     /* Number of IORT Nodes */
@@ -342,10 +356,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     /* GIC ITS Identifier Array */
     build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4);
 
-    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
-        int irq =  vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE;
+    for (i = 0; i < num_smmus; i++) {
+        smmu_offset[i] = table_data->len - table.table_offset;
 
-        smmu_offset = table_data->len - table.table_offset;
         /* Table 9 SMMUv3 Format */
         build_append_int_noprefix(table_data, 4 /* SMMUv3 */, 1); /* Type */
         node_size =  SMMU_V3_ENTRY_SIZE + ID_MAPPING_ENTRY_SIZE;
@@ -356,7 +369,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
         /* Reference to ID Array */
         build_append_int_noprefix(table_data, SMMU_V3_ENTRY_SIZE, 4);
         /* Base address */
-        build_append_int_noprefix(table_data, vms->memmap[VIRT_SMMU].base, 8);
+        build_append_int_noprefix(table_data, base + (i * SMMU_IO_LEN), 8);
         /* Flags */
         build_append_int_noprefix(table_data, 1 /* COHACC Override */, 4);
         build_append_int_noprefix(table_data, 0, 4); /* Reserved */
@@ -367,6 +380,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
         build_append_int_noprefix(table_data, irq + 1, 4); /* PRI */
         build_append_int_noprefix(table_data, irq + 3, 4); /* GERR */
         build_append_int_noprefix(table_data, irq + 2, 4); /* Sync */
+        irq += NUM_SMMU_IRQS;
         build_append_int_noprefix(table_data, 0, 4); /* Proximity domain */
         /* DeviceID mapping index (ignored since interrupts are GSIV based) */
         build_append_int_noprefix(table_data, 0, 4);
@@ -405,7 +419,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
     build_append_int_noprefix(table_data, 0, 3); /* Reserved */
 
     /* Output Reference */
-    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
+    if (virt_has_smmuv3(vms)) {
         AcpiIortIdMapping *range;
 
         /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */
@@ -413,7 +427,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
             range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i);
             /* output IORT node is the smmuv3 node */
             build_iort_id_mapping(table_data, range->input_base,
-                                  range->id_count, smmu_offset);
+                                  range->id_count, smmu_offset[i]);
         }
 
         /* bypassed RIDs connect to ITS group node directly: RC -> ITS */
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 50e47a4ef3..304ab134ae 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -219,4 +219,10 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms)
             vms->highmem_redists) ? 2 : 1;
 }
 
+static inline bool virt_has_smmuv3(const VirtMachineState *vms)
+{
+    return vms->iommu == VIRT_IOMMU_SMMUV3 ||
+           vms->iommu == VIRT_IOMMU_SMMUV3_NESTED;
+}
+
 #endif /* QEMU_ARM_VIRT_H */
-- 
2.34.1


Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Eric Auger 1 year, 2 months ago
Hi Shameer,

On 11/8/24 13:52, Shameer Kolothum wrote:
> From: Nicolin Chen <nicolinc@nvidia.com>
>
> Now that we can have multiple user-creatable smmuv3-nested
> devices, each associated with different pci buses, update
> IORT ID mappings accordingly.
>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  hw/arm/virt-acpi-build.c | 34 ++++++++++++++++++++++++----------
>  include/hw/arm/virt.h    |  6 ++++++
>  2 files changed, 30 insertions(+), 10 deletions(-)
>
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> index e10cad86dd..ec4cdfb2d7 100644
> --- a/hw/arm/virt-acpi-build.c
> +++ b/hw/arm/virt-acpi-build.c
> @@ -276,8 +276,10 @@ static void
>  build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>  {
>      int i, nb_nodes, rc_mapping_count;
> -    size_t node_size, smmu_offset = 0;
> +    size_t node_size, *smmu_offset;
>      AcpiIortIdMapping *idmap;
> +    hwaddr base;
> +    int irq, num_smmus = 0;
>      uint32_t id = 0;
>      GArray *smmu_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping));
>      GArray *its_idmaps = g_array_new(false, true, sizeof(AcpiIortIdMapping));
> @@ -287,7 +289,21 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>      /* Table 2 The IORT */
>      acpi_table_begin(&table, table_data);
>  
> -    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
> +    if (vms->smmu_nested_count) {
> +        irq = vms->irqmap[VIRT_SMMU_NESTED] + ARM_SPI_BASE;
> +        base = vms->memmap[VIRT_SMMU_NESTED].base;
> +        num_smmus = vms->smmu_nested_count;
> +    } else if (virt_has_smmuv3(vms)) {
> +        irq = vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE;
> +        base = vms->memmap[VIRT_SMMU].base;
> +        num_smmus = 1;
> +    }
> +
> +    smmu_offset = g_new0(size_t, num_smmus);
> +    nb_nodes = 2; /* RC, ITS */
> +    nb_nodes += num_smmus; /* SMMU nodes */
> +
> +    if (virt_has_smmuv3(vms)) {
>          AcpiIortIdMapping next_range = {0};
>  
>          object_child_foreach_recursive(object_get_root(),
> @@ -317,10 +333,8 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>              g_array_append_val(its_idmaps, next_range);
>          }
>  
> -        nb_nodes = 3; /* RC, ITS, SMMUv3 */
>          rc_mapping_count = smmu_idmaps->len + its_idmaps->len;
>      } else {
> -        nb_nodes = 2; /* RC, ITS */
>          rc_mapping_count = 1;
>      }
>      /* Number of IORT Nodes */
> @@ -342,10 +356,9 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>      /* GIC ITS Identifier Array */
>      build_append_int_noprefix(table_data, 0 /* MADT translation_id */, 4);
>  
> -    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
> -        int irq =  vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE;
> +    for (i = 0; i < num_smmus; i++) {
> +        smmu_offset[i] = table_data->len - table.table_offset;
>  
I would have expected changes in the smmu idmap as well. If a given
SMMU instance now protects a given bus hierarchy shouldn't it be
reflected in a differentiated SMMU idmap for each of them (RID subset of
SMMU->pci-bus mapping to a specific IORT SMMU node)? How is it done
currently?

Thanks

Eric
> -        smmu_offset = table_data->len - table.table_offset;
>          /* Table 9 SMMUv3 Format */
>          build_append_int_noprefix(table_data, 4 /* SMMUv3 */, 1); /* Type */
>          node_size =  SMMU_V3_ENTRY_SIZE + ID_MAPPING_ENTRY_SIZE;
> @@ -356,7 +369,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>          /* Reference to ID Array */
>          build_append_int_noprefix(table_data, SMMU_V3_ENTRY_SIZE, 4);
>          /* Base address */
> -        build_append_int_noprefix(table_data, vms->memmap[VIRT_SMMU].base, 8);
> +        build_append_int_noprefix(table_data, base + (i * SMMU_IO_LEN), 8);
>          /* Flags */
>          build_append_int_noprefix(table_data, 1 /* COHACC Override */, 4);
>          build_append_int_noprefix(table_data, 0, 4); /* Reserved */
> @@ -367,6 +380,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>          build_append_int_noprefix(table_data, irq + 1, 4); /* PRI */
>          build_append_int_noprefix(table_data, irq + 3, 4); /* GERR */
>          build_append_int_noprefix(table_data, irq + 2, 4); /* Sync */
> +        irq += NUM_SMMU_IRQS;
>          build_append_int_noprefix(table_data, 0, 4); /* Proximity domain */
>          /* DeviceID mapping index (ignored since interrupts are GSIV based) */
>          build_append_int_noprefix(table_data, 0, 4);
> @@ -405,7 +419,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>      build_append_int_noprefix(table_data, 0, 3); /* Reserved */
>  
>      /* Output Reference */
> -    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
> +    if (virt_has_smmuv3(vms)) {
>          AcpiIortIdMapping *range;
>  
>          /* translated RIDs connect to SMMUv3 node: RC -> SMMUv3 -> ITS */
> @@ -413,7 +427,7 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
>              range = &g_array_index(smmu_idmaps, AcpiIortIdMapping, i);
>              /* output IORT node is the smmuv3 node */
>              build_iort_id_mapping(table_data, range->input_base,
> -                                  range->id_count, smmu_offset);
> +                                  range->id_count, smmu_offset[i]);
>          }
>  
>          /* bypassed RIDs connect to ITS group node directly: RC -> ITS */
> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> index 50e47a4ef3..304ab134ae 100644
> --- a/include/hw/arm/virt.h
> +++ b/include/hw/arm/virt.h
> @@ -219,4 +219,10 @@ static inline int virt_gicv3_redist_region_count(VirtMachineState *vms)
>              vms->highmem_redists) ? 2 : 1;
>  }
>  
> +static inline bool virt_has_smmuv3(const VirtMachineState *vms)
> +{
> +    return vms->iommu == VIRT_IOMMU_SMMUV3 ||
> +           vms->iommu == VIRT_IOMMU_SMMUV3_NESTED;
> +}
> +
>  #endif /* QEMU_ARM_VIRT_H */


RE: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Shameerali Kolothum Thodi via 1 year, 2 months ago

> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: Monday, November 18, 2024 10:02 AM
> To: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org;
> qemu-devel@nongnu.org
> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
> Jonathan Cameron <jonathan.cameron@huawei.com>;
> zhangfei.gao@linaro.org
> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
> multiple SMMU nodes
> 

> >      /* Number of IORT Nodes */
> > @@ -342,10 +356,9 @@ build_iort(GArray *table_data, BIOSLinker
> *linker, VirtMachineState *vms)
> >      /* GIC ITS Identifier Array */
> >      build_append_int_noprefix(table_data, 0 /* MADT translation_id */,
> 4);
> >
> > -    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
> > -        int irq =  vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE;
> > +    for (i = 0; i < num_smmus; i++) {
> > +        smmu_offset[i] = table_data->len - table.table_offset;
> >
> I would have expected changes in the smmu idmap has well. If a given
> SMMU instance now protects a given bus hierarchy shouldn't it be
> reflected in a differentiated SMMU idmap for each of them (RID subset of
> SMMU->pci-bus mapping to a specific IORT SMMU node)? How is it done
> currently?

I thought that the smmu_idmaps would be handled by this?

object_child_foreach_recursive(object_get_root(),
                                       iort_host_bridges, smmu_idmaps);

But it is possible that, there is a bug in this IORT generation here as I am not
able to hot add  devices. It looks like the pciehp interrupt is not generated/received
for some reason. Nicolin[0] is suspecting the min/max bus range in
iort_host_bridges() may not leave enough ranges for hot add later.

Cold plugging devices to different SMMUv3/pcie-pxb seems to be alright.

I will debug that soon.

Thanks,
Shameer
[0] https://lore.kernel.org/qemu-devel/ZzPd1F%2FUA2MKMbwl@Asurada-Nvidia/


Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Eric Auger 1 year, 2 months ago
Hi Shameer,

On 11/18/24 12:44, Shameerali Kolothum Thodi wrote:
>
>> -----Original Message-----
>> From: Eric Auger <eric.auger@redhat.com>
>> Sent: Monday, November 18, 2024 10:02 AM
>> To: Shameerali Kolothum Thodi
>> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org;
>> qemu-devel@nongnu.org
>> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
>> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
>> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
>> Jonathan Cameron <jonathan.cameron@huawei.com>;
>> zhangfei.gao@linaro.org
>> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
>> multiple SMMU nodes
>>
>>>      /* Number of IORT Nodes */
>>> @@ -342,10 +356,9 @@ build_iort(GArray *table_data, BIOSLinker
>> *linker, VirtMachineState *vms)
>>>      /* GIC ITS Identifier Array */
>>>      build_append_int_noprefix(table_data, 0 /* MADT translation_id */,
>> 4);
>>> -    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
>>> -        int irq =  vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE;
>>> +    for (i = 0; i < num_smmus; i++) {
>>> +        smmu_offset[i] = table_data->len - table.table_offset;
>>>
>> I would have expected changes in the smmu idmap has well. If a given
>> SMMU instance now protects a given bus hierarchy shouldn't it be
>> reflected in a differentiated SMMU idmap for each of them (RID subset of
>> SMMU->pci-bus mapping to a specific IORT SMMU node)? How is it done
>> currently?
> I thought that smmu_idmaps will be handled by this ?
>
> object_child_foreach_recursive(object_get_root(),
>                                        iort_host_bridges, smmu_idmaps);
to me this traverses the qemu object hierarchy to find all host bridges
and for each of them builds an idmap array (smmu_idmaps mapping this RC
RID range to this SMMU). But to me those idmaps will be assigned to
*all* SMMU instances, leading to a wrong IORT description because all
SMMUs will be protecting all devices. You shall only retain idmaps which
correspond to the pci_bus a given vSMMU is attached to. Then each SMMU
will protect a distinct PCIe subtree, which does not seem to be the case today.
At least that's my current understanding.

Eric


>
> But it is possible that, there is a bug in this IORT generation here as I am not
> able to hot add  devices. It looks like the pciehp interrupt is not generated/received
> for some reason. Nicolin[0] is suspecting the min/max bus range in
> iort_host_bridges() may not leave enough ranges for hot add later.
>
> Cold plugging devices to different SMMUv3/pcie-pxb seems to be alright.
>
> I will debug that soon.
>
> Thanks,
> Shameer
> [0] https://lore.kernel.org/qemu-devel/ZzPd1F%2FUA2MKMbwl@Asurada-Nvidia/
>
>
RE: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Shameerali Kolothum Thodi via 1 year, 2 months ago
Hi Eric,

> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: Monday, November 18, 2024 1:46 PM
> To: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org;
> qemu-devel@nongnu.org
> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
> Jonathan Cameron <jonathan.cameron@huawei.com>;
> zhangfei.gao@linaro.org
> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
> multiple SMMU nodes
> 

> >>>      /* Number of IORT Nodes */
> >>> @@ -342,10 +356,9 @@ build_iort(GArray *table_data, BIOSLinker
> >> *linker, VirtMachineState *vms)
> >>>      /* GIC ITS Identifier Array */
> >>>      build_append_int_noprefix(table_data, 0 /* MADT translation_id */,
> >> 4);
> >>> -    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
> >>> -        int irq =  vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE;
> >>> +    for (i = 0; i < num_smmus; i++) {
> >>> +        smmu_offset[i] = table_data->len - table.table_offset;
> >>>
> >> I would have expected changes in the smmu idmap has well. If a given
> >> SMMU instance now protects a given bus hierarchy shouldn't it be
> >> reflected in a differentiated SMMU idmap for each of them (RID subset
> of
> >> SMMU->pci-bus mapping to a specific IORT SMMU node)? How is it done
> >> currently?
> > I thought that smmu_idmaps will be handled by this ?
> >
> > object_child_foreach_recursive(object_get_root(),
> >                                        iort_host_bridges, smmu_idmaps);
> to me this traverses the qemu object hierarchy to find all host bridges
> and for each of them builds an idmap array (smmu_idmaps mapping this
> RC
> RID range to this SMMU). But to me those idmaps will be assigned to
> *all* SMMU insteaces leading to a wong IORT description because all
> SMMUs will be protecting all devices. You shall only retain idmaps which
> correspond to the pci_bus a given vSMMU is attached to. Then each SMMU
> will protect a distinct PCIe subtree which does not seem the case today.
> At least that's my current understanding.

Ah..right. I will fix that in next version. 

I think the above won't affect the basic case where I have only one
pcie-pxb/SMMUv3. But even in that case hot add seems not working.

I tried hacking the min/max ranges as suspected by Nicolin. But still not enough to
get it working. Do you have any hint on why the hot add (described below) is not
working?

Thanks,
Shameer

> 
> Eric
> 
> 
> >
> > But it is possible that, there is a bug in this IORT generation here as I am
> not
> > able to hot add  devices. It looks like the pciehp interrupt is not
> generated/received
> > for some reason. Nicolin[0] is suspecting the min/max bus range in
> > iort_host_bridges() may not leave enough ranges for hot add later.
> >
> > Cold plugging devices to different SMMUv3/pcie-pxb seems to be alright.
> >
> > I will debug that soon.
> >
> > Thanks,
> > Shameer
> > [0] https://lore.kernel.org/qemu-devel/ZzPd1F%2FUA2MKMbwl@Asurada-
> Nvidia/
> >
> >
> 

Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Eric Auger 1 year, 2 months ago
Hi Shameer,

On 11/18/24 16:00, Shameerali Kolothum Thodi wrote:
> Hi Eric,
>
>> -----Original Message-----
>> From: Eric Auger <eric.auger@redhat.com>
>> Sent: Monday, November 18, 2024 1:46 PM
>> To: Shameerali Kolothum Thodi
>> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org;
>> qemu-devel@nongnu.org
>> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
>> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
>> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
>> Jonathan Cameron <jonathan.cameron@huawei.com>;
>> zhangfei.gao@linaro.org
>> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
>> multiple SMMU nodes
>>
>>>>>      /* Number of IORT Nodes */
>>>>> @@ -342,10 +356,9 @@ build_iort(GArray *table_data, BIOSLinker
>>>> *linker, VirtMachineState *vms)
>>>>>      /* GIC ITS Identifier Array */
>>>>>      build_append_int_noprefix(table_data, 0 /* MADT translation_id */,
>>>> 4);
>>>>> -    if (vms->iommu == VIRT_IOMMU_SMMUV3) {
>>>>> -        int irq =  vms->irqmap[VIRT_SMMU] + ARM_SPI_BASE;
>>>>> +    for (i = 0; i < num_smmus; i++) {
>>>>> +        smmu_offset[i] = table_data->len - table.table_offset;
>>>>>
>>>> I would have expected changes in the smmu idmap has well. If a given
>>>> SMMU instance now protects a given bus hierarchy shouldn't it be
>>>> reflected in a differentiated SMMU idmap for each of them (RID subset
>> of
>>>> SMMU->pci-bus mapping to a specific IORT SMMU node)? How is it done
>>>> currently?
>>> I thought that smmu_idmaps will be handled by this ?
>>>
>>> object_child_foreach_recursive(object_get_root(),
>>>                                        iort_host_bridges, smmu_idmaps);
>> to me this traverses the qemu object hierarchy to find all host bridges
>> and for each of them builds an idmap array (smmu_idmaps mapping this
>> RC
>> RID range to this SMMU). But to me those idmaps will be assigned to
>> *all* SMMU insteaces leading to a wong IORT description because all
>> SMMUs will be protecting all devices. You shall only retain idmaps which
>> correspond to the pci_bus a given vSMMU is attached to. Then each SMMU
>> will protect a distinct PCIe subtree which does not seem the case today.
>> At least that's my current understanding.
> Ah..right. I will fix that in next version. 
>
> I think the above won't affect the basic case where I have only one
> pcie-pxb/SMMUv3. But even in that case hot add seems not working.
>
> I tried hacking the min/max ranges as suspected by Nicolin. But still not enough to 
> get it working.  Do you have any hint on why the hot add(described below) is not
> working?
Hum thought the duplicate idmap could be the cause. Otherwise I have no
clue. I would advice to fix it first.

Eric
>
> Thanks,
> Shameer
>
>> Eric
>>
>>
>>> But it is possible that, there is a bug in this IORT generation here as I am
>> not
>>> able to hot add  devices. It looks like the pciehp interrupt is not
>> generated/received
>>> for some reason. Nicolin[0] is suspecting the min/max bus range in
>>> iort_host_bridges() may not leave enough ranges for hot add later.
>>>
>>> Cold plugging devices to different SMMUv3/pcie-pxb seems to be alright.
>>>
>>> I will debug that soon.
>>>
>>> Thanks,
>>> Shameer
>>> [0] https://lore.kernel.org/qemu-devel/ZzPd1F%2FUA2MKMbwl@Asurada-
>> Nvidia/
>>>
RE: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Shameerali Kolothum Thodi via 1 year, 2 months ago

> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: Monday, November 18, 2024 6:10 PM
> To: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org;
> qemu-devel@nongnu.org
> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
> Jonathan Cameron <jonathan.cameron@huawei.com>;
> zhangfei.gao@linaro.org
> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
> multiple SMMU nodes

[...]

> > I think the above won't affect the basic case where I have only one
> > pcie-pxb/SMMUv3. But even in that case hot add seems not working.
> >
> > I tried hacking the min/max ranges as suspected by Nicolin. But still not
> enough to
> > get it working.  Do you have any hint on why the hot add(described
> below) is not
> > working?
Hmm, I thought the duplicate idmap could be the cause. Otherwise I have no
clue. I would advise fixing it first.

I think I have an idea why the hot add was not working. 

When we have the PCIe topology as something like below,

-device pxb-pcie,id=pcie.1,bus_nr=8,bus=pcie.0 \
-device pcie-root-port,id=pcie.port1,bus=pcie.1,chassis=1 \
-device pcie-root-port,id=pcie.port2,bus=pcie.1,chassis=2 \
-device arm-smmuv3-nested,id=smmuv1,pci-bus=pcie.1 \
...

The current IORT generation includes the pcie-root-port dev ids also
in the SMMUv3 node idmaps.

Hence, when Guest kernel loads, pcieport is also behind the SMMUv3.

[    1.466670] pcieport 0000:64:00.0: Adding to iommu group 1
...
[    1.448205] pcieport 0000:64:01.0: Adding to iommu group 2


So when we do a  hot add,
device_add vfio-pci,host=0000:75:00.1,bus=pcie.port1,iommufd=iommufd0

The Qemu hotplug event handler tries to inject an IRQ to the Guest pcieport
by retrieving the MSI address it is configured with. 

hotplug_event_notify()
    msix_prepare_message(): [address: 0xfffff040]
         msix_notify()

The ITS address retrieved here is actually the SMMUv3 translated iova addr,
not the Guest PA.  So Guest never sees/receives the interrupt.

I did hack the IORT code to exclude the pcie-root-port dev ids from the SMMUv3
node idmaps and the hot add seems to work fine.

Looks like we need to find all the pcie-root-port dev ids associated with a
SMMUv3/pxb-pcie and exclude them from SMMUv3 node idmaps to get
the hot add working.

I am not sure though whether this will create any other issues in IOMMU isolation criteria
(ACS etc.), especially if we want to access the device in Guest user space (I hope
not).

Thanks,
Shameer




Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Eric Auger 1 year, 2 months ago
Hi Shameer,

On 11/20/24 15:16, Shameerali Kolothum Thodi wrote:
>
>> -----Original Message-----
>> From: Eric Auger <eric.auger@redhat.com>
>> Sent: Monday, November 18, 2024 6:10 PM
>> To: Shameerali Kolothum Thodi
>> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org;
>> qemu-devel@nongnu.org
>> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
>> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
>> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
>> Jonathan Cameron <jonathan.cameron@huawei.com>;
>> zhangfei.gao@linaro.org
>> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
>> multiple SMMU nodes
> [...]
>
>>> I think the above won't affect the basic case where I have only one
>>> pcie-pxb/SMMUv3. But even in that case hot add seems not working.
>>>
>>> I tried hacking the min/max ranges as suspected by Nicolin. But still not
>> enough to
>>> get it working.  Do you have any hint on why the hot add(described
>> below) is not
>>> working?
>> Hum thought the duplicate idmap could be the cause. Otherwise I have no
>> clue. I would advice to fix it first.
> I think I have an idea why the hot add was not working. 
>
> When we have the PCIe topology as something like below,
>
> -device pxb-pcie,id=pcie.1,bus_nr=8,bus=pcie.0 \
> -device pcie-root-port,id=pcie.port1,bus=pcie.1,chassis=1 \
> -device pcie-root-port,id=pcie.port2,bus=pcie.1,chassis=2 \
> -device arm-smmuv3-nested,id=smmuv1,pci-bus=pcie.1 \
> ...
>
> The current IORT generation includes the pcie-root-port dev ids also
> in the SMMUv3 node idmaps.
>
> Hence, when Guest kernel loads, pcieport is also behind the SMMUv3.
>
> [    1.466670] pcieport 0000:64:00.0: Adding to iommu group 1
> ...
> [    1.448205] pcieport 0000:64:01.0: Adding to iommu group 2

But it should be the same without multi-instantiation, no? I would have
expected this as normal. Have you tested hot-plug without the series
lately? Do you have the same problem?

Thanks

Eric
>
>
> So when we do a  hot add,
> device_add vfio-pci,host=0000:75:00.1,bus=pcie.port1,iommufd=iommufd0
>
> The Qemu hotplug event handler tries to inject an IRQ to the Guest pcieport
> by retrieving the MSI address it is configured with. 
>
> hotplug_event_notify()
>     msix_prepare_message(): [address: 0xfffff040]
>          msix_notify()
>
> The ITS address retrieved here is actually the SMMUv3 translated iova addr,
> not the Guest PA.  So Guest never sees/receives the interrupt.
>
> I did hack the IORT code to exclude the pcie-root-port dev ids from the SMMUv3
> node idmaps and the hot add seems to work fine.
>
> Looks like we need to find all the pcie-root-port dev ids associated with a
> SMMUv3/pxb-pcie and exclude them from SMMUv3 node idmaps to get
> the hot add working.
>
> I am not sure though this will  create any other issues in IOMMU isolation criteria
> (ACS etc,), especially if we want to access the device in Guest user space( I hope
> not).
>
> Thanks,
> Shameer
>
>
>
>
RE: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Shameerali Kolothum Thodi via 1 year, 2 months ago
Hi Eric,

> -----Original Message-----
> From: Eric Auger <eric.auger@redhat.com>
> Sent: Wednesday, November 20, 2024 4:11 PM
> To: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org;
> qemu-devel@nongnu.org
> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
> Jonathan Cameron <jonathan.cameron@huawei.com>;
> zhangfei.gao@linaro.org
> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
> multiple SMMU nodes
> 
> Hi Shameer,
> 
> On 11/20/24 15:16, Shameerali Kolothum Thodi wrote:
> >
> >> -----Original Message-----
> >> From: Eric Auger <eric.auger@redhat.com>
> >> Sent: Monday, November 18, 2024 6:10 PM
> >> To: Shameerali Kolothum Thodi
> >> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org;
> >> qemu-devel@nongnu.org
> >> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
> >> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
> >> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
> >> Jonathan Cameron <jonathan.cameron@huawei.com>;
> >> zhangfei.gao@linaro.org
> >> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
> >> multiple SMMU nodes
> > [...]
> >
> >>> I think the above won't affect the basic case where I have only one
> >>> pcie-pxb/SMMUv3. But even in that case hot add seems not working.
> >>>
> >>> I tried hacking the min/max ranges as suspected by Nicolin. But still not
> >> enough to
> >>> get it working.  Do you have any hint on why the hot add(described
> >> below) is not
> >>> working?
> >> Hum thought the duplicate idmap could be the cause. Otherwise I have
> no
> >> clue. I would advice to fix it first.
> > I think I have an idea why the hot add was not working.
> >
> > When we have the PCIe topology as something like below,
> >
> > -device pxb-pcie,id=pcie.1,bus_nr=8,bus=pcie.0 \
> > -device pcie-root-port,id=pcie.port1,bus=pcie.1,chassis=1 \
> > -device pcie-root-port,id=pcie.port2,bus=pcie.1,chassis=2 \
> > -device arm-smmuv3-nested,id=smmuv1,pci-bus=pcie.1 \
> > ...
> >
> > The current IORT generation includes the pcie-root-port dev ids also
> > in the SMMUv3 node idmaps.
> >
> > Hence, when Guest kernel loads, pcieport is also behind the SMMUv3.
> >
> > [    1.466670] pcieport 0000:64:00.0: Adding to iommu group 1
> > ...
> > [    1.448205] pcieport 0000:64:01.0: Adding to iommu group 2
> 
> But it should be the same without multi-instantiation, no? I would have
> expected this as normal. Has you tested hot-plug without the series
> laterly? Do you have the same pb?

That is a good question. I will give it a try soon and update.

Thanks,
Shameer.
RE: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Shameerali Kolothum Thodi via 1 year, 2 months ago
Hi Eric,

> -----Original Message-----
> From: Shameerali Kolothum Thodi
> Sent: Wednesday, November 20, 2024 4:26 PM
> To: 'eric.auger@redhat.com' <eric.auger@redhat.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
> Jonathan Cameron <jonathan.cameron@huawei.com>;
> zhangfei.gao@linaro.org
> Subject: RE: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
> multiple SMMU nodes
> 
> > > I think I have an idea why the hot add was not working.
> > >
> > > When we have the PCIe topology as something like below,
> > >
> > > -device pxb-pcie,id=pcie.1,bus_nr=8,bus=pcie.0 \ -device
> > > pcie-root-port,id=pcie.port1,bus=pcie.1,chassis=1 \ -device
> > > pcie-root-port,id=pcie.port2,bus=pcie.1,chassis=2 \ -device
> > > arm-smmuv3-nested,id=smmuv1,pci-bus=pcie.1 \ ...
> > >
> > > The current IORT generation includes the pcie-root-port dev ids also
> > > in the SMMUv3 node idmaps.
> > >
> > > Hence, when Guest kernel loads, pcieport is also behind the SMMUv3.
> > >
> > > [    1.466670] pcieport 0000:64:00.0: Adding to iommu group 1
> > > ...
> > > [    1.448205] pcieport 0000:64:01.0: Adding to iommu group 2
> >
> > But it should be the same without multi-instantiation, no? I would
> > have expected this as normal. Has you tested hot-plug without the
> > series laterly? Do you have the same pb?
> 
> That is a good question. I will give it a try soon and update.

I tried hot add with the current SMMUv3(iommu=smmuv3) and hot add
works when I added a virtio dev to pcie-root-port connected to a pxb-pcie.

And now I think I know(hopefully) the reason why it is not working with
smmuv3-nested case. I think the root cause is this commit here,

(series: " cover-letter: Add HW accelerated nesting support for arm SMMUv3")
https://github.com/hisilicon/qemu/commit/9b21f28595cef7b1100ae130974605f357ef75d3

This changes the way address space is returned for the devices.

static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn)
{
    SMMUState *s = opaque;
    SMMUPciBus *sbus = smmu_get_sbus(s, bus);
    SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn);

    /* Return the system as if the device uses stage-2 only */
    if (s->nested && !sdev->s1_hwpt) {
        return &sdev->as_sysmem;
    } else {
        return &sdev->as;
    }
}

If we have entries in the SMMUv3 idmap for bus:devfn, then I think we should
return IOMMU address space here. But the logic above returns sysmem
address space for anything other than vfio/iommufd devices. 

The hot add works when I hacked the logic to return IOMMU address space
for pcie root port devices.

Could you please take a look at the commit above and let me know if this
indeed could be the problem?

Thanks,
Shameer






Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Nicolin Chen 1 year, 2 months ago
On Thu, Nov 21, 2024 at 09:46:16AM +0000, Shameerali Kolothum Thodi wrote:
> Hi Eric,
> 
> > -----Original Message-----
> > From: Shameerali Kolothum Thodi
> > Sent: Wednesday, November 20, 2024 4:26 PM
> > To: 'eric.auger@redhat.com' <eric.auger@redhat.com>; qemu-
> > arm@nongnu.org; qemu-devel@nongnu.org
> > Cc: peter.maydell@linaro.org; jgg@nvidia.com; nicolinc@nvidia.com;
> > ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
> > <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
> > Jonathan Cameron <jonathan.cameron@huawei.com>;
> > zhangfei.gao@linaro.org
> > Subject: RE: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
> > multiple SMMU nodes
> > 
> > > > I think I have an idea why the hot add was not working.
> > > >
> > > > When we have the PCIe topology as something like below,
> > > >
> > > > -device pxb-pcie,id=pcie.1,bus_nr=8,bus=pcie.0 \ -device
> > > > pcie-root-port,id=pcie.port1,bus=pcie.1,chassis=1 \ -device
> > > > pcie-root-port,id=pcie.port2,bus=pcie.1,chassis=2 \ -device
> > > > arm-smmuv3-nested,id=smmuv1,pci-bus=pcie.1 \ ...
> > > >
> > > > The current IORT generation includes the pcie-root-port dev ids also
> > > > in the SMMUv3 node idmaps.
> > > >
> > > > Hence, when Guest kernel loads, pcieport is also behind the SMMUv3.
> > > >
> > > > [    1.466670] pcieport 0000:64:00.0: Adding to iommu group 1
> > > > ...
> > > > [    1.448205] pcieport 0000:64:01.0: Adding to iommu group 2
> > >
> > > But it should be the same without multi-instantiation, no? I would
> > > have expected this as normal. Have you tested hot-plug without the
> > > series lately? Do you have the same problem?
> > 
> > That is a good question. I will give it a try soon and update.
> 
> I tried hot add with the current SMMUv3(iommu=smmuv3) and hot add
> works when I added a virtio dev to pcie-root-port connected to a pxb-pcie.
> 
> And now I think I know(hopefully) the reason why it is not working with
> smmuv3-nested case. I think the root cause is this commit here,
> 
> (series: " cover-letter: Add HW accelerated nesting support for arm SMMUv3")

> This changes the way address space is returned for the devices.
> 
> static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn)
> {
>     SMMUState *s = opaque;
>     SMMUPciBus *sbus = smmu_get_sbus(s, bus);
>     SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn);
> 
>     /* Return the system as if the device uses stage-2 only */
>     if (s->nested && !sdev->s1_hwpt) {
>         return &sdev->as_sysmem;
>     } else {
>         return &sdev->as;
>     }
> }
> 
> If we have entries in the SMMUv3 idmap for bus:devfn, then I think we should
> return IOMMU address space here. But the logic above returns sysmem
> address space for anything other than vfio/iommufd devices.
>
> The hot add works when I hacked the logic to return IOMMU address space
> for pcie root port devices.

That is to bypass the "if (memory_region_is_iommu(section->mr))"
in vfio_listener_region_add(), when the device gets initially
attached to the default container.

Once a device reaches the pci_device_set_iommu_device() call,
it should be attached to an IDENTITY/bypass proxy s1_hwpt, so the
smmu_find_add_as() will return the iommu as.

So, the fact that your hack is working means the hotplug routine
is likely missing a pci_device_set_iommu_device() call, IMHO, or
probably it should do pci_device_iommu_address_space() after the
device finishes pci_device_set_iommu_device() instead..

Nicolin
RE: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Shameerali Kolothum Thodi via 1 year, 2 months ago
Hi Nicolin,

> -----Original Message-----
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: Tuesday, December 10, 2024 8:48 PM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu-
> devel@nongnu.org; peter.maydell@linaro.org; jgg@nvidia.com;
> ddutile@redhat.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B)
> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>;
> Jonathan Cameron <jonathan.cameron@huawei.com>;
> zhangfei.gao@linaro.org
> Subject: Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with
> multiple SMMU nodes
> 


> > And now I think I know(hopefully) the reason why it is not working with
> > smmuv3-nested case. I think the root cause is this commit here,
> >
> > (series: " cover-letter: Add HW accelerated nesting support for arm
> SMMUv3")
> 
> > This changes the way address space is returned for the devices.
> >
> > static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int
> devfn)
> > {
> >     SMMUState *s = opaque;
> >     SMMUPciBus *sbus = smmu_get_sbus(s, bus);
> >     SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn);
> >
> >     /* Return the system as if the device uses stage-2 only */
> >     if (s->nested && !sdev->s1_hwpt) {
> >         return &sdev->as_sysmem;
> >     } else {
> >         return &sdev->as;
> >     }
> > }
> >
> > If we have entries in the SMMUv3 idmap for bus:devfn, then I think we
> should
> > return IOMMU address space here. But the logic above returns sysmem
> > address space for anything other than vfio/iommufd devices.
> >
> > The hot add works when I hacked the logic to return IOMMU address
> space
> > for pcie root port devices.
> That is to bypass the "if (memory_region_is_iommu(section->mr))"
> in vfio_listener_region_add(), when the device gets initially
> attached to the default container.

Right. 
 
> Once a device reaches the pci_device_set_iommu_device() call,
> it should be attached to an IDENTITY/bypass proxy s1_hwpt, so the
> smmu_find_add_as() will return the iommu as.

Agree. The above situation you explained is perfectly fine with vfio-pci dev.

> So, the fact that your hack is working means the hotplug routine
> is likely missing a pci_device_set_iommu_device() call, IMHO, or
> probably it should do pci_device_iommu_address_space() after the
> device finishes pci_device_set_iommu_device() instead..

The problem is not with the hot-added vfio-pci dev but with the
pcie-root-port device. When we hot add a vfio-pci to a root port,
QEMU will inject an interrupt for the Guest root port device and
that kick-starts the vfio-pci device add process. This involves writing
to the MSI address the Guest kernel configures for the root port dev.

As per the current logic, the root port dev will have sysmem address
space and in IORT we have root port dev id in smmu idmap. This
will not work as Guest kernel configures a translated IOVA for MSI.

I think we have discussed this issue of returning different address
spaces before here[0]. But that was in a different context though.
The hack mentioned in [0] actually works for this case as well, where
we add an extra check to see the dev is vfio-pci or not. But I am not
sure that is the best way to handle this.

Another option is to exclude all the root port devices from IORT idmap.
But that looks not an ideal one to me as it actually sits behind an SMMUv3
in this case.

Please let me know if you have any ideas.

Thanks,
Shameer

[0] https://lore.kernel.org/linux-iommu/02f3fbc5145d4449b3313eb802ecfa2c@huawei.com/

 
Re: [RFC PATCH 4/5] hw/arm/virt-acpi-build: Build IORT with multiple SMMU nodes
Posted by Nicolin Chen 1 year, 1 month ago
On Wed, Dec 11, 2024 at 03:21:37PM +0000, Shameerali Kolothum Thodi wrote:
> > Once a device reaches the pci_device_set_iommu_device() call,
> > it should be attached to an IDENTITY/bypass proxy s1_hwpt, so the
> > smmu_find_add_as() will return the iommu as.
> 
> Agree. The above situation you explained is perfectly fine with vfio-pci dev.
> 
> > So, the fact that your hack is working means the hotplug routine
> > is likely missing a pci_device_set_iommu_device() call, IMHO, or
> > probably it should do pci_device_iommu_address_space() after the
> > device finishes pci_device_set_iommu_device() instead..
> 
> The problem is not with the hot added vfio-pci dev but with the
> pcie-root-port device. When we hot add a vfio-pci to a root port,
> Qemu will inject an interrupt for the Guest root port device and
> that kick starts the vfio-pci device add process. This involves writing
> to the MSI address the Guest  kernel configures for the root port dev. 
> 
> As per the current logic, the root port dev will have sysmem address
> space and in IORT we have root port dev id in smmu idmap. This
> will not work as Guest kernel configures a translated IOVA for MSI.
> 
> I think we have discussed this issue of returning different address
> spaces before here[0]. But that was in a different context though.
> The hack mentioned in [0] actually works for this case as well, where
> we add an extra check to see the dev is vfio-pci or not. But I am not
> sure that is the best way to handle this.
> 
> Another option is to exclude all the root port devices from IORT idmap.
> But that looks not an ideal one to me as it actually sits behind an SMMUv3
> in this case.
> 
> Please let me know if you have any ideas.

Oh... I completely forgot that...

So, we need to make sure the sdev/PCIDevice is a passthrough dev
that will go through the set_iommu_device callback. Otherwise,
just return the iommu address space.

Perhaps we could set a flag during vfio_realize() in PCIDevice *
pdev, so later we could cast the sdev to pdev and recheck that.

Or, we could do something like your approach:
-----------------------------------------------------------------
@@ -896,9 +896,11 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn)
     SMMUState *s = opaque;
     SMMUPciBus *sbus = smmu_get_sbus(s, bus);
     SMMUDevice *sdev = smmu_get_sdev(s, sbus, bus, devfn);
+    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
+    bool has_iommufd = !!object_property_find(OBJECT(pdev), "iommufd");

     /* Return the system as if the device uses stage-2 only */
-    if (s->nested && !sdev->s1_hwpt) {
+    if (s->nested && !sdev->s1_hwpt && has_iommufd) {
         return &sdev->as_sysmem;
     } else {
         return &sdev->as;
-----------------------------------------------------------------

vfio-pci might not guarantee that it has an "iommufd" property so
checking the property explicitly might be nicer.

Thanks
Nic