Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel
device. In order to support vfio-pci dev assignment with a Guest
SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2)
mode, with Guest owning the S1 page tables. Subsequent patches will
add support for smmuv3-accel to provide this.
Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
hw/arm/Kconfig | 5 ++++
hw/arm/meson.build | 1 +
hw/arm/smmu-common.c | 1 +
hw/arm/smmuv3-accel.c | 51 +++++++++++++++++++++++++++++++++++
include/hw/arm/smmu-common.h | 3 +++
include/hw/arm/smmuv3-accel.h | 31 +++++++++++++++++++++
6 files changed, 92 insertions(+)
create mode 100644 hw/arm/smmuv3-accel.c
create mode 100644 include/hw/arm/smmuv3-accel.h
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index 504841ccab..f889842dd8 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -14,6 +14,7 @@ config ARM_VIRT
select ARM_GIC
select ACPI
select ARM_SMMUV3
+ select ARM_SMMUV3_ACCEL
select GPIO_KEY
select DEVICE_TREE
select FW_CFG_DMA
@@ -596,6 +597,10 @@ config FSL_IMX7
config ARM_SMMUV3
bool
+config ARM_SMMUV3_ACCEL
+ select ARM_SMMUV3
+ bool
+
config FSL_IMX6UL
bool
default y
diff --git a/hw/arm/meson.build b/hw/arm/meson.build
index 465c757f97..e8593363b0 100644
--- a/hw/arm/meson.build
+++ b/hw/arm/meson.build
@@ -55,6 +55,7 @@ arm_ss.add(when: 'CONFIG_MUSCA', if_true: files('musca.c'))
arm_ss.add(when: 'CONFIG_ARMSSE', if_true: files('armsse.c'))
arm_ss.add(when: 'CONFIG_FSL_IMX7', if_true: files('fsl-imx7.c', 'mcimx7d-sabre.c'))
arm_ss.add(when: 'CONFIG_ARM_SMMUV3', if_true: files('smmuv3.c'))
+arm_ss.add(when: 'CONFIG_ARM_SMMUV3_ACCEL', if_true: files('smmuv3-accel.c'))
arm_ss.add(when: 'CONFIG_FSL_IMX6UL', if_true: files('fsl-imx6ul.c', 'mcimx6ul-evk.c'))
arm_ss.add(when: 'CONFIG_NRF51_SOC', if_true: files('nrf51_soc.c'))
arm_ss.add(when: 'CONFIG_XEN', if_true: files(
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 8c1b407b82..f5caf1665c 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -943,6 +943,7 @@ static const Property smmu_dev_properties[] = {
DEFINE_PROP_UINT8("bus_num", SMMUState, bus_num, 0),
DEFINE_PROP_LINK("primary-bus", SMMUState, primary_bus,
TYPE_PCI_BUS, PCIBus *),
+ DEFINE_PROP_BOOL("accel", SMMUState, accel, false),
};
static void smmu_base_class_init(ObjectClass *klass, void *data)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
new file mode 100644
index 0000000000..c327661636
--- /dev/null
+++ b/hw/arm/smmuv3-accel.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies R & D (UK) Ltd
+ * Copyright (C) 2025 NVIDIA
+ * Written by Nicolin Chen, Shameer Kolothum
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/arm/smmuv3-accel.h"
+
+static void smmu_accel_realize(DeviceState *d, Error **errp)
+{
+ SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(d);
+ SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_GET_CLASS(s_accel);
+ SysBusDevice *dev = SYS_BUS_DEVICE(d);
+ Error *local_err = NULL;
+
+ object_property_set_bool(OBJECT(dev), "accel", true, &error_abort);
+ c->parent_realize(d, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+}
+
+static void smmuv3_accel_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_CLASS(klass);
+
+ device_class_set_parent_realize(dc, smmu_accel_realize,
+ &c->parent_realize);
+ dc->hotpluggable = false;
+}
+
+static const TypeInfo smmuv3_accel_type_info = {
+ .name = TYPE_ARM_SMMUV3_ACCEL,
+ .parent = TYPE_ARM_SMMUV3,
+ .instance_size = sizeof(SMMUv3AccelState),
+ .class_size = sizeof(SMMUv3AccelClass),
+ .class_init = smmuv3_accel_class_init,
+};
+
+static void smmuv3_accel_register_types(void)
+{
+ type_register_static(&smmuv3_accel_type_info);
+}
+
+type_init(smmuv3_accel_register_types)
diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h
index d1a4a64551..b5c63cfd5d 100644
--- a/include/hw/arm/smmu-common.h
+++ b/include/hw/arm/smmu-common.h
@@ -157,6 +157,9 @@ struct SMMUState {
QLIST_HEAD(, SMMUDevice) devices_with_notifiers;
uint8_t bus_num;
PCIBus *primary_bus;
+
+ /* For smmuv3-accel */
+ bool accel;
};
struct SMMUBaseClass {
diff --git a/include/hw/arm/smmuv3-accel.h b/include/hw/arm/smmuv3-accel.h
new file mode 100644
index 0000000000..56fe376bf4
--- /dev/null
+++ b/include/hw/arm/smmuv3-accel.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2025 Huawei Technologies R & D (UK) Ltd
+ * Copyright (C) 2025 NVIDIA
+ * Written by Nicolin Chen, Shameer Kolothum
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_ARM_SMMUV3_ACCEL_H
+#define HW_ARM_SMMUV3_ACCEL_H
+
+#include "hw/arm/smmu-common.h"
+#include "hw/arm/smmuv3.h"
+#include "qom/object.h"
+
+#define TYPE_ARM_SMMUV3_ACCEL "arm-smmuv3-accel"
+OBJECT_DECLARE_TYPE(SMMUv3AccelState, SMMUv3AccelClass, ARM_SMMUV3_ACCEL)
+
+struct SMMUv3AccelState {
+ SMMUv3State smmuv3_state;
+};
+
+struct SMMUv3AccelClass {
+ /*< private >*/
+ SMMUv3Class smmuv3_class;
+ /*< public >*/
+
+ DeviceRealize parent_realize;
+};
+
+#endif /* HW_ARM_SMMUV3_ACCEL_H */
--
2.34.1
Hi Shameer, On 3/11/25 3:10 PM, Shameer Kolothum wrote: > Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel > device. In order to support vfio-pci dev assignment with a Guest guest > SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2) nested (s1+s2) > mode, with Guest owning the S1 page tables. Subsequent patches will the guest > add support for smmuv3-accel to provide this. Can't this -accel smmu also works with emulated devices? Do we want an exclusive usage? I would also document in the commit msg that a new property is added in the parent SMMU (accel). Will this device be migratable? Do we need a migration blocker? > > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> > --- > hw/arm/Kconfig | 5 ++++ > hw/arm/meson.build | 1 + > hw/arm/smmu-common.c | 1 + > hw/arm/smmuv3-accel.c | 51 +++++++++++++++++++++++++++++++++++ > include/hw/arm/smmu-common.h | 3 +++ > include/hw/arm/smmuv3-accel.h | 31 +++++++++++++++++++++ > 6 files changed, 92 insertions(+) > create mode 100644 hw/arm/smmuv3-accel.c > create mode 100644 include/hw/arm/smmuv3-accel.h > > diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig > index 504841ccab..f889842dd8 100644 > --- a/hw/arm/Kconfig > +++ b/hw/arm/Kconfig > @@ -14,6 +14,7 @@ config ARM_VIRT > select ARM_GIC > select ACPI > select ARM_SMMUV3 > + select ARM_SMMUV3_ACCEL > select GPIO_KEY > select DEVICE_TREE > select FW_CFG_DMA > @@ -596,6 +597,10 @@ config FSL_IMX7 > config ARM_SMMUV3 > bool > > +config ARM_SMMUV3_ACCEL > + select ARM_SMMUV3 > + bool > + > config FSL_IMX6UL > bool > default y > diff --git a/hw/arm/meson.build b/hw/arm/meson.build > index 465c757f97..e8593363b0 100644 > --- a/hw/arm/meson.build > +++ b/hw/arm/meson.build > @@ -55,6 +55,7 @@ arm_ss.add(when: 'CONFIG_MUSCA', if_true: files('musca.c')) > arm_ss.add(when: 'CONFIG_ARMSSE', if_true: files('armsse.c')) > arm_ss.add(when: 'CONFIG_FSL_IMX7', if_true: files('fsl-imx7.c', 'mcimx7d-sabre.c')) > arm_ss.add(when: 'CONFIG_ARM_SMMUV3', if_true: files('smmuv3.c')) > +arm_ss.add(when: 'CONFIG_ARM_SMMUV3_ACCEL', if_true: files('smmuv3-accel.c')) > arm_ss.add(when: 'CONFIG_FSL_IMX6UL', if_true: files('fsl-imx6ul.c', 'mcimx6ul-evk.c')) > arm_ss.add(when: 'CONFIG_NRF51_SOC', if_true: files('nrf51_soc.c')) > arm_ss.add(when: 'CONFIG_XEN', if_true: files( > diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c > index 8c1b407b82..f5caf1665c 100644 > --- a/hw/arm/smmu-common.c > +++ b/hw/arm/smmu-common.c > @@ -943,6 +943,7 @@ static const Property smmu_dev_properties[] = { > DEFINE_PROP_UINT8("bus_num", SMMUState, bus_num, 0), > DEFINE_PROP_LINK("primary-bus", SMMUState, primary_bus, > TYPE_PCI_BUS, PCIBus *), > + DEFINE_PROP_BOOL("accel", SMMUState, accel, false), > }; > > static void smmu_base_class_init(ObjectClass *klass, void *data) > diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c > new file mode 100644 > index 0000000000..c327661636 > --- /dev/null > +++ b/hw/arm/smmuv3-accel.c > @@ -0,0 +1,51 @@ > +/* > + * Copyright (c) 2025 Huawei Technologies R & D (UK) Ltd > + * Copyright (C) 2025 NVIDIA > + * Written by Nicolin Chen, Shameer Kolothum > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#include "qemu/osdep.h" > + > +#include "hw/arm/smmuv3-accel.h" > + > +static void smmu_accel_realize(DeviceState *d, Error **errp) > +{ > + SMMUv3AccelState *s_accel = ARM_SMMUV3_ACCEL(d); > + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_GET_CLASS(s_accel); > + SysBusDevice *dev = SYS_BUS_DEVICE(d); > + Error *local_err = NULL; > + > + 
object_property_set_bool(OBJECT(dev), "accel", true, &error_abort); you shouldn't need dev and simply use OBJECT(d) > + c->parent_realize(d, &local_err); > + if (local_err) { > + error_propagate(errp, local_err); > + return; > + } > +} > + > +static void smmuv3_accel_class_init(ObjectClass *klass, void *data) > +{ > + DeviceClass *dc = DEVICE_CLASS(klass); > + SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_CLASS(klass); > + > + device_class_set_parent_realize(dc, smmu_accel_realize, > + &c->parent_realize); > + dc->hotpluggable = false; > +} > + > +static const TypeInfo smmuv3_accel_type_info = { > + .name = TYPE_ARM_SMMUV3_ACCEL, > + .parent = TYPE_ARM_SMMUV3, > + .instance_size = sizeof(SMMUv3AccelState), > + .class_size = sizeof(SMMUv3AccelClass), > + .class_init = smmuv3_accel_class_init, > +}; > + > +static void smmuv3_accel_register_types(void) > +{ > + type_register_static(&smmuv3_accel_type_info); > +} > + > +type_init(smmuv3_accel_register_types) > diff --git a/include/hw/arm/smmu-common.h b/include/hw/arm/smmu-common.h > index d1a4a64551..b5c63cfd5d 100644 > --- a/include/hw/arm/smmu-common.h > +++ b/include/hw/arm/smmu-common.h > @@ -157,6 +157,9 @@ struct SMMUState { > QLIST_HEAD(, SMMUDevice) devices_with_notifiers; > uint8_t bus_num; > PCIBus *primary_bus; > + > + /* For smmuv3-accel */ > + bool accel; > }; > > struct SMMUBaseClass { > diff --git a/include/hw/arm/smmuv3-accel.h b/include/hw/arm/smmuv3-accel.h > new file mode 100644 > index 0000000000..56fe376bf4 > --- /dev/null > +++ b/include/hw/arm/smmuv3-accel.h > @@ -0,0 +1,31 @@ > +/* > + * Copyright (c) 2025 Huawei Technologies R & D (UK) Ltd > + * Copyright (C) 2025 NVIDIA > + * Written by Nicolin Chen, Shameer Kolothum > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#ifndef HW_ARM_SMMUV3_ACCEL_H > +#define HW_ARM_SMMUV3_ACCEL_H > + > +#include "hw/arm/smmu-common.h" > +#include "hw/arm/smmuv3.h" > +#include "qom/object.h" > + > +#define TYPE_ARM_SMMUV3_ACCEL "arm-smmuv3-accel" > +OBJECT_DECLARE_TYPE(SMMUv3AccelState, SMMUv3AccelClass, ARM_SMMUV3_ACCEL) > + > +struct SMMUv3AccelState { > + SMMUv3State smmuv3_state; > +}; > + > +struct SMMUv3AccelClass { > + /*< private >*/ > + SMMUv3Class smmuv3_class; > + /*< public >*/ > + > + DeviceRealize parent_realize; > +}; > + > +#endif /* HW_ARM_SMMUV3_ACCEL_H */ Thanks Eric
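For reference, a minimal sketch of the simplification suggested in the review above: drop the unused SysBusDevice cast and pass OBJECT(d) directly, leaving the rest of the realize hook as in the patch.

static void smmu_accel_realize(DeviceState *d, Error **errp)
{
    SMMUv3AccelClass *c = ARM_SMMUV3_ACCEL_GET_CLASS(d);
    Error *local_err = NULL;

    /* Flip the parent "accel" property before running the parent realize */
    object_property_set_bool(OBJECT(d), "accel", true, &error_abort);
    c->parent_realize(d, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
    }
}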
On Wed, Mar 12, 2025 at 04:15:10PM +0100, Eric Auger wrote:
> On 3/11/25 3:10 PM, Shameer Kolothum wrote:
> > Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel
> > device. In order to support vfio-pci dev assignment with a Guest
> guest
> > SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2)
> nested (s1+s2)
> > mode, with Guest owning the S1 page tables. Subsequent patches will
> the guest
> > add support for smmuv3-accel to provide this.
>
> Can't this -accel smmu also works with emulated devices? Do we want an
> exclusive usage?

Is there any benefit from emulated devices working in the HW-
accelerated nested translation mode?

Thanks
Nicolin
On 3/17/25 6:54 PM, Nicolin Chen wrote: > On Wed, Mar 12, 2025 at 04:15:10PM +0100, Eric Auger wrote: >> On 3/11/25 3:10 PM, Shameer Kolothum wrote: >>> Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel >>> device. In order to support vfio-pci dev assignment with a Guest >> guest >>> SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2) >> nested (s1+s2) >>> mode, with Guest owning the S1 page tables. Subsequent patches will >> the guest >>> add support for smmuv3-accel to provide this. >> Can't this -accel smmu also works with emulated devices? Do we want an >> exclusive usage? > Is there any benefit from emulated devices working in the HW- > accelerated nested translation mode? Not really but do we have any justification for using different device name in accel mode? I am not even sure that accel option is really needed. Ideally the qemu device should be able to detect it is protecting a VFIO device, in which case it shall check whether nested is supported by host SMMU and then automatically turn accel mode? I gave the example of the vfio device which has different class implementration depending on the iommufd option being set or not. Thanks Eric > > Thanks > Nicolin >
On Mon, Mar 17, 2025 at 07:07:52PM +0100, Eric Auger wrote: > On 3/17/25 6:54 PM, Nicolin Chen wrote: > > On Wed, Mar 12, 2025 at 04:15:10PM +0100, Eric Auger wrote: > >> On 3/11/25 3:10 PM, Shameer Kolothum wrote: > >>> Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel > >>> device. In order to support vfio-pci dev assignment with a Guest > >> guest > >>> SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2) > >> nested (s1+s2) > >>> mode, with Guest owning the S1 page tables. Subsequent patches will > >> the guest > >>> add support for smmuv3-accel to provide this. > >> Can't this -accel smmu also works with emulated devices? Do we want an > >> exclusive usage? > > Is there any benefit from emulated devices working in the HW- > > accelerated nested translation mode? > > Not really but do we have any justification for using different device > name in accel mode? I am not even sure that accel option is really > needed. Ideally the qemu device should be able to detect it is > protecting a VFIO device, in which case it shall check whether nested is > supported by host SMMU and then automatically turn accel mode? > > I gave the example of the vfio device which has different class > implementration depending on the iommufd option being set or not. Do you mean that we should just create a regular smmuv3 device and let a VFIO device to turn on this smmuv3's accel mode depending on its LEGACY/IOMMUFD class? Another question: how does an emulated device work with a vSMMUv3? I could imagine that all the accel steps would be bypassed since !sdev->idev. Yet, the emulated iotlb should cache its translation so we will need to flush the iotlb, which will increase complexity as the TLBI command dispatching function will need to be aware what ASID is for emulated device and what is for vfio device.. Thanks Nicolin
On 3/17/25 8:10 PM, Nicolin Chen wrote: > On Mon, Mar 17, 2025 at 07:07:52PM +0100, Eric Auger wrote: >> On 3/17/25 6:54 PM, Nicolin Chen wrote: >>> On Wed, Mar 12, 2025 at 04:15:10PM +0100, Eric Auger wrote: >>>> On 3/11/25 3:10 PM, Shameer Kolothum wrote: >>>>> Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel >>>>> device. In order to support vfio-pci dev assignment with a Guest >>>> guest >>>>> SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2) >>>> nested (s1+s2) >>>>> mode, with Guest owning the S1 page tables. Subsequent patches will >>>> the guest >>>>> add support for smmuv3-accel to provide this. >>>> Can't this -accel smmu also works with emulated devices? Do we want an >>>> exclusive usage? >>> Is there any benefit from emulated devices working in the HW- >>> accelerated nested translation mode? >> Not really but do we have any justification for using different device >> name in accel mode? I am not even sure that accel option is really >> needed. Ideally the qemu device should be able to detect it is >> protecting a VFIO device, in which case it shall check whether nested is >> supported by host SMMU and then automatically turn accel mode? >> >> I gave the example of the vfio device which has different class >> implementration depending on the iommufd option being set or not. > Do you mean that we should just create a regular smmuv3 device and > let a VFIO device to turn on this smmuv3's accel mode depending on > its LEGACY/IOMMUFD class? no this is not what I meant. I gave an example where depending on an option passed to thye VFIO device you choose one class implement or the other. > > Another question: how does an emulated device work with a vSMMUv3? I don't get your question. vSMMUv3 currently only works with emulated devices. Did you mean accelerated SMMUv3? > I could imagine that all the accel steps would be bypassed since > !sdev->idev. Yet, the emulated iotlb should cache its translation > so we will need to flush the iotlb, which will increase complexity > as the TLBI command dispatching function will need to be aware what > ASID is for emulated device and what is for vfio device.. I don't get the issue. For emulated device you go through the usual translate path which indeed caches configs and translations. In case the guest invalidates something, you know the SID and you find the entries in the cache that are tagged by this SID. In case you have an accelerated device (indeed if sdev->idev) you don't exercise that path. On invalidation you detect the SID matches a VFIO devoce, propagate the invalidations to the host instead. on the invalidation you should be able to detect pretty easily if you need to flush the emulated caches or propagate the invalidations. Do I miss some extra problematic? I do not say we should support emulated devices and VFIO devices in the same guest iommu group. But I don't see why we couldn't easily plug the accelerated logic in the current logical for emulation/vhost and do not require a different qemu device. Thanks Eric > > Thanks > Nicolin >
On Wed, Mar 19, 2025 at 05:45:51PM +0100, Eric Auger wrote: > > > > On 3/17/25 8:10 PM, Nicolin Chen wrote: > > On Mon, Mar 17, 2025 at 07:07:52PM +0100, Eric Auger wrote: > >> On 3/17/25 6:54 PM, Nicolin Chen wrote: > >>> On Wed, Mar 12, 2025 at 04:15:10PM +0100, Eric Auger wrote: > >>>> On 3/11/25 3:10 PM, Shameer Kolothum wrote: > >>>>> Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel > >>>>> device. In order to support vfio-pci dev assignment with a Guest > >>>> guest > >>>>> SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2) > >>>> nested (s1+s2) > >>>>> mode, with Guest owning the S1 page tables. Subsequent patches will > >>>> the guest > >>>>> add support for smmuv3-accel to provide this. > >>>> Can't this -accel smmu also works with emulated devices? Do we want an > >>>> exclusive usage? > >>> Is there any benefit from emulated devices working in the HW- > >>> accelerated nested translation mode? > >> Not really but do we have any justification for using different device > >> name in accel mode? I am not even sure that accel option is really > >> needed. Ideally the qemu device should be able to detect it is > >> protecting a VFIO device, in which case it shall check whether nested is > >> supported by host SMMU and then automatically turn accel mode? > >> > >> I gave the example of the vfio device which has different class > >> implementration depending on the iommufd option being set or not. > > Do you mean that we should just create a regular smmuv3 device and > > let a VFIO device to turn on this smmuv3's accel mode depending on > > its LEGACY/IOMMUFD class? > > no this is not what I meant. I gave an example where depending on an > option passed to thye VFIO device you choose one class implement or the > other. Option means something like this: -device smmuv3,accel=on instead of -device "smmuv3-accel" ? Yea, I think that's good. > > Another question: how does an emulated device work with a vSMMUv3? > I don't get your question. vSMMUv3 currently only works with emulated > devices. Did you mean accelerated SMMUv3? Yea. If "accel=on", how does an emulated device work with that? > > I could imagine that all the accel steps would be bypassed since > > !sdev->idev. Yet, the emulated iotlb should cache its translation > > so we will need to flush the iotlb, which will increase complexity > > as the TLBI command dispatching function will need to be aware what > > ASID is for emulated device and what is for vfio device.. > I don't get the issue. For emulated device you go through the usual > translate path which indeed caches configs and translations. In case the > guest invalidates something, you know the SID and you find the entries > in the cache that are tagged by this SID. > > In case you have an accelerated device (indeed if sdev->idev) you don't > exercise that path. On invalidation you detect the SID matches a VFIO > devoce, propagate the invalidations to the host instead. on the > invalidation you should be able to detect pretty easily if you need to > flush the emulated caches or propagate the invalidations. Do I miss some > extra problematic? > > I do not say we should support emulated devices and VFIO devices in the > same guest iommu group. But I don't see why we couldn't easily plug the > accelerated logic in the current logical for emulation/vhost and do not > require a different qemu device. Hmm, feels like I fundamentally misunderstood your point. 
a) We implement the device model with the same piece of code but only provide an option "accel=on/off" to switch mode. And both passthrough devices and emulated devices can attach to the same "accel=on" device. b) We implement the device model with the same piece of code but only provide an option "accel=on/off" to switch mode. Then, an passthrough device can attach to an "accel=on" device, but an emulated device can only attach to an "accel=off" SMMU device. I was thinking that you want case (a). But actually you were just talking about case (b)? I think (b) is totally fine. We certainly can't do case (a): not all TLBI commands gives an "SID" field (so would have to broadcast, i.e. underlying SMMU HW would run commands that were supposed for emulated devices only); in case of vCMDQ, commands for emulated devices would be issued to real HW and trigger HW errors. Thanks Nicolin
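For reference, a minimal sketch of what enforcing case (b) could look like in QEMU. The sdev->idev test comes from the discussion above; the function name, the error wording, and the attach hook it would be called from are hypothetical:

/* Hypothetical check: only host-IOMMU-backed (iommufd/VFIO) endpoints may
 * sit behind an accel=on SMMU instance; emulated endpoints are rejected. */
static bool smmuv3_accel_attach_ok(SMMUDevice *sdev, Error **errp)
{
    if (!sdev->idev) {
        error_setg(errp, "emulated endpoints must be placed behind a "
                   "non-accelerated smmuv3 instance");
        return false;
    }
    return true;
}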
Hi Nicolin, On 3/19/25 6:14 PM, Nicolin Chen wrote: > On Wed, Mar 19, 2025 at 05:45:51PM +0100, Eric Auger wrote: >> >> >> On 3/17/25 8:10 PM, Nicolin Chen wrote: >>> On Mon, Mar 17, 2025 at 07:07:52PM +0100, Eric Auger wrote: >>>> On 3/17/25 6:54 PM, Nicolin Chen wrote: >>>>> On Wed, Mar 12, 2025 at 04:15:10PM +0100, Eric Auger wrote: >>>>>> On 3/11/25 3:10 PM, Shameer Kolothum wrote: >>>>>>> Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel >>>>>>> device. In order to support vfio-pci dev assignment with a Guest >>>>>> guest >>>>>>> SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2) >>>>>> nested (s1+s2) >>>>>>> mode, with Guest owning the S1 page tables. Subsequent patches will >>>>>> the guest >>>>>>> add support for smmuv3-accel to provide this. >>>>>> Can't this -accel smmu also works with emulated devices? Do we want an >>>>>> exclusive usage? >>>>> Is there any benefit from emulated devices working in the HW- >>>>> accelerated nested translation mode? >>>> Not really but do we have any justification for using different device >>>> name in accel mode? I am not even sure that accel option is really >>>> needed. Ideally the qemu device should be able to detect it is >>>> protecting a VFIO device, in which case it shall check whether nested is >>>> supported by host SMMU and then automatically turn accel mode? >>>> >>>> I gave the example of the vfio device which has different class >>>> implementration depending on the iommufd option being set or not. >>> Do you mean that we should just create a regular smmuv3 device and >>> let a VFIO device to turn on this smmuv3's accel mode depending on >>> its LEGACY/IOMMUFD class? >> no this is not what I meant. I gave an example where depending on an >> option passed to thye VFIO device you choose one class implement or the >> other. > Option means something like this: > -device smmuv3,accel=on > instead of > -device "smmuv3-accel" > ? > > Yea, I think that's good. Yeah actually that's a big debate for not much. From an implementation pov that shall not change much. The only doubt I have is if we need to conditionnaly expose the MSI RESV regions it is easier to do if we detect we have a smmuv3-accel. what the option allows is the auto mode. > >>> Another question: how does an emulated device work with a vSMMUv3? >> I don't get your question. vSMMUv3 currently only works with emulated >> devices. Did you mean accelerated SMMUv3? > Yea. If "accel=on", how does an emulated device work with that? > >>> I could imagine that all the accel steps would be bypassed since >>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>> so we will need to flush the iotlb, which will increase complexity >>> as the TLBI command dispatching function will need to be aware what >>> ASID is for emulated device and what is for vfio device.. >> I don't get the issue. For emulated device you go through the usual >> translate path which indeed caches configs and translations. In case the >> guest invalidates something, you know the SID and you find the entries >> in the cache that are tagged by this SID. >> >> In case you have an accelerated device (indeed if sdev->idev) you don't >> exercise that path. On invalidation you detect the SID matches a VFIO >> devoce, propagate the invalidations to the host instead. on the >> invalidation you should be able to detect pretty easily if you need to >> flush the emulated caches or propagate the invalidations. Do I miss some >> extra problematic? 
>> >> I do not say we should support emulated devices and VFIO devices in the >> same guest iommu group. But I don't see why we couldn't easily plug the >> accelerated logic in the current logical for emulation/vhost and do not >> require a different qemu device. > Hmm, feels like I fundamentally misunderstood your point. > a) We implement the device model with the same piece of code but > only provide an option "accel=on/off" to switch mode. And both > passthrough devices and emulated devices can attach to the same > "accel=on" device. I think we all agree we don't want that use case in general. However effectively I was questioning why it couldn't work maybe at the expense of some perf degration. > b) We implement the device model with the same piece of code but > only provide an option "accel=on/off" to switch mode. Then, an > passthrough device can attach to an "accel=on" device, but an > emulated device can only attach to an "accel=off" SMMU device. > > I was thinking that you want case (a). But actually you were just > talking about case (b)? I think (b) is totally fine. > > We certainly can't do case (a): not all TLBI commands gives an "SID" > field (so would have to broadcast, i.e. underlying SMMU HW would run > commands that were supposed for emulated devices only); in case of > vCMDQ, commands for emulated devices would be issued to real HW and I am still confused about that. For instance if the guest sends an NH_ASID, NH_VA invalidation and it happens both the emulated device and VFIO-device share the same cd.asid (same guest iommu domain, which practically should not happen) why shouldn't we propagate the invalidation to the host. Does the problem come from the usage of vCMDQ or would you foresee the same problem with a generic physical SMMU? Thanks Eric > trigger HW errors. > > Thanks > Nicolin >
On 3/19/25 2:09 PM, Eric Auger wrote: > Hi Nicolin, > > > On 3/19/25 6:14 PM, Nicolin Chen wrote: >> On Wed, Mar 19, 2025 at 05:45:51PM +0100, Eric Auger wrote: >>> >>> >>> On 3/17/25 8:10 PM, Nicolin Chen wrote: >>>> On Mon, Mar 17, 2025 at 07:07:52PM +0100, Eric Auger wrote: >>>>> On 3/17/25 6:54 PM, Nicolin Chen wrote: >>>>>> On Wed, Mar 12, 2025 at 04:15:10PM +0100, Eric Auger wrote: >>>>>>> On 3/11/25 3:10 PM, Shameer Kolothum wrote: >>>>>>>> Based on SMMUv3 as a parent device, add a user-creatable smmuv3-accel >>>>>>>> device. In order to support vfio-pci dev assignment with a Guest >>>>>>> guest >>>>>>>> SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2) >>>>>>> nested (s1+s2) >>>>>>>> mode, with Guest owning the S1 page tables. Subsequent patches will >>>>>>> the guest >>>>>>>> add support for smmuv3-accel to provide this. >>>>>>> Can't this -accel smmu also works with emulated devices? Do we want an >>>>>>> exclusive usage? >>>>>> Is there any benefit from emulated devices working in the HW- >>>>>> accelerated nested translation mode? >>>>> Not really but do we have any justification for using different device >>>>> name in accel mode? I am not even sure that accel option is really >>>>> needed. Ideally the qemu device should be able to detect it is >>>>> protecting a VFIO device, in which case it shall check whether nested is >>>>> supported by host SMMU and then automatically turn accel mode? >>>>> >>>>> I gave the example of the vfio device which has different class >>>>> implementration depending on the iommufd option being set or not. >>>> Do you mean that we should just create a regular smmuv3 device and >>>> let a VFIO device to turn on this smmuv3's accel mode depending on >>>> its LEGACY/IOMMUFD class? >>> no this is not what I meant. I gave an example where depending on an >>> option passed to thye VFIO device you choose one class implement or the >>> other. >> Option means something like this: >> -device smmuv3,accel=on >> instead of >> -device "smmuv3-accel" >> ? >> >> Yea, I think that's good. > Yeah actually that's a big debate for not much. From an implementation > pov that shall not change much. The only doubt I have is if we need to > conditionnaly expose the MSI RESV regions it is easier to do if we > detect we have a smmuv3-accel. what the option allows is the auto mode. >> >>>> Another question: how does an emulated device work with a vSMMUv3? >>> I don't get your question. vSMMUv3 currently only works with emulated >>> devices. Did you mean accelerated SMMUv3? >> Yea. If "accel=on", how does an emulated device work with that? >> >>>> I could imagine that all the accel steps would be bypassed since >>>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>>> so we will need to flush the iotlb, which will increase complexity >>>> as the TLBI command dispatching function will need to be aware what >>>> ASID is for emulated device and what is for vfio device.. >>> I don't get the issue. For emulated device you go through the usual >>> translate path which indeed caches configs and translations. In case the >>> guest invalidates something, you know the SID and you find the entries >>> in the cache that are tagged by this SID. >>> >>> In case you have an accelerated device (indeed if sdev->idev) you don't >>> exercise that path. On invalidation you detect the SID matches a VFIO >>> devoce, propagate the invalidations to the host instead. 
on the >>> invalidation you should be able to detect pretty easily if you need to >>> flush the emulated caches or propagate the invalidations. Do I miss some >>> extra problematic? >>> >>> I do not say we should support emulated devices and VFIO devices in the >>> same guest iommu group. But I don't see why we couldn't easily plug the >>> accelerated logic in the current logical for emulation/vhost and do not >>> require a different qemu device. >> Hmm, feels like I fundamentally misunderstood your point. >> a) We implement the device model with the same piece of code but >> only provide an option "accel=on/off" to switch mode. And both >> passthrough devices and emulated devices can attach to the same >> "accel=on" device. > I think we all agree we don't want that use case in general. However > effectively I was questioning why it couldn't work maybe at the expense > of some perf degration. >> b) We implement the device model with the same piece of code but >> only provide an option "accel=on/off" to switch mode. Then, an >> passthrough device can attach to an "accel=on" device, but an >> emulated device can only attach to an "accel=off" SMMU device. >> >> I was thinking that you want case (a). But actually you were just >> talking about case (b)? I think (b) is totally fine. >> >> We certainly can't do case (a): not all TLBI commands gives an "SID" >> field (so would have to broadcast, i.e. underlying SMMU HW would run >> commands that were supposed for emulated devices only); in case of >> vCMDQ, commands for emulated devices would be issued to real HW and > I am still confused about that. For instance if the guest sends an > NH_ASID, NH_VA invalidation and it happens both the emulated device and > VFIO-device share the same cd.asid (same guest iommu domain, which > practically should not happen) why shouldn't we propagate the it can't ... on ARM ... PCIe only, no shared iommu domain btwn devices. Isn't this another reason (perf) why emulated devices & physical devices should be on different vSMMU's ... so it can be distinguished on how deep (to hw) or how wide(a broadcast) actions like TLBI is implemented, or impacts other devices ? > invalidation to the host. Does the problem come from the usage of vCMDQ > or would you foresee the same problem with a generic physical SMMU? > > Thanks > > Eric >> trigger HW errors. >> >> Thanks >> Nicolin >> >
On 3/21/25 2:26 AM, Donald Dutile wrote: > > > On 3/19/25 2:09 PM, Eric Auger wrote: >> Hi Nicolin, >> >> >> On 3/19/25 6:14 PM, Nicolin Chen wrote: >>> On Wed, Mar 19, 2025 at 05:45:51PM +0100, Eric Auger wrote: >>>> >>>> >>>> On 3/17/25 8:10 PM, Nicolin Chen wrote: >>>>> On Mon, Mar 17, 2025 at 07:07:52PM +0100, Eric Auger wrote: >>>>>> On 3/17/25 6:54 PM, Nicolin Chen wrote: >>>>>>> On Wed, Mar 12, 2025 at 04:15:10PM +0100, Eric Auger wrote: >>>>>>>> On 3/11/25 3:10 PM, Shameer Kolothum wrote: >>>>>>>>> Based on SMMUv3 as a parent device, add a user-creatable >>>>>>>>> smmuv3-accel >>>>>>>>> device. In order to support vfio-pci dev assignment with a Guest >>>>>>>> guest >>>>>>>>> SMMUv3, the physical SMMUv3 has to be configured in nested(S1+s2) >>>>>>>> nested (s1+s2) >>>>>>>>> mode, with Guest owning the S1 page tables. Subsequent patches >>>>>>>>> will >>>>>>>> the guest >>>>>>>>> add support for smmuv3-accel to provide this. >>>>>>>> Can't this -accel smmu also works with emulated devices? Do we >>>>>>>> want an >>>>>>>> exclusive usage? >>>>>>> Is there any benefit from emulated devices working in the HW- >>>>>>> accelerated nested translation mode? >>>>>> Not really but do we have any justification for using different >>>>>> device >>>>>> name in accel mode? I am not even sure that accel option is really >>>>>> needed. Ideally the qemu device should be able to detect it is >>>>>> protecting a VFIO device, in which case it shall check whether >>>>>> nested is >>>>>> supported by host SMMU and then automatically turn accel mode? >>>>>> >>>>>> I gave the example of the vfio device which has different class >>>>>> implementration depending on the iommufd option being set or not. >>>>> Do you mean that we should just create a regular smmuv3 device and >>>>> let a VFIO device to turn on this smmuv3's accel mode depending on >>>>> its LEGACY/IOMMUFD class? >>>> no this is not what I meant. I gave an example where depending on an >>>> option passed to thye VFIO device you choose one class implement or >>>> the >>>> other. >>> Option means something like this: >>> -device smmuv3,accel=on >>> instead of >>> -device "smmuv3-accel" >>> ? >>> >>> Yea, I think that's good. >> Yeah actually that's a big debate for not much. From an implementation >> pov that shall not change much. The only doubt I have is if we need to >> conditionnaly expose the MSI RESV regions it is easier to do if we >> detect we have a smmuv3-accel. what the option allows is the auto mode. >>> >>>>> Another question: how does an emulated device work with a vSMMUv3? >>>> I don't get your question. vSMMUv3 currently only works with emulated >>>> devices. Did you mean accelerated SMMUv3? >>> Yea. If "accel=on", how does an emulated device work with that? >>> >>>>> I could imagine that all the accel steps would be bypassed since >>>>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>>>> so we will need to flush the iotlb, which will increase complexity >>>>> as the TLBI command dispatching function will need to be aware what >>>>> ASID is for emulated device and what is for vfio device.. >>>> I don't get the issue. For emulated device you go through the usual >>>> translate path which indeed caches configs and translations. In >>>> case the >>>> guest invalidates something, you know the SID and you find the entries >>>> in the cache that are tagged by this SID. >>>> >>>> In case you have an accelerated device (indeed if sdev->idev) you >>>> don't >>>> exercise that path. 
On invalidation you detect the SID matches a VFIO >>>> devoce, propagate the invalidations to the host instead. on the >>>> invalidation you should be able to detect pretty easily if you need to >>>> flush the emulated caches or propagate the invalidations. Do I miss >>>> some >>>> extra problematic? >>>> >>>> I do not say we should support emulated devices and VFIO devices in >>>> the >>>> same guest iommu group. But I don't see why we couldn't easily plug >>>> the >>>> accelerated logic in the current logical for emulation/vhost and do >>>> not >>>> require a different qemu device. >>> Hmm, feels like I fundamentally misunderstood your point. >>> a) We implement the device model with the same piece of code but >>> only provide an option "accel=on/off" to switch mode. And both >>> passthrough devices and emulated devices can attach to the same >>> "accel=on" device. >> I think we all agree we don't want that use case in general. However >> effectively I was questioning why it couldn't work maybe at the expense >> of some perf degration. >>> b) We implement the device model with the same piece of code but >>> only provide an option "accel=on/off" to switch mode. Then, an >>> passthrough device can attach to an "accel=on" device, but an >>> emulated device can only attach to an "accel=off" SMMU device. >>> >>> I was thinking that you want case (a). But actually you were just >>> talking about case (b)? I think (b) is totally fine. >>> >>> We certainly can't do case (a): not all TLBI commands gives an "SID" >>> field (so would have to broadcast, i.e. underlying SMMU HW would run >>> commands that were supposed for emulated devices only); in case of >>> vCMDQ, commands for emulated devices would be issued to real HW and >> I am still confused about that. For instance if the guest sends an >> NH_ASID, NH_VA invalidation and it happens both the emulated device and >> VFIO-device share the same cd.asid (same guest iommu domain, which >> practically should not happen) why shouldn't we propagate the > it can't ... on ARM ... PCIe only, no shared iommu domain btwn devices. yeah I agree this generally happens behind a PCIe to PCI bridge. > > Isn't this another reason (perf) why emulated devices & physical > devices should > be on different vSMMU's ... so it can be distinguished on how deep (to > hw) > or how wide(a broadcast) actions like TLBI is implemented, or impacts > other devices ? To me the actual issue is vcmdq. Here we have a blocker. Otherwise if you don't have vcmdq you still can propage invalidations using the proper notifier (VFIO or vhost). This used to work Eric > > >> invalidation to the host. Does the problem come from the usage of vCMDQ >> or would you foresee the same problem with a generic physical SMMU? >> >> Thanks >> >> Eric >>> trigger HW errors. >>> >>> Thanks >>> Nicolin >>> >> >
On Wed, Mar 19, 2025 at 07:09:33PM +0100, Eric Auger wrote: > > Option means something like this: > > -device smmuv3,accel=on > > instead of > > -device "smmuv3-accel" > > ? > > > > Yea, I think that's good. > Yeah actually that's a big debate for not much. From an implementation > pov that shall not change much. The only doubt I have is if we need to > conditionnaly expose the MSI RESV regions it is easier to do if we > detect we have a smmuv3-accel. what the option allows is the auto mode. Mind elaborating your doubt about the MSI RESV region? Do you mean how VMS code should tag "accel=on" option and generate RMR nodes in the IORT table? > > We certainly can't do case (a): not all TLBI commands gives an "SID" > > field (so would have to broadcast, i.e. underlying SMMU HW would run > > commands that were supposed for emulated devices only); in case of > > vCMDQ, commands for emulated devices would be issued to real HW and > I am still confused about that. For instance if the guest sends an > NH_ASID, NH_VA invalidation and it happens both the emulated device and > VFIO-device share the same cd.asid (same guest iommu domain, which > practically should not happen) why shouldn't we propagate the > invalidation to the host. Does the problem come from the usage of vCMDQ > or would you foresee the same problem with a generic physical SMMU? Host (HW) would end up with executing commands that were issued for emulated devices, which impacts performance. With vCMDQ, QEMU cannot trap command queue because all invalidation commands will be issued to HW directly from the guest kernel driver. This includes TLBI and ATC_INV commands. It's probably okay to run TLBI commands with vCMDQ (again perf impact), while ATC_INV commands would result in "unkonwn SID" errors or directly ATC_INV timeouts. Thanks Nicolin
On 3/19/25 7:34 PM, Nicolin Chen wrote: > On Wed, Mar 19, 2025 at 07:09:33PM +0100, Eric Auger wrote: >>> Option means something like this: >>> -device smmuv3,accel=on >>> instead of >>> -device "smmuv3-accel" >>> ? >>> >>> Yea, I think that's good. >> Yeah actually that's a big debate for not much. From an implementation >> pov that shall not change much. The only doubt I have is if we need to >> conditionnaly expose the MSI RESV regions it is easier to do if we >> detect we have a smmuv3-accel. what the option allows is the auto mode. > Mind elaborating your doubt about the MSI RESV region? > > Do you mean how VMS code should tag "accel=on" option and generate > RMR nodes in the IORT table? yes that was my point. Earlier we detected whether a "nested-smmu" was part of the object hierarchy. Now we do the same with smmu type and check if accel=on. I guess we can retrieve the property value but this is worth to test. > >>> We certainly can't do case (a): not all TLBI commands gives an "SID" >>> field (so would have to broadcast, i.e. underlying SMMU HW would run >>> commands that were supposed for emulated devices only); in case of >>> vCMDQ, commands for emulated devices would be issued to real HW and >> I am still confused about that. For instance if the guest sends an >> NH_ASID, NH_VA invalidation and it happens both the emulated device and >> VFIO-device share the same cd.asid (same guest iommu domain, which >> practically should not happen) why shouldn't we propagate the >> invalidation to the host. Does the problem come from the usage of vCMDQ >> or would you foresee the same problem with a generic physical SMMU? > Host (HW) would end up with executing commands that were issued for > emulated devices, which impacts performance. > > With vCMDQ, QEMU cannot trap command queue because all invalidation > commands will be issued to HW directly from the guest kernel driver. > This includes TLBI and ATC_INV commands. It's probably okay to run > TLBI commands with vCMDQ (again perf impact), while ATC_INV commands > would result in "unkonwn SID" errors or directly ATC_INV timeouts. OK understood. Thanks and sorry for the misunderstanding Eric > > Thanks > Nicolin >
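For reference, a minimal sketch of the detection Eric describes for the IORT/RMR generation, assuming the machine code can reach the SMMU device object. The helper name is hypothetical, and both checks (QOM type and property read-back) are shown since the thread has not settled on a dedicated device versus an accel=on option:

/* True if the SMMU needs RMR nodes for the MSI reserved regions: either it
 * is the dedicated accel type, or a plain smmuv3 with accel=on. */
static bool smmu_needs_msi_rmr(DeviceState *smmu)
{
    return object_dynamic_cast(OBJECT(smmu), TYPE_ARM_SMMUV3_ACCEL) != NULL ||
           object_property_get_bool(OBJECT(smmu), "accel", NULL);
}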
> -----Original Message----- > From: Eric Auger <eric.auger@redhat.com> > Sent: Wednesday, March 19, 2025 4:46 PM > To: Nicolin Chen <nicolinc@nvidia.com> > Cc: Shameerali Kolothum Thodi > <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org; > qemu-devel@nongnu.org; peter.maydell@linaro.org; jgg@nvidia.com; > ddutile@redhat.com; berrange@redhat.com; nathanc@nvidia.com; > mochs@nvidia.com; smostafa@google.com; Linuxarm > <linuxarm@huawei.com>; Wangzhou (B) <wangzhou1@hisilicon.com>; > jiangkunkun <jiangkunkun@huawei.com>; Jonathan Cameron > <jonathan.cameron@huawei.com>; zhangfei.gao@linaro.org > Subject: Re: [RFC PATCH v2 03/20] hw/arm/smmuv3-accel: Add initial > infrastructure for smmuv3-accel device > >>> Is there any benefit from emulated devices working in the HW- > >>> accelerated nested translation mode? > >> Not really but do we have any justification for using different device > >> name in accel mode? I am not even sure that accel option is really > >> needed. Ideally the qemu device should be able to detect it is > >> protecting a VFIO device, in which case it shall check whether nested is > >> supported by host SMMU and then automatically turn accel mode? > >> > >> I gave the example of the vfio device which has different class > >> implementration depending on the iommufd option being set or not. > > Do you mean that we should just create a regular smmuv3 device and > > let a VFIO device to turn on this smmuv3's accel mode depending on > > its LEGACY/IOMMUFD class? > > no this is not what I meant. I gave an example where depending on an > option passed to thye VFIO device you choose one class implement or the > other. > > > > Another question: how does an emulated device work with a vSMMUv3? > I don't get your question. vSMMUv3 currently only works with emulated > devices. Did you mean accelerated SMMUv3? > > I could imagine that all the accel steps would be bypassed since > > !sdev->idev. Yet, the emulated iotlb should cache its translation > > so we will need to flush the iotlb, which will increase complexity > > as the TLBI command dispatching function will need to be aware what > > ASID is for emulated device and what is for vfio device.. > I don't get the issue. For emulated device you go through the usual > translate path which indeed caches configs and translations. In case the > guest invalidates something, you know the SID and you find the entries > in the cache that are tagged by this SID. Not always you get sid, eg: CMD_TLBI_NH_ASID Thanks, Shameer
Hi Shameer, On 3/19/25 5:53 PM, Shameerali Kolothum Thodi wrote: > >> -----Original Message----- >> From: Eric Auger <eric.auger@redhat.com> >> Sent: Wednesday, March 19, 2025 4:46 PM >> To: Nicolin Chen <nicolinc@nvidia.com> >> Cc: Shameerali Kolothum Thodi >> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org; >> qemu-devel@nongnu.org; peter.maydell@linaro.org; jgg@nvidia.com; >> ddutile@redhat.com; berrange@redhat.com; nathanc@nvidia.com; >> mochs@nvidia.com; smostafa@google.com; Linuxarm >> <linuxarm@huawei.com>; Wangzhou (B) <wangzhou1@hisilicon.com>; >> jiangkunkun <jiangkunkun@huawei.com>; Jonathan Cameron >> <jonathan.cameron@huawei.com>; zhangfei.gao@linaro.org >> Subject: Re: [RFC PATCH v2 03/20] hw/arm/smmuv3-accel: Add initial >> infrastructure for smmuv3-accel device >>>>> Is there any benefit from emulated devices working in the HW- >>>>> accelerated nested translation mode? >>>> Not really but do we have any justification for using different device >>>> name in accel mode? I am not even sure that accel option is really >>>> needed. Ideally the qemu device should be able to detect it is >>>> protecting a VFIO device, in which case it shall check whether nested is >>>> supported by host SMMU and then automatically turn accel mode? >>>> >>>> I gave the example of the vfio device which has different class >>>> implementration depending on the iommufd option being set or not. >>> Do you mean that we should just create a regular smmuv3 device and >>> let a VFIO device to turn on this smmuv3's accel mode depending on >>> its LEGACY/IOMMUFD class? >> no this is not what I meant. I gave an example where depending on an >> option passed to thye VFIO device you choose one class implement or the >> other. >>> Another question: how does an emulated device work with a vSMMUv3? >> I don't get your question. vSMMUv3 currently only works with emulated >> devices. Did you mean accelerated SMMUv3? >>> I could imagine that all the accel steps would be bypassed since >>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>> so we will need to flush the iotlb, which will increase complexity >>> as the TLBI command dispatching function will need to be aware what >>> ASID is for emulated device and what is for vfio device.. >> I don't get the issue. For emulated device you go through the usual >> translate path which indeed caches configs and translations. In case the >> guest invalidates something, you know the SID and you find the entries >> in the cache that are tagged by this SID. > Not always you get sid, eg: CMD_TLBI_NH_ASID Effectively with ASID invalidation you potentially need to do both qemu IOTLB invalidation and host invalidation propagation. but this code is already in place in the code and used in vhost mode: smmu_inv_notifiers_all(&s->smmu_state); smmu_iotlb_inv_asid_vmid(bs, asid, vmid); but as stated before in VFIO accel mode the cache is not filled so I don't expect a huge penalty Besides we can also disable qemu caches if it turns the accel mode is in use, no? Eric > > Thanks, > Shameer
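To make the CMD_TLBI_NH_ASID case concrete, a minimal sketch of the dispatch path under discussion. The two flush helpers are the ones Eric quotes from the existing vhost path; smmuv3_accel_inv_asid() is a hypothetical stand-in for whatever the accel series would use to propagate the invalidation to the host:

static void smmuv3_handle_tlbi_nh_asid(SMMUv3State *s, int asid, int vmid)
{
    SMMUState *bs = &s->smmu_state;

    /* No SID in this command: flush the emulated caches / notify vhost */
    smmu_inv_notifiers_all(bs);
    smmu_iotlb_inv_asid_vmid(bs, asid, vmid);

    /* Still without a SID to key on, any accelerated (VFIO) endpoint must
     * get the invalidation too (hypothetical helper). */
    if (bs->accel) {
        smmuv3_accel_inv_asid(s, asid);
    }
}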
On Wed, Mar 19, 2025 at 06:26:48PM +0100, Eric Auger wrote:
> Effectively with ASID invalidation you potentially need to do both qemu
> IOTLB invalidation and host invalidation propagation.
> but this code is already in place in the code and used in vhost mode:

Let's not forget the focus here, the point of the accel mode is to
run fast, especially fast invalidation.

Doing a bunch of extra stuff on hot paths just to support mixing
virtual devices with physical doesn't seem like a great direction..

Jason
On 3/19/25 6:34 PM, Jason Gunthorpe wrote:
> On Wed, Mar 19, 2025 at 06:26:48PM +0100, Eric Auger wrote:
>> Effectively with ASID invalidation you potentially need to do both qemu
>> IOTLB invalidation and host invalidation propagation.
>> but this code is already in place in the code and used in vhost mode:
> Let's not forget the focus here, the point of the accel mode is to
> run fast, especially fast invalidation.
>
> Doing a bunch of extra stuff on hot paths just to support mixing
> virtual devices with physical doesn't seem like a great direction..

fair enough. Then let's disable the internal caches if we are in accel mode.

Eric

>
> Jason
>
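A minimal sketch of the "disable the internal caches in accel mode" idea, assuming it is done at the common IOTLB insertion point in smmu-common.c; the exact signature of smmu_iotlb_insert() is an assumption and may differ:

void smmu_iotlb_insert(SMMUState *bs, SMMUTransCfg *cfg, SMMUTLBEntry *new)
{
    if (bs->accel) {
        /* Translations are done by the host SMMU in accel mode, so keep the
         * emulated IOTLB empty; there is then nothing extra to invalidate. */
        return;
    }
    /* ... existing insertion logic unchanged ... */
}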
On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote:
> Another question: how does an emulated device work with a vSMMUv3?
> I could imagine that all the accel steps would be bypassed since
> !sdev->idev. Yet, the emulated iotlb should cache its translation
> so we will need to flush the iotlb, which will increase complexity
> as the TLBI command dispatching function will need to be aware what
> ASID is for emulated device and what is for vfio device..

I think you should block it. We already expect different vSMMU's
depending on the physical SMMU under the PCI device, it makes sense
that a SW VFIO device would have it's own, non-accelerated, vSMMU
model in the guest.

Jason
On 3/17/25 3:24 PM, Jason Gunthorpe wrote:
> On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote:
>> Another question: how does an emulated device work with a vSMMUv3?
>> I could imagine that all the accel steps would be bypassed since
>> !sdev->idev. Yet, the emulated iotlb should cache its translation
>> so we will need to flush the iotlb, which will increase complexity
>> as the TLBI command dispatching function will need to be aware what
>> ASID is for emulated device and what is for vfio device..
>
> I think you should block it. We already expect different vSMMU's

... and when you say 'block', you mean qemu prints out a helpful message like
"Mixing emulated/virtual devices and physical devices on a single SMMUv3 is
not allowed. Specify separate smmuv3 objects for each type of device; multiple
smmuv3 objects may be required for each physical device if they are attached
to different smmuv3's in the host system."

Or would that be an allowed qemu machine definition, but the 'block' would be
a warning like:
"Mixing emulated/virtual devices and physical devices on a single SMMUv3 is
not recommended for performance reasons. To yield optimal performance, place
physical devices on separate SMMUv3 objects from emulated/virtual device
SMMUv3 objects."

... and in this case, the physical devices would not use the accel features of
an smmuv3, but still be 'functional'. This may be desired for a machine
definition that wants to be used on different hosts that may not have the
(same) accel feature(s).

> depending on the physical SMMU under the PCI device, it makes sense
> that a SW VFIO device would have it's own, non-accelerated, vSMMU
> model in the guest.
>
> Jason
>
On Mon, Mar 17, 2025 at 04:24:53PM -0300, Jason Gunthorpe wrote: > On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote: > > Another question: how does an emulated device work with a vSMMUv3? > > I could imagine that all the accel steps would be bypassed since > > !sdev->idev. Yet, the emulated iotlb should cache its translation > > so we will need to flush the iotlb, which will increase complexity > > as the TLBI command dispatching function will need to be aware what > > ASID is for emulated device and what is for vfio device.. > > I think you should block it. We already expect different vSMMU's > depending on the physical SMMU under the PCI device, it makes sense > that a SW VFIO device would have it's own, non-accelerated, vSMMU > model in the guest. Yea, I agree and it'd be cleaner for an implementation separating them. In my mind, the general idea of "accel=on" is also to keep things in a more efficient way: passthrough devices go to HW-accelerated vSMMUs (separated PCIE buses), while emulated ones go to a vSMMU- bypassed (PCIE0). Though I do see the point from QEMU prospective that user may want to start a VM with HW-accelerated vSMMU for one passthrough device using a simple setup without caring about the routing via command. Thanks Nicolin
Hi, On 3/17/25 9:19 PM, Nicolin Chen wrote: > On Mon, Mar 17, 2025 at 04:24:53PM -0300, Jason Gunthorpe wrote: >> On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote: >>> Another question: how does an emulated device work with a vSMMUv3? >>> I could imagine that all the accel steps would be bypassed since >>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>> so we will need to flush the iotlb, which will increase complexity >>> as the TLBI command dispatching function will need to be aware what >>> ASID is for emulated device and what is for vfio device.. >> I think you should block it. We already expect different vSMMU's >> depending on the physical SMMU under the PCI device, it makes sense >> that a SW VFIO device would have it's own, non-accelerated, vSMMU >> model in the guest. > Yea, I agree and it'd be cleaner for an implementation separating > them. > > In my mind, the general idea of "accel=on" is also to keep things > in a more efficient way: passthrough devices go to HW-accelerated > vSMMUs (separated PCIE buses), while emulated ones go to a vSMMU- > bypassed (PCIE0). Originally a specific SMMU device was needed to opt in for MSI reserved region ACPI IORT description which are not needed if you don't rely on S1+S2. However if we don't rely on this trick this was not even needed with legacy integration (https://patchwork.kernel.org/project/qemu-devel/cover/20180921081819.9203-1-eric.auger@redhat.com/). Nevertheless I don't think anything prevents the acceleration granted device from also working with virtio/vhost devices for instance unless you unplug the existing infra. The translation and invalidation just should use different control paths (explicit translation requests, invalidation notifications towards vhost, ...). Again, what does legitimate to have different qemu devices for the same IP? I understand that it simplifies the implementation but I am not sure this is a good reason. Nevertheless it worth challenging. What is the plan for intel iommu? Will we have 2 devices, the legacy device and one for nested? Thanks Eric > > Though I do see the point from QEMU prospective that user may want > to start a VM with HW-accelerated vSMMU for one passthrough device > using a simple setup without caring about the routing via command. > > Thanks > Nicolin >
On Tue, Mar 18, 2025 at 07:31:36PM +0100, Eric Auger wrote: > Nevertheless I don't think anything prevents the acceleration granted > device from also working with virtio/vhost devices for instance unless > you unplug the existing infra. If the accel mode is using something like vcmdq then it is not possible to work since the invalidations won't even be trapped. Even in the case where we trap the invalidations it sure is complicated.. invalidation is done by ASID which is not obviously related to any specific device. An ASID could be hidden inside a CD table that is being HW accessed and also inside a CD table that is SW accessed. The VMM has no way to know what is going on so you'd end up forced to replicate all the ASID invalidations. :\ It just doesn't seem worthwhile to try to make it all work. I'd suggest arranging to share some of the SMMUv3 emulation code, maybe with a library/headerfile or something, but I think it does make sense they would be different implementations given how completely different they should be. Jason
Hi Jason, On 3/19/25 1:31 AM, Jason Gunthorpe wrote: > On Tue, Mar 18, 2025 at 07:31:36PM +0100, Eric Auger wrote: >> Nevertheless I don't think anything prevents the acceleration granted >> device from also working with virtio/vhost devices for instance unless >> you unplug the existing infra. > If the accel mode is using something like vcmdq then it is not > possible to work since the invalidations won't even be trapped. I acknowledge I was more focused on the case without vcmdq, which was addressed in the past, and now I better see the problem. > > Even in the case where we trap the invalidations it sure is > complicated.. invalidation is done by ASID which is not obviously > related to any specific device. An ASID could be hidden inside a CD > table that is being HW accessed and also inside a CD table that is SW > accessed. The VMM has no way to know what is going on so you'd end up > forced to replicate all the ASID invalidations. :\ Nevertheless I think we shall also support the case without vcmdq (currently supported in this series). And this one looks more compatible with emulated devices, although less optimized. > > It just doesn't seem worthwhile to try to make it all work. > > I'd suggest arranging to share some of the SMMUv3 emulation code, > maybe with a library/headerfile or something, but I think it does make > sense they would be different implementations given how completely > different they should be. I agree we can do our utmost to separate implementations. I am more concerned about having libvirt guess what kind of device it shall use. On x86 libvirt needs to use -device intel-iommu,caching-mode=on if one wants to protect a VFIO device. So this looks similar to adding accel=on on ARM. Eric > > Jason >
On Tue, Mar 18, 2025 at 09:31:35PM -0300, Jason Gunthorpe wrote: > On Tue, Mar 18, 2025 at 07:31:36PM +0100, Eric Auger wrote: > > Nevertheless I don't think anything prevents the acceleration granted > > device from also working with virtio/vhost devices for instance unless > > you unplug the existing infra. > > If the accel mode is using something like vcmdq then it is not > possible to work since the invalidations won't even be trapped. Yea, I totally forgot that.. All the invalidation commands that belong to emulated devices would be issued to VCMDQ (HW), while those vSIDs wouldn't be supported by the HW for CFGI_CD/ATC_INV, which will trigger errors/timeouts. Thanks Nicolin
On Tue, Mar 18, 2025 at 07:31:36PM +0100, Eric Auger wrote: > On 3/17/25 9:19 PM, Nicolin Chen wrote: > > On Mon, Mar 17, 2025 at 04:24:53PM -0300, Jason Gunthorpe wrote: > >> On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote: > >>> Another question: how does an emulated device work with a vSMMUv3? > >>> I could imagine that all the accel steps would be bypassed since > >>> !sdev->idev. Yet, the emulated iotlb should cache its translation > >>> so we will need to flush the iotlb, which will increase complexity > >>> as the TLBI command dispatching function will need to be aware what > >>> ASID is for emulated device and what is for vfio device.. > >> I think you should block it. We already expect different vSMMU's > >> depending on the physical SMMU under the PCI device, it makes sense > >> that a SW VFIO device would have it's own, non-accelerated, vSMMU > >> model in the guest. > > Yea, I agree and it'd be cleaner for an implementation separating > > them. > > > > In my mind, the general idea of "accel=on" is also to keep things > > in a more efficient way: passthrough devices go to HW-accelerated > > vSMMUs (separated PCIE buses), while emulated ones go to a vSMMU- > > bypassed (PCIE0). > Originally a specific SMMU device was needed to opt in for MSI reserved > region ACPI IORT description which are not needed if you don't rely on > S1+S2. However if we don't rely on this trick this was not even needed > with legacy integration > (https://patchwork.kernel.org/project/qemu-devel/cover/20180921081819.9203-1-eric.auger@redhat.com/). > > Nevertheless I don't think anything prevents the acceleration granted > device from also working with virtio/vhost devices for instance unless > you unplug the existing infra. The translation and invalidation just > should use different control paths (explicit translation requests, > invalidation notifications towards vhost, ...). smmuv3_translate() is per sdev, so it's easy. Invalidation is done via commands, which could be tricky: a) Broadcast command b) ASID validation -- we'll need to keep track of a list of ASIDs for vfio device to compare the ASID in each per-ASID command, potentially by trapping all CFGI_CD(_ALL) commands? Note that each vfio device may have multiple ASIDs (for multiple CDs). Either a or b above will have some validation efficiency impact. > Again, what does legitimate to have different qemu devices for the same > IP? I understand that it simplifies the implementation but I am not sure > this is a good reason. Nevertheless it worth challenging. What is the > plan for intel iommu? Will we have 2 devices, the legacy device and one > for nested? Hmm, it seems that there are two different topics: 1. Use one SMMU device model (source code file; "iommu=" string) for both an emulated vSMMU and an HW-accelerated vSMMU. 2. Allow one vSMMU instance to work with both an emulated device and a passthrough device. And I get that you want both 1 and 2. I'm totally okay with 1, yet see no compelling benefit from 2 for the increased complexity in the invalidation routine. And another question about the mixed device attachment. Let's say we have in the host: VFIO passthrough dev0 -> pSMMU0 VFIO passthrough dev1 -> pSMMU1 Should we allow emulated devices to be flexibly plugged? dev0 -> vSMMU0 /* Hard requirement */ dev1 -> vSMMU1 /* Hard requirement */ emu0 -> vSMMU0 /* Soft requirement; can be vSMMU1 also */ emu1 -> vSMMU1 /* Soft requirement; can be vSMMU0 also */ Thanks Nicolin
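To make option (b) above concrete, here is a minimal, self-contained sketch of the extra bookkeeping it implies. Every name in it (vfio_asid_track, dispatch_tlbi_nh_asid, the "forward to host" step) is hypothetical and only illustrates the idea of filtering per-ASID invalidations, not the actual QEMU SMMUv3 code:

/*
 * Hypothetical sketch of option (b): the VMM keeps a set of ASIDs that
 * belong to CDs owned by vfio (accelerated) devices, and only forwards
 * per-ASID TLBI commands for those ASIDs to the host. Illustrative only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_VFIO_ASIDS 64

static uint16_t vfio_asids[MAX_VFIO_ASIDS];
static unsigned int nr_vfio_asids;

/* Called when a trapped CFGI_CD for a vfio-backed SID installs a CD with this ASID */
static void vfio_asid_track(uint16_t asid)
{
    for (unsigned int i = 0; i < nr_vfio_asids; i++) {
        if (vfio_asids[i] == asid) {
            return;
        }
    }
    if (nr_vfio_asids < MAX_VFIO_ASIDS) {
        vfio_asids[nr_vfio_asids++] = asid;
    }
}

static bool vfio_asid_owned(uint16_t asid)
{
    for (unsigned int i = 0; i < nr_vfio_asids; i++) {
        if (vfio_asids[i] == asid) {
            return true;
        }
    }
    return false;
}

/*
 * Dispatch a guest TLBI_NH_ASID: the emulated IOTLB is always flushed,
 * while the host is only involved when the ASID is known to be vfio-owned.
 */
static void dispatch_tlbi_nh_asid(uint16_t asid)
{
    printf("flush emulated IOTLB for ASID %u\n", asid);
    if (vfio_asid_owned(asid)) {
        printf("forward invalidation for ASID %u to the host\n", asid);
    }
}

int main(void)
{
    vfio_asid_track(5);
    dispatch_tlbi_nh_asid(5);   /* flush + forward */
    dispatch_tlbi_nh_asid(7);   /* flush only */
    return 0;
}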
On 3/18/25 3:13 PM, Nicolin Chen wrote: > On Tue, Mar 18, 2025 at 07:31:36PM +0100, Eric Auger wrote: >> On 3/17/25 9:19 PM, Nicolin Chen wrote: >>> On Mon, Mar 17, 2025 at 04:24:53PM -0300, Jason Gunthorpe wrote: >>>> On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote: >>>>> Another question: how does an emulated device work with a vSMMUv3? >>>>> I could imagine that all the accel steps would be bypassed since >>>>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>>>> so we will need to flush the iotlb, which will increase complexity >>>>> as the TLBI command dispatching function will need to be aware what >>>>> ASID is for emulated device and what is for vfio device.. >>>> I think you should block it. We already expect different vSMMU's >>>> depending on the physical SMMU under the PCI device, it makes sense >>>> that a SW VFIO device would have it's own, non-accelerated, vSMMU >>>> model in the guest. >>> Yea, I agree and it'd be cleaner for an implementation separating >>> them. >>> >>> In my mind, the general idea of "accel=on" is also to keep things >>> in a more efficient way: passthrough devices go to HW-accelerated >>> vSMMUs (separated PCIE buses), while emulated ones go to a vSMMU- >>> bypassed (PCIE0). > >> Originally a specific SMMU device was needed to opt in for MSI reserved >> region ACPI IORT description which are not needed if you don't rely on >> S1+S2. However if we don't rely on this trick this was not even needed >> with legacy integration >> (https://patchwork.kernel.org/project/qemu-devel/cover/20180921081819.9203-1-eric.auger@redhat.com/). >> >> Nevertheless I don't think anything prevents the acceleration granted >> device from also working with virtio/vhost devices for instance unless >> you unplug the existing infra. The translation and invalidation just >> should use different control paths (explicit translation requests, >> invalidation notifications towards vhost, ...). > > smmuv3_translate() is per sdev, so it's easy. > > Invalidation is done via commands, which could be tricky: > a) Broadcast command > b) ASID validation -- we'll need to keep track of a list of ASIDs > for vfio device to compare the ASID in each per-ASID command, > potentially by trapping all CFGI_CD(_ALL) commands? Note that > each vfio device may have multiple ASIDs (for multiple CDs). > Either a or b above will have some validation efficiency impact. > >> Again, what does legitimate to have different qemu devices for the same >> IP? I understand that it simplifies the implementation but I am not sure >> this is a good reason. Nevertheless it worth challenging. What is the >> plan for intel iommu? Will we have 2 devices, the legacy device and one >> for nested? > > Hmm, it seems that there are two different topics: > 1. Use one SMMU device model (source code file; "iommu=" string) > for both an emulated vSMMU and an HW-accelerated vSMMU. > 2. Allow one vSMMU instance to work with both an emulated device > and a passthrough device. > And I get that you want both 1 and 2. > > I'm totally okay with 1, yet see no compelling benefit from 2 for > the increased complexity in the invalidation routine. > > And another question about the mixed device attachment. Let's say > we have in the host: > VFIO passthrough dev0 -> pSMMU0 > VFIO passthrough dev1 -> pSMMU1 > Should we allow emulated devices to be flexibly plugged? 
> dev0 -> vSMMU0 /* Hard requirement */ > dev1 -> vSMMU1 /* Hard requirement */ > emu0 -> vSMMU0 /* Soft requirement; can be vSMMU1 also */ > emu1 -> vSMMU1 /* Soft requirement; can be vSMMU0 also */ > > Thanks > Nicolin > I agree w/Jason & Nicolin: different vSMMUs for pass-through devices than emulated, & vice-versa. Not mixing... because... of the next agreement: I agree with Eric that 'accel' isn't needed -- this should be ascertained from the pSMMU that a physical device is attached to. Now... how does the vfio (?; why not qemu?) layer determine that? -- where are SMMUv3 'accel' features exposed: either a) in the device struct (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't find anything under either on my g-h system, but would appreciate a ptr if there is. And like Eric, although 'accel' is better than the original 'nested', it's non-obvious what accel feature(s) are being turned on, or not. In fact, if broken accel hw occurs ('if' -> 'when'), how should it be turned off? ... If the info is in the kernel, a kernel boot-param will be needed; if in sysfs, writing 0 to an enable (to disable it) may be an alternative as well. Bottom line: we need (a) a way to ascertain the accel feature and (b) a way to disable it when it is broken, so qemu's smmuv3 spec will 'just work'. [This may also help when migrating from a machine that has accel working to one that does not.] ... and when an emulated device is assigned a vSMMU, there are no accel features ... unless we have tunables like batch iotlb invalidation for perf reasons, which can be viewed as an 'accel' option.
On 3/18/25 10:22 PM, Donald Dutile wrote: > > > On 3/18/25 3:13 PM, Nicolin Chen wrote: >> On Tue, Mar 18, 2025 at 07:31:36PM +0100, Eric Auger wrote: >>> On 3/17/25 9:19 PM, Nicolin Chen wrote: >>>> On Mon, Mar 17, 2025 at 04:24:53PM -0300, Jason Gunthorpe wrote: >>>>> On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote: >>>>>> Another question: how does an emulated device work with a vSMMUv3? >>>>>> I could imagine that all the accel steps would be bypassed since >>>>>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>>>>> so we will need to flush the iotlb, which will increase complexity >>>>>> as the TLBI command dispatching function will need to be aware what >>>>>> ASID is for emulated device and what is for vfio device.. >>>>> I think you should block it. We already expect different vSMMU's >>>>> depending on the physical SMMU under the PCI device, it makes sense >>>>> that a SW VFIO device would have it's own, non-accelerated, vSMMU >>>>> model in the guest. >>>> Yea, I agree and it'd be cleaner for an implementation separating >>>> them. >>>> >>>> In my mind, the general idea of "accel=on" is also to keep things >>>> in a more efficient way: passthrough devices go to HW-accelerated >>>> vSMMUs (separated PCIE buses), while emulated ones go to a vSMMU- >>>> bypassed (PCIE0). >> >>> Originally a specific SMMU device was needed to opt in for MSI reserved >>> region ACPI IORT description which are not needed if you don't rely on >>> S1+S2. However if we don't rely on this trick this was not even needed >>> with legacy integration >>> (https://patchwork.kernel.org/project/qemu-devel/cover/20180921081819.9203-1-eric.auger@redhat.com/). >>> >>> >>> Nevertheless I don't think anything prevents the acceleration granted >>> device from also working with virtio/vhost devices for instance unless >>> you unplug the existing infra. The translation and invalidation just >>> should use different control paths (explicit translation requests, >>> invalidation notifications towards vhost, ...). >> >> smmuv3_translate() is per sdev, so it's easy. >> >> Invalidation is done via commands, which could be tricky: >> a) Broadcast command >> b) ASID validation -- we'll need to keep track of a list of ASIDs >> for vfio device to compare the ASID in each per-ASID command, >> potentially by trapping all CFGI_CD(_ALL) commands? Note that >> each vfio device may have multiple ASIDs (for multiple CDs). >> Either a or b above will have some validation efficiency impact. >> >>> Again, what does legitimate to have different qemu devices for the same >>> IP? I understand that it simplifies the implementation but I am not >>> sure >>> this is a good reason. Nevertheless it worth challenging. What is the >>> plan for intel iommu? Will we have 2 devices, the legacy device and one >>> for nested? >> >> Hmm, it seems that there are two different topics: >> 1. Use one SMMU device model (source code file; "iommu=" string) >> for both an emulated vSMMU and an HW-accelerated vSMMU. >> 2. Allow one vSMMU instance to work with both an emulated device >> and a passthrough device. >> And I get that you want both 1 and 2. >> >> I'm totally okay with 1, yet see no compelling benefit from 2 for >> the increased complexity in the invalidation routine. >> >> And another question about the mixed device attachment. Let's say >> we have in the host: >> VFIO passthrough dev0 -> pSMMU0 >> VFIO passthrough dev1 -> pSMMU1 >> Should we allow emulated devices to be flexibly plugged? 
>> dev0 -> vSMMU0 /* Hard requirement */ >> dev1 -> vSMMU1 /* Hard requirement */ >> emu0 -> vSMMU0 /* Soft requirement; can be vSMMU1 also */ >> emu1 -> vSMMU1 /* Soft requirement; can be vSMMU0 also */ >> >> Thanks >> Nicolin >> > I agree w/Jason & Nicolin: different vSMMUs for pass-through devices > than emulated, & vice-versa. > Not mixing... because... of the next agreement: you need to clarify what you mean by different vSMMUs: are you taking about different instances or different qemu device types? > > I agree with Eric that 'accel' isn't needed -- this should be > ascertained from the pSMMU that a physical device is attached to. we can simply use an AUTO_ON_OFF property and by default choose AUTO value. That would close the debate ;-) Eric > Now... how does vfio(?; why not qemu?) layer determine that? -- where > are SMMUv3 'accel' features exposed either: a) in the device struct > (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't find > anything under either on my g-h system, but would appreciate a ptr if > there is. > and like Eric, although 'accel' is better than the original 'nested', > it's non-obvious what accel feature(s) are being turned on, or not. > In fact, if broken accel hw occurs ('if' -> 'when'), how should it be > turned off? ... if info in the kernel, a kernel boot-param will be > needed; > if in sysfs, a write to 0 an enable(disable) it maybe an alternative > as well. > Bottom line: we need a way to (a) ascertain the accel feature (b) a > way to disable it when it is broken, > so qemu's smmuv3 spec will 'just work'. > [This may also help when migrating from a machine that has accel > working to one that does not.[ > > ... and when an emulated device is assigned a vSMMU, there are no > accel features ... unless we have tunables like batch iotlb > invalidation for perf reasons, which can be viewed as an 'accel' option. >
On 3/19/25 1:04 PM, Eric Auger wrote: > > > > On 3/18/25 10:22 PM, Donald Dutile wrote: >> >> >> On 3/18/25 3:13 PM, Nicolin Chen wrote: >>> On Tue, Mar 18, 2025 at 07:31:36PM +0100, Eric Auger wrote: >>>> On 3/17/25 9:19 PM, Nicolin Chen wrote: >>>>> On Mon, Mar 17, 2025 at 04:24:53PM -0300, Jason Gunthorpe wrote: >>>>>> On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote: >>>>>>> Another question: how does an emulated device work with a vSMMUv3? >>>>>>> I could imagine that all the accel steps would be bypassed since >>>>>>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>>>>>> so we will need to flush the iotlb, which will increase complexity >>>>>>> as the TLBI command dispatching function will need to be aware what >>>>>>> ASID is for emulated device and what is for vfio device.. >>>>>> I think you should block it. We already expect different vSMMU's >>>>>> depending on the physical SMMU under the PCI device, it makes sense >>>>>> that a SW VFIO device would have it's own, non-accelerated, vSMMU >>>>>> model in the guest. >>>>> Yea, I agree and it'd be cleaner for an implementation separating >>>>> them. >>>>> >>>>> In my mind, the general idea of "accel=on" is also to keep things >>>>> in a more efficient way: passthrough devices go to HW-accelerated >>>>> vSMMUs (separated PCIE buses), while emulated ones go to a vSMMU- >>>>> bypassed (PCIE0). >>> >>>> Originally a specific SMMU device was needed to opt in for MSI reserved >>>> region ACPI IORT description which are not needed if you don't rely on >>>> S1+S2. However if we don't rely on this trick this was not even needed >>>> with legacy integration >>>> (https://patchwork.kernel.org/project/qemu-devel/cover/20180921081819.9203-1-eric.auger@redhat.com/). >>>> >>>> >>>> Nevertheless I don't think anything prevents the acceleration granted >>>> device from also working with virtio/vhost devices for instance unless >>>> you unplug the existing infra. The translation and invalidation just >>>> should use different control paths (explicit translation requests, >>>> invalidation notifications towards vhost, ...). >>> >>> smmuv3_translate() is per sdev, so it's easy. >>> >>> Invalidation is done via commands, which could be tricky: >>> a) Broadcast command >>> b) ASID validation -- we'll need to keep track of a list of ASIDs >>> for vfio device to compare the ASID in each per-ASID command, >>> potentially by trapping all CFGI_CD(_ALL) commands? Note that >>> each vfio device may have multiple ASIDs (for multiple CDs). >>> Either a or b above will have some validation efficiency impact. >>> >>>> Again, what does legitimate to have different qemu devices for the same >>>> IP? I understand that it simplifies the implementation but I am not >>>> sure >>>> this is a good reason. Nevertheless it worth challenging. What is the >>>> plan for intel iommu? Will we have 2 devices, the legacy device and one >>>> for nested? >>> >>> Hmm, it seems that there are two different topics: >>> 1. Use one SMMU device model (source code file; "iommu=" string) >>> for both an emulated vSMMU and an HW-accelerated vSMMU. >>> 2. Allow one vSMMU instance to work with both an emulated device >>> and a passthrough device. >>> And I get that you want both 1 and 2. >>> >>> I'm totally okay with 1, yet see no compelling benefit from 2 for >>> the increased complexity in the invalidation routine. >>> >>> And another question about the mixed device attachment. 
Let's say >>> we have in the host: >>> VFIO passthrough dev0 -> pSMMU0 >>> VFIO passthrough dev1 -> pSMMU1 >>> Should we allow emulated devices to be flexibly plugged? >>> dev0 -> vSMMU0 /* Hard requirement */ >>> dev1 -> vSMMU1 /* Hard requirement */ >>> emu0 -> vSMMU0 /* Soft requirement; can be vSMMU1 also */ >>> emu1 -> vSMMU1 /* Soft requirement; can be vSMMU0 also */ >>> >>> Thanks >>> Nicolin >>> >> I agree w/Jason & Nicolin: different vSMMUs for pass-through devices >> than emulated, & vice-versa. >> Not mixing... because... of the next agreement: > you need to clarify what you mean by different vSMMUs: are you taking > about different instances or different qemu device types? Both. A device that needs to use a hw-accel feature has to use an smmu that has that feature; an emulated device can use such an smmu, but as mentioned in other threads, if you start with all emulated in one smmu, if you hot-plug an (assigned) device, it needs another smmu that has hw-accel features. Keeping them split makes it easier at config time, and it may enable the code to be simpler... but the other half of my brain wants common code paths with accel/emulate branches but a different smmu instance will likely simplify the smmu-(accel-)specific lookups. >> >> I agree with Eric that 'accel' isn't needed -- this should be >> ascertained from the pSMMU that a physical device is attached to. > we can simply use an AUTO_ON_OFF property and by default choose AUTO > value. That would close the debate ;-) > Preaching to the choir... yes. > Eric >> Now... how does vfio(?; why not qemu?) layer determine that? -- where >> are SMMUv3 'accel' features exposed either: a) in the device struct >> (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't find >> anything under either on my g-h system, but would appreciate a ptr if >> there is. >> and like Eric, although 'accel' is better than the original 'nested', >> it's non-obvious what accel feature(s) are being turned on, or not. >> In fact, if broken accel hw occurs ('if' -> 'when'), how should it be >> turned off? ... if info in the kernel, a kernel boot-param will be >> needed; >> if in sysfs, a write to 0 an enable(disable) it maybe an alternative >> as well. >> Bottom line: we need a way to (a) ascertain the accel feature (b) a >> way to disable it when it is broken, >> so qemu's smmuv3 spec will 'just work'. >> [This may also help when migrating from a machine that has accel >> working to one that does not.[ >> >> ... and when an emulated device is assigned a vSMMU, there are no >> accel features ... unless we have tunables like batch iotlb >> invalidation for perf reasons, which can be viewed as an 'accel' option. >> >
On 3/21/25 1:54 AM, Donald Dutile wrote: > > > On 3/19/25 1:04 PM, Eric Auger wrote: >> >> >> >> On 3/18/25 10:22 PM, Donald Dutile wrote: >>> >>> >>> On 3/18/25 3:13 PM, Nicolin Chen wrote: >>>> On Tue, Mar 18, 2025 at 07:31:36PM +0100, Eric Auger wrote: >>>>> On 3/17/25 9:19 PM, Nicolin Chen wrote: >>>>>> On Mon, Mar 17, 2025 at 04:24:53PM -0300, Jason Gunthorpe wrote: >>>>>>> On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote: >>>>>>>> Another question: how does an emulated device work with a vSMMUv3? >>>>>>>> I could imagine that all the accel steps would be bypassed since >>>>>>>> !sdev->idev. Yet, the emulated iotlb should cache its translation >>>>>>>> so we will need to flush the iotlb, which will increase complexity >>>>>>>> as the TLBI command dispatching function will need to be aware >>>>>>>> what >>>>>>>> ASID is for emulated device and what is for vfio device.. >>>>>>> I think you should block it. We already expect different vSMMU's >>>>>>> depending on the physical SMMU under the PCI device, it makes sense >>>>>>> that a SW VFIO device would have it's own, non-accelerated, vSMMU >>>>>>> model in the guest. >>>>>> Yea, I agree and it'd be cleaner for an implementation separating >>>>>> them. >>>>>> >>>>>> In my mind, the general idea of "accel=on" is also to keep things >>>>>> in a more efficient way: passthrough devices go to HW-accelerated >>>>>> vSMMUs (separated PCIE buses), while emulated ones go to a vSMMU- >>>>>> bypassed (PCIE0). >>>> >>>>> Originally a specific SMMU device was needed to opt in for MSI >>>>> reserved >>>>> region ACPI IORT description which are not needed if you don't >>>>> rely on >>>>> S1+S2. However if we don't rely on this trick this was not even >>>>> needed >>>>> with legacy integration >>>>> (https://patchwork.kernel.org/project/qemu-devel/cover/20180921081819.9203-1-eric.auger@redhat.com/). >>>>> >>>>> >>>>> >>>>> Nevertheless I don't think anything prevents the acceleration granted >>>>> device from also working with virtio/vhost devices for instance >>>>> unless >>>>> you unplug the existing infra. The translation and invalidation just >>>>> should use different control paths (explicit translation requests, >>>>> invalidation notifications towards vhost, ...). >>>> >>>> smmuv3_translate() is per sdev, so it's easy. >>>> >>>> Invalidation is done via commands, which could be tricky: >>>> a) Broadcast command >>>> b) ASID validation -- we'll need to keep track of a list of ASIDs >>>> for vfio device to compare the ASID in each per-ASID command, >>>> potentially by trapping all CFGI_CD(_ALL) commands? Note that >>>> each vfio device may have multiple ASIDs (for multiple CDs). >>>> Either a or b above will have some validation efficiency impact. >>>> >>>>> Again, what does legitimate to have different qemu devices for the >>>>> same >>>>> IP? I understand that it simplifies the implementation but I am not >>>>> sure >>>>> this is a good reason. Nevertheless it worth challenging. What is the >>>>> plan for intel iommu? Will we have 2 devices, the legacy device >>>>> and one >>>>> for nested? >>>> >>>> Hmm, it seems that there are two different topics: >>>> 1. Use one SMMU device model (source code file; "iommu=" string) >>>> for both an emulated vSMMU and an HW-accelerated vSMMU. >>>> 2. Allow one vSMMU instance to work with both an emulated device >>>> and a passthrough device. >>>> And I get that you want both 1 and 2. 
>>>> >>>> I'm totally okay with 1, yet see no compelling benefit from 2 for >>>> the increased complexity in the invalidation routine. >>>> >>>> And another question about the mixed device attachment. Let's say >>>> we have in the host: >>>> VFIO passthrough dev0 -> pSMMU0 >>>> VFIO passthrough dev1 -> pSMMU1 >>>> Should we allow emulated devices to be flexibly plugged? >>>> dev0 -> vSMMU0 /* Hard requirement */ >>>> dev1 -> vSMMU1 /* Hard requirement */ >>>> emu0 -> vSMMU0 /* Soft requirement; can be vSMMU1 also */ >>>> emu1 -> vSMMU1 /* Soft requirement; can be vSMMU0 also */ >>>> >>>> Thanks >>>> Nicolin >>>> >>> I agree w/Jason & Nicolin: different vSMMUs for pass-through devices >>> than emulated, & vice-versa. >>> Not mixing... because... of the next agreement: >> you need to clarify what you mean by different vSMMUs: are you taking >> about different instances or different qemu device types? > Both. a device needed to use hw-accel feature has to use an smmu that > has that feature; > an emulated device can use such an smmu, but as mentioned in other > threads, > if you start with all emulated in one smmu, if you hot-plug a > (assigned) device, > it needs another smmu that has hw-accel features. > Keeping them split makes it easier at config time, and it may enable > the code to be simpler... > but the other half of my brain wants common code paths with > accel/emulate branches but > a different smmu instance will like simplify the smmu-(accel-)specific > lookups. Yes I think we agree on the fact that several smmu instances are needed, especially for matching the underneath HW topology and for having a separate protection for emulated and host devices (esp with vCMD queues) Eric > >>> >>> I agree with Eric that 'accel' isn't needed -- this should be >>> ascertained from the pSMMU that a physical device is attached to. >> we can simply use an AUTO_ON_OFF property and by default choose AUTO >> value. That would close the debate ;-) >> > Preaching to the choir... yes. > >> Eric >>> Now... how does vfio(?; why not qemu?) layer determine that? -- where >>> are SMMUv3 'accel' features exposed either: a) in the device struct >>> (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't find >>> anything under either on my g-h system, but would appreciate a ptr if >>> there is. >>> and like Eric, although 'accel' is better than the original 'nested', >>> it's non-obvious what accel feature(s) are being turned on, or not. >>> In fact, if broken accel hw occurs ('if' -> 'when'), how should it be >>> turned off? ... if info in the kernel, a kernel boot-param will be >>> needed; >>> if in sysfs, a write to 0 an enable(disable) it maybe an alternative >>> as well. >>> Bottom line: we need a way to (a) ascertain the accel feature (b) a >>> way to disable it when it is broken, >>> so qemu's smmuv3 spec will 'just work'. >>> [This may also help when migrating from a machine that has accel >>> working to one that does not.[ >>> >>> ... and when an emulated device is assigned a vSMMU, there are no >>> accel features ... unless we have tunables like batch iotlb >>> invalidation for perf reasons, which can be viewed as an 'accel' >>> option. >>> >> >
On Tue, Mar 18, 2025 at 05:22:51PM -0400, Donald Dutile wrote: > I agree with Eric that 'accel' isn't needed -- this should be > ascertained from the pSMMU that a physical device is attached to. I seem to remember the point was made that we don't actually know if accel is possible, or desired, especially in the case of hotplug. The accelerated mode has a number of limitations that the software mode does not have. I think it does make sense that the user would deliberately choose to use a more restrictive operating mode and then would have to meet the requirements - eg by creating the required number and configuration of vSMMUs. > Now... how does vfio(?; why not qemu?) layer determine that? -- > where are SMMUv3 'accel' features exposed either: a) in the device > struct (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't > find anything under either on my g-h system, but would appreciate a > ptr if there is. I think it is not discoverable yet other than through try-and-fail. Discoverability would probably be some bits in an iommufd GET_INFO ioctl or something like that. > and like Eric, although 'accel' is better than the > original 'nested', it's non-obvious what accel feature(s) are being > turned on, or not. There is really only one accel feature - direct HW usage of the IO Page table in the guest (no shadowing). A secondary addon would be direct HW usage of an invalidation queue in the guest. > kernel boot-param will be needed; if in sysfs, a write to 0 an > enable(disable) it maybe an alternative as well. Bottom line: we > need a way to (a) ascertain the accel feature (b) a way to disable > it when it is broken, so qemu's smmuv3 spec will 'just work'. You'd turn it off by not asking qemu to use it, that is sort of the reasoning behind the command line opt in for accel or not. Jason
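For reference, the discoverability mentioned here roughly maps onto the existing IOMMU_GET_HW_INFO ioctl, which can already return the raw SMMUv3 ID registers for a device bound to iommufd. A rough sketch follows, assuming a recent kernel's <linux/iommufd.h>; the exact field layout should be double-checked against the headers actually used, and the bind/attach steps that produce dev_id are not shown:

/*
 * Query the physical SMMUv3's ID registers through iommufd. The caller
 * is assumed to have opened /dev/iommu and bound the VFIO device to it,
 * obtaining dev_id (not shown here).
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int query_smmu_hw_info(int iommufd, __u32 dev_id)
{
    struct iommu_hw_info_arm_smmuv3 smmu = {};
    struct iommu_hw_info info = {
        .size = sizeof(info),
        .dev_id = dev_id,
        .data_len = sizeof(smmu),
        .data_uptr = (__u64)(unsigned long)&smmu,
    };

    if (ioctl(iommufd, IOMMU_GET_HW_INFO, &info)) {
        perror("IOMMU_GET_HW_INFO");
        return -1;
    }
    if (info.out_data_type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) {
        fprintf(stderr, "not an SMMUv3 (type %u)\n", info.out_data_type);
        return -1;
    }
    /* e.g. IDR0 tells us whether stage-1 (guest-owned page tables) is usable */
    printf("IDR0=0x%x IIDR=0x%x AIDR=0x%x\n", smmu.idr[0], smmu.iidr, smmu.aidr);
    return 0;
}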
Hi, On 3/19/25 1:23 AM, Jason Gunthorpe wrote: > On Tue, Mar 18, 2025 at 05:22:51PM -0400, Donald Dutile wrote: > >> I agree with Eric that 'accel' isn't needed -- this should be >> ascertained from the pSMMU that a physical device is attached to. > I seem to remember the point was made that we don't actually know if > accel is possible, or desired, especially in the case of hotplug. That's why I think it would be better if we could instantiate a single type of device that can do both accel and non-accel modes. Maybe that would be at the price of always enforcing MSI resv regions on the guest to assure MSI nesting is possible. > > The accelerated mode has a number of limitations that the software > mode does not have. I think it does make sense that the user would > deliberately choose to use a more restrictive operating mode and then > would have to meet the requirements - eg by creating the required > number and configuration of vSMMUs. To avoid any misunderstanding, I am not pushing for having a single vSMMU instance. I advocate for having several instances, each somehow specialized for VFIO devices or emulated devices. Maybe we can opt-in with accel=on but the default could be auto (the property can be AUTO_ON_OFF) where the code detects if a VFIO device is translated. In case incompatible devices are translated by the same vSMMU instance, I guess it could be detected and will fail. What I am pushing for is to have a single type of QEMU device which can do both accel and non accel. > In general I advocate for having several vSMMU instances, each of them > >> Now... how does vfio(?; why not qemu?) layer determine that? -- >> where are SMMUv3 'accel' features exposed either: a) in the device >> struct (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't >> find anything under either on my g-h system, but would appreciate a >> ptr if there is. > I think it is not discoverable yet other thatn through > try-and-fail. Discoverability would probably be some bits in an > iommufd GET_INFO ioctl or something like that. Yeah, but at least we can easily detect if a VFIO device is being translated by a vSMMU instance, in which case there is no other choice but to turn accel on. Thanks Eric > >> and like Eric, although 'accel' is better than the >> original 'nested', it's non-obvious what accel feature(s) are being >> turned on, or not. > There are really only one accel feature - direct HW usage of the IO > Page table in the guest (no shadowing). > > A secondary addon would be direct HW usage of an invalidation queue in > the guest. > >> kernel boot-param will be needed; if in sysfs, a write to 0 an >> enable(disable) it maybe an alternative as well. Bottom line: we >> need a way to (a) ascertain the accel feature (b) a way to disable >> it when it is broken, so qemu's smmuv3 spec will 'just work'. > You'd turned it off by not asking qemu to use it, that is sort of the > reasoning behind the command line opt in for accel or not. > > Jason >
On 3/19/25 1:00 PM, Eric Auger wrote: > Hi, > > > On 3/19/25 1:23 AM, Jason Gunthorpe wrote: >> On Tue, Mar 18, 2025 at 05:22:51PM -0400, Donald Dutile wrote: >> >>> I agree with Eric that 'accel' isn't needed -- this should be >>> ascertained from the pSMMU that a physical device is attached to. >> I seem to remember the point was made that we don't actually know if >> accel is possible, or desired, especially in the case of hotplug. > that's why I think it would be better if we could instantiate a single > type of device that can do both accel and non accel mode. > Maybe that would be at the price of always enforcing MSI resv regions on > guest to assure MSI nesting is possible. > >> >> The accelerated mode has a number of limitations that the software >> mode does not have. I think it does make sense that the user would >> deliberately choose to use a more restrictive operating mode and then >> would have to meet the requirements - eg by creating the required >> number and configuration of vSMMUs. > To avoid any misunderstanding I am not pushing for have a single vSMMU > instance. I advocate for having several instances, each somehow > specialized for VFIO devices or emulated devices. Maybe we can opt-in > with accel=on but the default could be auto (the property can be > AUTO_ON_OFF) where the code detects if a VFIO device is translated.In > case incompatible devices are translated into a same vSMMU instance I > guess it could be detected and will fail. > > What I am pusshing for is to have a single type of QEMU device which can > do both accel and non accel. +1 ! >> In general I advocate for having several vSMMU instances, each of them >> >>> Now... how does vfio(?; why not qemu?) layer determine that? -- >>> where are SMMUv3 'accel' features exposed either: a) in the device >>> struct (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't >>> find anything under either on my g-h system, but would appreciate a >>> ptr if there is. >> I think it is not discoverable yet other thatn through >> try-and-fail. Discoverability would probably be some bits in an >> iommufd GET_INFO ioctl or something like that. > yeah but at least we can easily detect if a VFIO device is beeing > translated by a vSMMU instance in which case there is no other choice to > turn accel on. > > Thanks > > Eric >> >>> and like Eric, although 'accel' is better than the >>> original 'nested', it's non-obvious what accel feature(s) are being >>> turned on, or not. >> There are really only one accel feature - direct HW usage of the IO >> Page table in the guest (no shadowing). >> >> A secondary addon would be direct HW usage of an invalidation queue in >> the guest. >> >>> kernel boot-param will be needed; if in sysfs, a write to 0 an >>> enable(disable) it maybe an alternative as well. Bottom line: we >>> need a way to (a) ascertain the accel feature (b) a way to disable >>> it when it is broken, so qemu's smmuv3 spec will 'just work'. >> You'd turned it off by not asking qemu to use it, that is sort of the >> reasoning behind the command line opt in for accel or not. >> >> Jason >> >
> -----Original Message----- > From: Eric Auger <eric.auger@redhat.com> > Sent: Wednesday, March 19, 2025 5:01 PM > To: Jason Gunthorpe <jgg@nvidia.com>; Donald Dutile > <ddutile@redhat.com> > Cc: Nicolin Chen <nicolinc@nvidia.com>; Shameerali Kolothum Thodi > <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org; > qemu-devel@nongnu.org; peter.maydell@linaro.org; > berrange@redhat.com; nathanc@nvidia.com; mochs@nvidia.com; > smostafa@google.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B) > <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>; > Jonathan Cameron <jonathan.cameron@huawei.com>; > zhangfei.gao@linaro.org > Subject: Re: [RFC PATCH v2 03/20] hw/arm/smmuv3-accel: Add initial > infrastructure for smmuv3-accel device > > Hi, > > > On 3/19/25 1:23 AM, Jason Gunthorpe wrote: > > On Tue, Mar 18, 2025 at 05:22:51PM -0400, Donald Dutile wrote: > > > >> I agree with Eric that 'accel' isn't needed -- this should be > >> ascertained from the pSMMU that a physical device is attached to. > > I seem to remember the point was made that we don't actually know if > > accel is possible, or desired, especially in the case of hotplug. > that's why I think it would be better if we could instantiate a single > type of device that can do both accel and non accel mode. > Maybe that would be at the price of always enforcing MSI resv regions on > guest to assure MSI nesting is possible. > > > > > The accelerated mode has a number of limitations that the software > > mode does not have. I think it does make sense that the user would > > deliberately choose to use a more restrictive operating mode and then > > would have to meet the requirements - eg by creating the required > > number and configuration of vSMMUs. > To avoid any misunderstanding I am not pushing for have a single vSMMU > instance. I advocate for having several instances, each somehow > specialized for VFIO devices or emulated devices. Maybe we can opt-in > with accel=on but the default could be auto (the property can be > AUTO_ON_OFF) where the code detects if a VFIO device is translated.In > case incompatible devices are translated into a same vSMMU instance I > guess it could be detected and will fail. > > What I am pusshing for is to have a single type of QEMU device which can > do both accel and non accel. > > In general I advocate for having several vSMMU instances, each of them > > > >> Now... how does vfio(?; why not qemu?) layer determine that? -- > >> where are SMMUv3 'accel' features exposed either: a) in the device > >> struct (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't > >> find anything under either on my g-h system, but would appreciate a > >> ptr if there is. > > I think it is not discoverable yet other thatn through > > try-and-fail. Discoverability would probably be some bits in an > > iommufd GET_INFO ioctl or something like that. > yeah but at least we can easily detect if a VFIO device is beeing > translated by a vSMMU instance in which case there is no other choice to > turn accel on. Not sure, how you can handle hotplug in such a case? For example if the smmuv3 dev starts with an emulated device and later try plug a vfio dev? In case of "accel" the feature bits(IIDR) is queried from the host SMMUv3 and is presented to to the vSMMU(See patch #16). We can't do this once Guest is booted. Also Daniel previously commented on RFCv1 that he would like to have explicit vSMMU<-->pSMMU association in Qemu command line. 
https://lore.kernel.org/qemu-devel/Z6TLSdwgajmHVmGH@redhat.com/ Though we are not there yet without a cold-plugged VFIO dev at the moment, having auto detection of accel is not the right approach if we want an explicit association in Qemu command line. Thanks, Shameer
On 3/19/25 6:12 PM, Shameerali Kolothum Thodi wrote: > >> -----Original Message----- >> From: Eric Auger <eric.auger@redhat.com> >> Sent: Wednesday, March 19, 2025 5:01 PM >> To: Jason Gunthorpe <jgg@nvidia.com>; Donald Dutile >> <ddutile@redhat.com> >> Cc: Nicolin Chen <nicolinc@nvidia.com>; Shameerali Kolothum Thodi >> <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org; >> qemu-devel@nongnu.org; peter.maydell@linaro.org; >> berrange@redhat.com; nathanc@nvidia.com; mochs@nvidia.com; >> smostafa@google.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B) >> <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>; >> Jonathan Cameron <jonathan.cameron@huawei.com>; >> zhangfei.gao@linaro.org >> Subject: Re: [RFC PATCH v2 03/20] hw/arm/smmuv3-accel: Add initial >> infrastructure for smmuv3-accel device >> >> Hi, >> >> >> On 3/19/25 1:23 AM, Jason Gunthorpe wrote: >>> On Tue, Mar 18, 2025 at 05:22:51PM -0400, Donald Dutile wrote: >>> >>>> I agree with Eric that 'accel' isn't needed -- this should be >>>> ascertained from the pSMMU that a physical device is attached to. >>> I seem to remember the point was made that we don't actually know if >>> accel is possible, or desired, especially in the case of hotplug. >> that's why I think it would be better if we could instantiate a single >> type of device that can do both accel and non accel mode. >> Maybe that would be at the price of always enforcing MSI resv regions on >> guest to assure MSI nesting is possible. >> >>> The accelerated mode has a number of limitations that the software >>> mode does not have. I think it does make sense that the user would >>> deliberately choose to use a more restrictive operating mode and then >>> would have to meet the requirements - eg by creating the required >>> number and configuration of vSMMUs. >> To avoid any misunderstanding I am not pushing for have a single vSMMU >> instance. I advocate for having several instances, each somehow >> specialized for VFIO devices or emulated devices. Maybe we can opt-in >> with accel=on but the default could be auto (the property can be >> AUTO_ON_OFF) where the code detects if a VFIO device is translated.In >> case incompatible devices are translated into a same vSMMU instance I >> guess it could be detected and will fail. >> >> What I am pusshing for is to have a single type of QEMU device which can >> do both accel and non accel. >>> In general I advocate for having several vSMMU instances, each of them >>> >>>> Now... how does vfio(?; why not qemu?) layer determine that? -- >>>> where are SMMUv3 'accel' features exposed either: a) in the device >>>> struct (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't >>>> find anything under either on my g-h system, but would appreciate a >>>> ptr if there is. >>> I think it is not discoverable yet other thatn through >>> try-and-fail. Discoverability would probably be some bits in an >>> iommufd GET_INFO ioctl or something like that. >> yeah but at least we can easily detect if a VFIO device is beeing >> translated by a vSMMU instance in which case there is no other choice to >> turn accel on. > Not sure, how you can handle hotplug in such a case? For example if the smmuv3 > dev starts with an emulated device and later try plug a vfio dev? In case of "accel" > the feature bits(IIDR) is queried from the host SMMUv3 and is presented to > to the vSMMU(See patch #16). We can't do this once Guest is booted. 
If accel=auto and the smmu is attached to a bus where only emulated devices are plugged, at cold start accel=false, and then it effectively becomes impossible to hotplug a vfio device. If accel=auto and the smmu is attached to a bus where a VFIO-PCI device is cold-plugged, we end up with accel=on forced. Otherwise you always have the possibility to opt in for accel with accel=true, just like intel_iommu has the caching_mode option. > > Also Daniel previously commented on RFCv1 that he would like to have explicit > vSMMU<-->pSMMU association in Qemu command line. > https://lore.kernel.org/qemu-devel/Z6TLSdwgajmHVmGH@redhat.com/ tbh I did not understand why this explicit setting was needed and why it can't be inferred from the HostIOMMUDevice. But I need to read this back. > > Though we are not there yet without a cold-plugged VFIO dev at the moment, > having auto detection of accel is not the right approach if we want an explicit > association in Qemu command line. Maybe we shall not focus too much on auto detection at the moment. Eric > > Thanks, > Shameer
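A minimal sketch of the AUTO_ON_OFF idea, reusing QEMU's existing OnOffAuto property machinery; the accel_prop field, the helper name and the vfio check are assumptions for illustration, not code from this series:

/*
 * Sketch only: turn the "accel" bool added by this series into an
 * OnOffAuto property, resolved when the first endpoint is attached.
 * SMMUState::accel_prop and smmu_resolve_accel() are hypothetical.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/qdev-properties.h"
#include "hw/arm/smmu-common.h"

static const Property smmu_dev_properties[] = {
    /* ... existing properties ... */
    DEFINE_PROP_ON_OFF_AUTO("accel", SMMUState, accel_prop, ON_OFF_AUTO_AUTO),
};

/* Called when a device is plugged behind this SMMU */
static void smmu_resolve_accel(SMMUState *s, bool dev_is_vfio, Error **errp)
{
    if (s->accel_prop == ON_OFF_AUTO_AUTO) {
        /* auto: first attached endpoint decides */
        s->accel_prop = dev_is_vfio ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
    } else if (s->accel_prop == ON_OFF_AUTO_OFF && dev_is_vfio) {
        error_setg(errp, "vfio-pci device behind a non-accelerated SMMU");
    }
}

With such a property, accel=auto could resolve to on or off when the first endpoint shows up, while an explicit accel=on/off preserves the deliberate command-line opt-in discussed above.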
Jason, Hey! On 3/18/25 8:23 PM, Jason Gunthorpe wrote: > On Tue, Mar 18, 2025 at 05:22:51PM -0400, Donald Dutile wrote: > >> I agree with Eric that 'accel' isn't needed -- this should be >> ascertained from the pSMMU that a physical device is attached to. > > I seem to remember the point was made that we don't actually know if > accel is possible, or desired, especially in the case of hotplug. > In the case of hw-passthrough hot-plug, what isn't known?: a) domain:b:d.f is known b) thus its hierarchy and SMMUv3 association in the host is known c) thus, if the (accel) features of the SMMUv3 were exposed (known), then the proper setup (separate SMMUv3 vs system-wide-emulated SMMUv3; association of (allocated/configured) vSMMUv3 to pSMMUv3 would be known/made What else is missing? > The accelerated mode has a number of limitations that the software > mode does not have. I think it does make sense that the user would > deliberately choose to use a more restrictive operating mode and then > would have to meet the requirements - eg by creating the required > number and configuration of vSMMUs. > At a qemu-cmd level, the right number & config of smmuv3's, but libvirt, if it had the above info, could auto-generate the right number of smmuv3's (stages, accel-features, etc.) ... just as it does today in creating the right number of pcie bus's, RPs, etc. from simple(r) device specs into more complete, qemu configs. >> Now... how does vfio(?; why not qemu?) layer determine that? -- >> where are SMMUv3 'accel' features exposed either: a) in the device >> struct (for the smmuv3) or (b) somewhere under sysfs? ... I couldn't >> find anything under either on my g-h system, but would appreciate a >> ptr if there is. > > I think it is not discoverable yet other thatn through > try-and-fail. Discoverability would probably be some bits in an > iommufd GET_INFO ioctl or something like that. > I don't see how iommufd would 'get-info' the needed info any better than any other interface/subsystem. ... >> and like Eric, although 'accel' is better than the >> original 'nested', it's non-obvious what accel feature(s) are being >> turned on, or not. > > There are really only one accel feature - direct HW usage of the IO > Page table in the guest (no shadowing). > > A secondary addon would be direct HW usage of an invalidation queue in > the guest. > and, if architected correctly, even in (device-specific) sw-provided tables, it could be 'formatted' in a way that it was discoverable by the appropriate layers (libvirt, qemu). Once discoverable, this whole separate accel device -- which is really an attribute of an SMMUv3 -- can be generalized, and reduced, to a much smaller, simpler, sw footprint, with the concept of callbacks (as the series uses) to enable hw accelerators to perform the shadow-ops that fully-emulated smmuv3 would have to do. >> kernel boot-param will be needed; if in sysfs, a write to 0 an >> enable(disable) it maybe an alternative as well. Bottom line: we >> need a way to (a) ascertain the accel feature (b) a way to disable >> it when it is broken, so qemu's smmuv3 spec will 'just work'. > > You'd turned it off by not asking qemu to use it, that is sort of the > reasoning behind the command line opt in for accel or not. It would make machine-level definitions far more portable if the working/non-working, and the one-accel, or two-accel, or three-accel, or ... 
features were dynamically determined vs a static (qemu) machine config, that would have to be manipulated each time it ran on a different machine. e.g., cluster sw scans servers for machines with device-X. create VMs, assigning some/all of device-X to a VM via its own smmuv3. done. Now, if the smmuv3 features were exposed all the way up to userspace, then one could argue the cluster sw could scan for those features and add it to the accel=x,y,z option of the smmuv3 associated with an assigned device. potato/po-tah-toe cluster sw or libvirt or qemu or <something-else> scans/reads ... discoverability of the features has to be done by (a) a computer, or (b) an error-prone human. ... all that AI gone to waste ... ;-) - Don > > Jason >
> -----Original Message----- > From: Nicolin Chen <nicolinc@nvidia.com> > Sent: Monday, March 17, 2025 8:19 PM > To: Jason Gunthorpe <jgg@nvidia.com> > Cc: Eric Auger <eric.auger@redhat.com>; Shameerali Kolothum Thodi > <shameerali.kolothum.thodi@huawei.com>; qemu-arm@nongnu.org; > qemu-devel@nongnu.org; peter.maydell@linaro.org; ddutile@redhat.com; > berrange@redhat.com; nathanc@nvidia.com; mochs@nvidia.com; > smostafa@google.com; Linuxarm <linuxarm@huawei.com>; Wangzhou (B) > <wangzhou1@hisilicon.com>; jiangkunkun <jiangkunkun@huawei.com>; > Jonathan Cameron <jonathan.cameron@huawei.com>; > zhangfei.gao@linaro.org > Subject: Re: [RFC PATCH v2 03/20] hw/arm/smmuv3-accel: Add initial > infrastructure for smmuv3-accel device > > On Mon, Mar 17, 2025 at 04:24:53PM -0300, Jason Gunthorpe wrote: > > On Mon, Mar 17, 2025 at 12:10:19PM -0700, Nicolin Chen wrote: > > > Another question: how does an emulated device work with a > vSMMUv3? > > > I could imagine that all the accel steps would be bypassed since > > > !sdev->idev. Yet, the emulated iotlb should cache its translation > > > so we will need to flush the iotlb, which will increase complexity > > > as the TLBI command dispatching function will need to be aware what > > > ASID is for emulated device and what is for vfio device.. > > > > I think you should block it. We already expect different vSMMU's > > depending on the physical SMMU under the PCI device, it makes sense > > that a SW VFIO device would have it's own, non-accelerated, vSMMU > > model in the guest. > > Yea, I agree and it'd be cleaner for an implementation separating > them. > > In my mind, the general idea of "accel=on" is also to keep things > in a more efficient way: passthrough devices go to HW-accelerated > vSMMUs (separated PCIE buses), while emulated ones go to a vSMMU- > bypassed (PCIE0). > > Though I do see the point from QEMU prospective that user may want > to start a VM with HW-accelerated vSMMU for one passthrough device > using a simple setup without caring about the routing via command. For now we don't use iotlb for accel cases with emulated devices. So probably can document/warn the user about possible performance degradation if they attach such a device rather than blocking. Thanks, Shameer
On Tue, Mar 11, 2025 at 02:10:28PM +0000, Shameer Kolothum wrote: > +/* > + * Copyright (c) 2025 Huawei Technologies R & D (UK) Ltd > + * Copyright (C) 2025 NVIDIA + * Copyright (C) 2025 NVIDIA CORPORATION & AFFILIATES > + * Written by Nicolin Chen, Shameer Kolothum (Thanks for adding my name!) > struct SMMUBaseClass { > diff --git a/include/hw/arm/smmuv3-accel.h b/include/hw/arm/smmuv3-accel.h > new file mode 100644 > index 0000000000..56fe376bf4 > --- /dev/null > +++ b/include/hw/arm/smmuv3-accel.h > @@ -0,0 +1,31 @@ > +/* > + * Copyright (c) 2025 Huawei Technologies R & D (UK) Ltd > + * Copyright (C) 2025 NVIDIA Ditto > + * Written by Nicolin Chen, Shameer Kolothum > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#ifndef HW_ARM_SMMUV3_ACCEL_H > +#define HW_ARM_SMMUV3_ACCEL_H > + > +#include "hw/arm/smmu-common.h" > +#include "hw/arm/smmuv3.h" > +#include "qom/object.h" smmuv3.h seems to include smmu-common.h and object.h already. Nicolin
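If that observation holds, the header could shrink to a single include; a sketch, assuming "hw/arm/smmuv3.h" really does pull in smmu-common.h and qom/object.h transitively (worth double-checking before dropping them):

#ifndef HW_ARM_SMMUV3_ACCEL_H
#define HW_ARM_SMMUV3_ACCEL_H

#include "hw/arm/smmuv3.h"

/* SMMUv3AccelState/SMMUv3AccelClass declarations stay unchanged */

#endif /* HW_ARM_SMMUV3_ACCEL_H */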