When the guest reboots with devices in nested mode (S1 + S2), any QEMU/UEFI
access to those devices can fail because S1 translation is not valid during
the reboot. For example, a passthrough NVMe device may hold GRUB boot info
that UEFI tries to read during the reboot.
Set S1 to bypass mode during reset to avoid such failures.
Reported-by: Matthew R. Ochs <mochs@nvidia.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 29 +++++++++++++++++++++++++++++
hw/arm/smmuv3-accel.h | 4 ++++
hw/arm/smmuv3.c | 1 +
3 files changed, 34 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index defeddbd8c..8396053a6c 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -634,6 +634,35 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
.get_msi_address_space = smmuv3_accel_find_msi_as,
};
+/*
+ * If the guest reboots and devices are configured for S1+S2, Stage1 must
+ * be switched to bypass. Otherwise, QEMU/UEFI may fail when accessing a
+ * device, e.g. when UEFI retrieves boot partition information from an
+ * assigned vfio-pci NVMe device.
+ */
+void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s)
+{
+ SMMUv3AccelDevice *accel_dev;
+ SMMUViommu *viommu;
+
+ if (!s->accel || !s->s_accel->viommu) {
+ return;
+ }
+
+ viommu = s->s_accel->viommu;
+ QLIST_FOREACH(accel_dev, &viommu->device_list, next) {
+ if (!accel_dev->vdev) {
+ continue;
+ }
+ if (!host_iommu_device_iommufd_attach_hwpt(accel_dev->idev,
+ viommu->bypass_hwpt_id,
+ NULL)) {
+ error_report("Failed to install bypass hwpt id %u for dev id %u",
+ viommu->bypass_hwpt_id, accel_dev->idev->devid);
+ }
+ }
+}
+
void smmuv3_accel_init(SMMUv3State *s)
{
SMMUState *bs = ARM_SMMU(s);
diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
index 3bdba47616..75f858e34a 100644
--- a/hw/arm/smmuv3-accel.h
+++ b/hw/arm/smmuv3-accel.h
@@ -48,6 +48,7 @@ bool smmuv3_accel_install_nested_ste_range(SMMUv3State *s, SMMUSIDRange *range,
Error **errp);
bool smmuv3_accel_issue_inv_cmd(SMMUv3State *s, void *cmd, SMMUDevice *sdev,
Error **errp);
+void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s);
#else
static inline void smmuv3_accel_init(SMMUv3State *s)
{
@@ -70,6 +71,9 @@ smmuv3_accel_issue_inv_cmd(SMMUv3State *s, void *cmd, SMMUDevice *sdev,
{
return true;
}
+static inline void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s)
+{
+}
#endif
#endif /* HW_ARM_SMMUV3_ACCEL_H */
diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
index 5830cf5a03..94b2bbc374 100644
--- a/hw/arm/smmuv3.c
+++ b/hw/arm/smmuv3.c
@@ -1913,6 +1913,7 @@ static void smmu_reset_exit(Object *obj, ResetType type)
if (c->parent_phases.exit) {
c->parent_phases.exit(obj, type);
}
+ smmuv3_accel_attach_bypass_hwpt(s);
}
static void smmu_realize(DeviceState *d, Error **errp)
--
2.43.0
On 9/29/25 3:36 PM, Shameer Kolothum wrote:
> When the guest reboots with devices in nested mode (S1 + S2), any QEMU/UEFI
> access to those devices can fail because S1 translation is not valid during
> the reboot. For example, a passthrough NVMe device may hold GRUB boot info
> that UEFI tries to read during the reboot.
>
> Set S1 to bypass mode during reset to avoid such failures.
>
> Reported-by: Matthew R. Ochs <mochs@nvidia.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/arm/smmuv3-accel.c | 29 +++++++++++++++++++++++++++++
> hw/arm/smmuv3-accel.h | 4 ++++
> hw/arm/smmuv3.c | 1 +
> 3 files changed, 34 insertions(+)
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> index defeddbd8c..8396053a6c 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -634,6 +634,35 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
> .get_msi_address_space = smmuv3_accel_find_msi_as,
> };
>
> +/*
> + * If the guest reboots and devices are configured for S1+S2, Stage1 must
> + * be switched to bypass. Otherwise, QEMU/UEFI may fail when accessing a
> + * device, e.g. when UEFI retrieves boot partition information from an
> + * assigned vfio-pci NVMe device.
> + */
> +void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s)
> +{
> + SMMUv3AccelDevice *accel_dev;
> + SMMUViommu *viommu;
> +
> + if (!s->accel || !s->s_accel->viommu) {
> + return;
> + }
> +
> + viommu = s->s_accel->viommu;
> + QLIST_FOREACH(accel_dev, &viommu->device_list, next) {
> + if (!accel_dev->vdev) {
> + continue;
> + }
> + if (!host_iommu_device_iommufd_attach_hwpt(accel_dev->idev,
> + viommu->bypass_hwpt_id,
> + NULL)) {
I would prefer we pass a proper local_err, add the hint below and then
report the concatenated error.
Eric
> + error_report("Failed to install bypass hwpt id %u for dev id %u",
> + viommu->bypass_hwpt_id, accel_dev->idev->devid);
> + }
> + }
> +}
> +
> void smmuv3_accel_init(SMMUv3State *s)
> {
> SMMUState *bs = ARM_SMMU(s);
> diff --git a/hw/arm/smmuv3-accel.h b/hw/arm/smmuv3-accel.h
> index 3bdba47616..75f858e34a 100644
> --- a/hw/arm/smmuv3-accel.h
> +++ b/hw/arm/smmuv3-accel.h
> @@ -48,6 +48,7 @@ bool smmuv3_accel_install_nested_ste_range(SMMUv3State *s, SMMUSIDRange *range,
> Error **errp);
> bool smmuv3_accel_issue_inv_cmd(SMMUv3State *s, void *cmd, SMMUDevice *sdev,
> Error **errp);
> +void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s);
> #else
> static inline void smmuv3_accel_init(SMMUv3State *s)
> {
> @@ -70,6 +71,9 @@ smmuv3_accel_issue_inv_cmd(SMMUv3State *s, void *cmd, SMMUDevice *sdev,
> {
> return true;
> }
> +static inline void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s)
> +{
> +}
> #endif
>
> #endif /* HW_ARM_SMMUV3_ACCEL_H */
> diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
> index 5830cf5a03..94b2bbc374 100644
> --- a/hw/arm/smmuv3.c
> +++ b/hw/arm/smmuv3.c
> @@ -1913,6 +1913,7 @@ static void smmu_reset_exit(Object *obj, ResetType type)
> if (c->parent_phases.exit) {
> c->parent_phases.exit(obj, type);
> }
> + smmuv3_accel_attach_bypass_hwpt(s);
> }
>
> static void smmu_realize(DeviceState *d, Error **errp)
On Mon, Sep 29, 2025 at 02:36:35PM +0100, Shameer Kolothum wrote:
> When the guest reboots with devices in nested mode (S1 + S2), any QEMU/UEFI
> access to those devices can fail because S1 translation is not valid during
> the reboot. For example, a passthrough NVMe device may hold GRUB boot info
> that UEFI tries to read during the reboot.
>
> Set S1 to bypass mode during reset to avoid such failures.
GBPA is set to bypass on reset so I think it's fine. Yet, maybe the
code should check that.
> Reported-by: Matthew R. Ochs <mochs@nvidia.com>
> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> ---
> hw/arm/smmuv3-accel.c | 29 +++++++++++++++++++++++++++++
> hw/arm/smmuv3-accel.h | 4 ++++
> hw/arm/smmuv3.c | 1 +
> 3 files changed, 34 insertions(+)
>
> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
> index defeddbd8c..8396053a6c 100644
> --- a/hw/arm/smmuv3-accel.c
> +++ b/hw/arm/smmuv3-accel.c
> @@ -634,6 +634,35 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
> .get_msi_address_space = smmuv3_accel_find_msi_as,
> };
>
> +/*
> + * If the guest reboots and devices are configured for S1+S2, Stage1 must
> + * be switched to bypass. Otherwise, QEMU/UEFI may fail when accessing a
> + * device, e.g. when UEFI retrieves boot partition information from an
> + * assigned vfio-pci NVMe device.
> + */
> +void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s)
We could rename it to something like smmuv3_accel_reset().
Nicolin
On 10/17/25 1:19 AM, Nicolin Chen wrote:
> On Mon, Sep 29, 2025 at 02:36:35PM +0100, Shameer Kolothum wrote:
>> When the guest reboots with devices in nested mode (S1 + S2), any QEMU/UEFI
>> access to those devices can fail because S1 translation is not valid during
>> the reboot. For example, a passthrough NVMe device may hold GRUB boot info
>> that UEFI tries to read during the reboot.
>>
>> Set S1 to bypass mode during reset to avoid such failures.
> GBPA is set to bypass on reset so I think it's fine. Yet, maybe the
> code should check that.
Shouldn't we check its actual value before setting bypass?
By the way, the spec says that if ABORT is set to 0x0:
"Do not abort incoming transactions. Transactions bypass the SMMU with
attributes given by other fields in this register."
Wondering about those attributes and whether they can apply on the host?
Eric
>
>> Reported-by: Matthew R. Ochs <mochs@nvidia.com>
>> Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
>> ---
>> hw/arm/smmuv3-accel.c | 29 +++++++++++++++++++++++++++++
>> hw/arm/smmuv3-accel.h | 4 ++++
>> hw/arm/smmuv3.c | 1 +
>> 3 files changed, 34 insertions(+)
>>
>> diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
>> index defeddbd8c..8396053a6c 100644
>> --- a/hw/arm/smmuv3-accel.c
>> +++ b/hw/arm/smmuv3-accel.c
>> @@ -634,6 +634,35 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
>> .get_msi_address_space = smmuv3_accel_find_msi_as,
>> };
>>
>> +/*
>> + * If the guest reboots and devices are configured for S1+S2, Stage1 must
>> + * be switched to bypass. Otherwise, QEMU/UEFI may fail when accessing a
>> + * device, e.g. when UEFI retrieves boot partition information from an
>> + * assigned vfio-pci NVMe device.
>> + */
>> +void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s)
> We could rename it to something like smmuv3_accel_reset().
>
> Nicolin
>
On Mon, Oct 27, 2025 at 03:26:15PM +0100, Eric Auger wrote: > On 10/17/25 1:19 AM, Nicolin Chen wrote: > > On Mon, Sep 29, 2025 at 02:36:35PM +0100, Shameer Kolothum wrote: > >> When the guest reboots with devices in nested mode (S1 + S2), any QEMU/UEFI > >> access to those devices can fail because S1 translation is not valid during > >> the reboot. For example, a passthrough NVMe device may hold GRUB boot info > >> that UEFI tries to read during the reboot. > >> > >> Set S1 to bypass mode during reset to avoid such failures. > > GBPA is set to bypass on reset so I think it's fine. Yet, maybe the > > code should check that. > shouldn't we check its actual value before setting bypass? Yes, you are right. GBPA can be changed by the guest. So: "maybe" -> "should" > By the way the spec says is ABORT is set to 0x0: > "Do not abort incoming transactions. Transactions bypass the SMMU with > attributes given by other fields in this register." > > Wondering about those attributes and they can apply on the host? Not at this moment. vSTE only carries: * @ste: The first two double words of the user space Stream Table Entry for * the translation. Must be little-endian. * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD So, kernel needs to expand the word-1 to support those GBPA fields. I will send a kernel patch this week. Thanks Nicolin
> -----Original Message----- > From: Eric Auger <eric.auger@redhat.com> > Sent: 27 October 2025 14:26 > To: Nicolin Chen <nicolinc@nvidia.com>; Shameer Kolothum > <skolothumtho@nvidia.com> > Cc: qemu-arm@nongnu.org; qemu-devel@nongnu.org; > peter.maydell@linaro.org; Jason Gunthorpe <jgg@nvidia.com>; > ddutile@redhat.com; berrange@redhat.com; Nathan Chen > <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>; > smostafa@google.com; wangzhou1@hisilicon.com; > jiangkunkun@huawei.com; jonathan.cameron@huawei.com; > zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com; > shameerkolothum@gmail.com > Subject: Re: [PATCH v4 19/27] hw/arm/smmuv3-accel: Install S1 bypass hwpt > on reset > > External email: Use caution opening links or attachments > > > On 10/17/25 1:19 AM, Nicolin Chen wrote: > > On Mon, Sep 29, 2025 at 02:36:35PM +0100, Shameer Kolothum wrote: > >> When the guest reboots with devices in nested mode (S1 + S2), any > QEMU/UEFI > >> access to those devices can fail because S1 translation is not valid during > >> the reboot. For example, a passthrough NVMe device may hold GRUB boot > info > >> that UEFI tries to read during the reboot. > >> > >> Set S1 to bypass mode during reset to avoid such failures. > > GBPA is set to bypass on reset so I think it's fine. Yet, maybe the > > code should check that. > > shouldn't we check its actual value before setting bypass? > > By the way the spec says is ABORT is set to 0x0: > "Do not abort incoming transactions. Transactions bypass the SMMU with > attributes given by other fields in this register." > > Wondering about those attributes and they can apply on the host? That’s right. There are other attributes there. Currently kernel only support, * @ste: The first two double words of the user space Stream Table Entry for * the translation. Must be little-endian. 
* Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD If other attributes make sense, we may have to update kernel. I will add a note here, so that we can update it if required. I think Nicolin is looking into this. Thanks, Shameer
On Mon, Oct 27, 2025 at 07:51:15AM -0700, Shameer Kolothum wrote: > > On 10/17/25 1:19 AM, Nicolin Chen wrote: > > > On Mon, Sep 29, 2025 at 02:36:35PM +0100, Shameer Kolothum wrote: > > >> When the guest reboots with devices in nested mode (S1 + S2), any > > QEMU/UEFI > > >> access to those devices can fail because S1 translation is not valid during > > >> the reboot. For example, a passthrough NVMe device may hold GRUB boot > > info > > >> that UEFI tries to read during the reboot. > > >> > > >> Set S1 to bypass mode during reset to avoid such failures. > > > GBPA is set to bypass on reset so I think it's fine. Yet, maybe the > > > code should check that. > > > > shouldn't we check its actual value before setting bypass? > > > > By the way the spec says is ABORT is set to 0x0: > > "Do not abort incoming transactions. Transactions bypass the SMMU with > > attributes given by other fields in this register." > > > > Wondering about those attributes and they can apply on the host? > > That’s right. There are other attributes there. Currently kernel only > support, > > * @ste: The first two double words of the user space Stream Table Entry for > * the translation. Must be little-endian. > * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) > * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax > * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD > > If other attributes make sense, we may have to update kernel. I will add a note > here, so that we can update it if required. I think Nicolin is looking into this. According to SMMU spec 6.3 GBPA register's Additional information: - If SMMU_IDR1.ATTR_TYPES_OVR == 0, MTCFG, SHCFG, ALLOCCFG are effectively fixed as Use incoming and it is IMPLEMENTATION SPECIFIC whether these fields read as zero or a previously written value. In this case, MemAttr reads as UNKNOWN. 
- If SMMU_IDR1.ATTR_PERMS_OVR == 0, INSTCFG and PRIVCFG are effectively fixed as Use incoming and it is IMPLEMENTATION SPECIFIC whether these fields read as zero or a previously written value. On the other hand, QEMU seems to set both OVR fields to 0, so all those "other attributes" wouldn't be necessarily forwarded to the kernel? Nicolin
> -----Original Message----- > From: Nicolin Chen <nicolinc@nvidia.com> > Sent: 29 October 2025 04:27 > To: Shameer Kolothum <skolothumtho@nvidia.com> > Cc: eric.auger@redhat.com; qemu-arm@nongnu.org; qemu- > devel@nongnu.org; peter.maydell@linaro.org; Jason Gunthorpe > <jgg@nvidia.com>; ddutile@redhat.com; berrange@redhat.com; Nathan > Chen <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>; > smostafa@google.com; wangzhou1@hisilicon.com; > jiangkunkun@huawei.com; jonathan.cameron@huawei.com; > zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com; > shameerkolothum@gmail.com > Subject: Re: [PATCH v4 19/27] hw/arm/smmuv3-accel: Install S1 bypass hwpt > on reset > > On Mon, Oct 27, 2025 at 07:51:15AM -0700, Shameer Kolothum wrote: > > > On 10/17/25 1:19 AM, Nicolin Chen wrote: > > > > On Mon, Sep 29, 2025 at 02:36:35PM +0100, Shameer Kolothum wrote: > > > >> When the guest reboots with devices in nested mode (S1 + S2), any > > > QEMU/UEFI > > > >> access to those devices can fail because S1 translation is not valid during > > > >> the reboot. For example, a passthrough NVMe device may hold GRUB > boot > > > info > > > >> that UEFI tries to read during the reboot. > > > >> > > > >> Set S1 to bypass mode during reset to avoid such failures. > > > > GBPA is set to bypass on reset so I think it's fine. Yet, maybe the > > > > code should check that. > > > > > > shouldn't we check its actual value before setting bypass? > > > > > > By the way the spec says is ABORT is set to 0x0: > > > "Do not abort incoming transactions. Transactions bypass the SMMU with > > > attributes given by other fields in this register." > > > > > > Wondering about those attributes and they can apply on the host? > > > > That’s right. There are other attributes there. Currently kernel only > > support, > > > > * @ste: The first two double words of the user space Stream Table Entry for > > * the translation. Must be little-endian. 
> > * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) > > * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax > > * - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD > > > > If other attributes make sense, we may have to update kernel. I will add a > note > > here, so that we can update it if required. I think Nicolin is looking into this. > > According to SMMU spec 6.3 GBPA register's Additional information: > - If SMMU_IDR1.ATTR_TYPES_OVR == 0, MTCFG, SHCFG, ALLOCCFG are > effectively fixed as Use incoming and it is IMPLEMENTATION > SPECIFIC whether these fields read as zero or a previously > written value. In this case, MemAttr reads as UNKNOWN. > - If SMMU_IDR1.ATTR_PERMS_OVR == 0, INSTCFG and PRIVCFG are > effectively fixed as Use incoming and it is IMPLEMENTATION > SPECIFIC whether these fields read as zero or a previously > written value. > > On the other hand, QEMU seems to set both OVR fields to 0, so all > those "other attributes" wouldn't be necessarily forwarded to the > kernel? OK. Based on the QEMU OVR value, GBPA now resets to 0x1000, meaning SHCFG = 0b01 (Use incoming). However, in the current vSTE bypass/abort cases, SHCFG is set to 0b00 (Non-shareable). However, I think the SHCFG will be overridden by S2FWB. So, I don’t think we need to modify anything at this stage. In general, though, the kernel might need to propagate some of these attributes, possibly INSTCFG and PRIVCFG, since they are not overridden by S2FWB ? Thanks, Shameer
On Wed, Oct 29, 2025 at 11:19:59AM -0700, Shameer Kolothum wrote: > > According to SMMU spec 6.3 GBPA register's Additional information: > > - If SMMU_IDR1.ATTR_TYPES_OVR == 0, MTCFG, SHCFG, ALLOCCFG are > > effectively fixed as Use incoming and it is IMPLEMENTATION > > SPECIFIC whether these fields read as zero or a previously > > written value. In this case, MemAttr reads as UNKNOWN. > > - If SMMU_IDR1.ATTR_PERMS_OVR == 0, INSTCFG and PRIVCFG are > > effectively fixed as Use incoming and it is IMPLEMENTATION > > SPECIFIC whether these fields read as zero or a previously > > written value. > > > > On the other hand, QEMU seems to set both OVR fields to 0, so all > > those "other attributes" wouldn't be necessarily forwarded to the > > kernel? > > OK. Based on the QEMU OVR value, GBPA now resets to 0x1000, meaning > SHCFG = 0b01 (Use incoming). However, in the current vSTE bypass/abort > cases, SHCFG is set to 0b00 (Non-shareable). Ah, no, my bad. SHCFG will need to be forwarded, if the hw_info call reports that host SMMU has SMMU_IDR1.ATTR_TYPES_OVR == 1. So, the SHCFG=incoming has been the default case, but to support a non-incoming configuration, kernel needs to allow SHCFG in the vSTE. > However, I think the SHCFG will be overridden by S2FWB. I don't think S2FWB affects SHCFG. SHCFG has been set by kernel: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c?h=v6.18-rc3#n1681 > So, I don’t think we need to modify anything at this stage. In general, > though, the kernel might need to propagate some of these attributes, > possibly INSTCFG and PRIVCFG, since they are not overridden by S2FWB ? Yes. I have drafted a few patches, and will send soon. Thanks Nicolin
On Wed, Oct 29, 2025 at 10:28:35PM -0700, Nicolin Chen wrote: > On Wed, Oct 29, 2025 at 11:19:59AM -0700, Shameer Kolothum wrote: > > > According to SMMU spec 6.3 GBPA register's Additional information: > > > - If SMMU_IDR1.ATTR_TYPES_OVR == 0, MTCFG, SHCFG, ALLOCCFG are > > > effectively fixed as Use incoming and it is IMPLEMENTATION > > > SPECIFIC whether these fields read as zero or a previously > > > written value. In this case, MemAttr reads as UNKNOWN. > > > - If SMMU_IDR1.ATTR_PERMS_OVR == 0, INSTCFG and PRIVCFG are > > > effectively fixed as Use incoming and it is IMPLEMENTATION > > > SPECIFIC whether these fields read as zero or a previously > > > written value. > > > > > > On the other hand, QEMU seems to set both OVR fields to 0, so all > > > those "other attributes" wouldn't be necessarily forwarded to the > > > kernel? > > > > OK. Based on the QEMU OVR value, GBPA now resets to 0x1000, meaning > > SHCFG = 0b01 (Use incoming). However, in the current vSTE bypass/abort > > cases, SHCFG is set to 0b00 (Non-shareable). > > Ah, no, my bad. SHCFG will need to be forwarded, if the hw_info > call reports that host SMMU has SMMU_IDR1.ATTR_TYPES_OVR == 1. > > So, the SHCFG=incoming has been the default case, but to support > a non-incoming configuration, kernel needs to allow SHCFG in the > vSTE. > > > However, I think the SHCFG will be overridden by S2FWB. > > I don't think S2FWB affects SHCFG. SHCFG has been set by kernel: > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c?h=v6.18-rc3#n1681 Hmm, the table "13.5 Summary of attribute/permission configuration fields" in SMMU spec doesn't seem to show the complete picture.. 
I found the pseudo code in ARMv8 spec telling the details: shared/translation/attrs/S2AttrDecode // S2AttrDecode() // ============== // Converts the Stage 2 attribute fields into orthogonal attributes and hints MemoryAttributes S2AttrDecode(bits(2) SH, bits(4) attr, AccType acctype) MemoryAttributes memattrs; apply_force_writeback = HaveStage2MemAttrControl() && HCR_EL2.FWB == '1'; // Device memory if (apply_force_writeback && attr<2> == '0') || attr<3:2> == '00' then memattrs.memtype = MemType_Device; case attr<1:0> of when '00' memattrs.device = DeviceType_nGnRnE; when '01' memattrs.device = DeviceType_nGnRE; when '10' memattrs.device = DeviceType_nGRE; when '11' memattrs.device = DeviceType_GRE; // Normal memory elsif apply_force_writeback then if attr<2> == '1' then memattrs.memtype = MemType_Normal; memattrs.inner.attrs = attr<1:0>; memattrs.outer.attrs = attr<1:0>; memattrs.shareable = SH<1> == '1'; memattrs.outershareable = SH == '10'; elsif attr<1:0> != '00' then memattrs.memtype = MemType_Normal; memattrs.outer = S2ConvertAttrsHints(attr<3:2>, acctype); memattrs.inner = S2ConvertAttrsHints(attr<1:0>, acctype); memattrs.shareable = SH<1> == '1'; memattrs.outershareable = SH == '10'; else memattrs = MemoryAttributes UNKNOWN; // Reserved return MemAttrDefaults(memattrs); So, it seems that you are right. SHCFG will be overridden by S2FWB. However, we have CPU like Grace that doesn't have S2FWB.. Nicolin
On Thu, Oct 30, 2025 at 12:35:43AM -0700, Nicolin Chen wrote: > However, we have CPU like Grace that doesn't have S2FWB.. Those CPUs basically ignore all of these cachability attributes. Jason
> -----Original Message-----
> From: Nicolin Chen <nicolinc@nvidia.com>
> Sent: 17 October 2025 00:20
> To: Shameer Kolothum <skolothumtho@nvidia.com>
> Cc: qemu-arm@nongnu.org; qemu-devel@nongnu.org;
> eric.auger@redhat.com; peter.maydell@linaro.org; Jason Gunthorpe
> <jgg@nvidia.com>; ddutile@redhat.com; berrange@redhat.com; Nathan
> Chen <nathanc@nvidia.com>; Matt Ochs <mochs@nvidia.com>;
> smostafa@google.com; wangzhou1@hisilicon.com;
> jiangkunkun@huawei.com; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; zhenzhong.duan@intel.com; yi.l.liu@intel.com;
> shameerkolothum@gmail.com
> Subject: Re: [PATCH v4 19/27] hw/arm/smmuv3-accel: Install S1 bypass hwpt
> on reset
>
> On Mon, Sep 29, 2025 at 02:36:35PM +0100, Shameer Kolothum wrote:
> > When the guest reboots with devices in nested mode (S1 + S2), any
> > QEMU/UEFI access to those devices can fail because S1 translation is
> > not valid during the reboot. For example, a passthrough NVMe device
> > may hold GRUB boot info that UEFI tries to read during the reboot.
> >
> > Set S1 to bypass mode during reset to avoid such failures.
>
> GBPA is set to bypass on reset so I think it's fine. Yet, maybe the code should
> check that.
Looking at it again, I think GBPA is no longer reset to bypass, as I moved
smmuv3_init_regs() to smmu_realize() in patch #14 and it is not in the
smmu_reset_exit() path anymore.
I need to carve out the IDR init separately. I will do that in v5.
> > Reported-by: Matthew R. Ochs <mochs@nvidia.com>
> > Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
> > ---
> > hw/arm/smmuv3-accel.c | 29 +++++++++++++++++++++++++++++
> > hw/arm/smmuv3-accel.h | 4 ++++
> > hw/arm/smmuv3.c | 1 +
> > 3 files changed, 34 insertions(+)
> >
> > diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c index
> > defeddbd8c..8396053a6c 100644
> > --- a/hw/arm/smmuv3-accel.c
> > +++ b/hw/arm/smmuv3-accel.c
> > @@ -634,6 +634,35 @@ static const PCIIOMMUOps smmuv3_accel_ops = {
> > .get_msi_address_space = smmuv3_accel_find_msi_as, };
> >
> > +/*
> > + * If the guest reboots and devices are configured for S1+S2, Stage1
> > +must
> > + * be switched to bypass. Otherwise, QEMU/UEFI may fail when
> > +accessing a
> > + * device, e.g. when UEFI retrieves boot partition information from
> > +an
> > + * assigned vfio-pci NVMe device.
> > + */
> > +void smmuv3_accel_attach_bypass_hwpt(SMMUv3State *s)
>
> We could rename it to something like smmuv3_accel_reset().
Makes sense.
Thanks,
Shameer
On Mon, 29 Sep 2025 14:36:35 +0100 Shameer Kolothum <skolothumtho@nvidia.com> wrote: > When the guest reboots with devices in nested mode (S1 + S2), any QEMU/UEFI > access to those devices can fail because S1 translation is not valid during > the reboot. For example, a passthrough NVMe device may hold GRUB boot info > that UEFI tries to read during the reboot. > > Set S1 to bypass mode during reset to avoid such failures. > > Reported-by: Matthew R. Ochs <mochs@nvidia.com> > Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com> Seems reasonable. Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
© 2016 - 2025 Red Hat, Inc.