The VFIO IOMMU Type1 kernel driver enforces a default IOMMU mapping
limit of 65535, which is configurable via the 'dma_max_mappings'
module parameter. When this limit is reached, QEMU issues a warning
and fails the mapping operation, but allows the VM to continue
running, potentially causing issues later. This scenario occurs with
SEV-SNP guests, which must update all IOMMU mappings during
initialization.
To address this, update vfio_ram_discard_register_listener() to accept
an 'Error **' parameter and propagate the error to the caller. This
change will halt the VM immediately, at init time, with the same error
message.
Additionally, the same behavior will be enforced at runtime. While
this might be considered too brutal, the rarity of this case and the
planned removal of the dma_max_mappings module parameter make it a
reasonable approach.
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Cédric Le Goater <clg@redhat.com>
---
hw/vfio/listener.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
index 184c15e05fcb388cf0848e97e1eb283f17a50ad4..bc40ec8613c71a12b8c0dfdea497a14a446ac1fd 100644
--- a/hw/vfio/listener.c
+++ b/hw/vfio/listener.c
@@ -250,8 +250,9 @@ int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
return 0;
}
-static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
- MemoryRegionSection *section)
+static bool vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
+ MemoryRegionSection *section,
+ Error **errp)
{
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
int target_page_size = qemu_target_page_size();
@@ -316,13 +317,15 @@ static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
if (vrdl_mappings + max_memslots - vrdl_count >
bcontainer->dma_max_mappings) {
- warn_report("%s: possibly running out of DMA mappings. E.g., try"
+ error_setg(errp, "%s: possibly running out of DMA mappings. E.g., try"
" increasing the 'block-size' of virtio-mem devies."
" Maximum possible DMA mappings: %d, Maximum possible"
" memslots: %d", __func__, bcontainer->dma_max_mappings,
max_memslots);
+ return false;
}
}
+ return true;
}
static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer,
@@ -576,7 +579,9 @@ void vfio_container_region_add(VFIOContainerBase *bcontainer,
*/
if (memory_region_has_ram_discard_manager(section->mr)) {
if (!cpr_remap) {
- vfio_ram_discard_register_listener(bcontainer, section);
+ if (!vfio_ram_discard_register_listener(bcontainer, section, &err)) {
+ goto fail;
+ }
} else if (!vfio_cpr_ram_discard_replay_populated(bcontainer,
section)) {
error_setg(&err,
--
2.50.1
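
For readers less familiar with QEMU's error-handling conventions, the following is a minimal, self-contained sketch of the "return bool, take Error **errp" pattern the patch adopts. check_mappings(), its caller and the numbers are hypothetical; only error_setg() and error_report_err() are the actual QAPI error helpers.

/*
 * Sketch of the QEMU "return bool, take Error **errp" convention that
 * the patch switches vfio_ram_discard_register_listener() to.
 * check_mappings() and the numbers are made up for illustration.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

static bool check_mappings(unsigned int needed, unsigned int limit,
                           Error **errp)
{
    if (needed > limit) {
        /* On failure: set *errp and return false, nothing else. */
        error_setg(errp, "possibly running out of DMA mappings: %u > %u",
                   needed, limit);
        return false;
    }
    return true;
}

static void init_time_caller(void)
{
    Error *local_err = NULL;

    if (!check_mappings(70000, 65535, &local_err)) {
        /* At init time the caller can simply make the error fatal. */
        error_report_err(local_err);
        exit(EXIT_FAILURE);
    }
}

The policy lives entirely in the caller: the checked function only fills in *errp, and the call site decides whether that means "warn and continue" (the old behavior) or "fail the VM" (this patch).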
On 8/14/25 17:34, Cédric Le Goater wrote:
> The VFIO IOMMU Type1 kernel driver enforces a default IOMMU mapping
> limit of 65535, which is configurable via the 'dma_max_mappings'
> module parameter. When this limit is reached, QEMU issues a warning
> and fails the mapping operation, but allows the VM to continue
> running, potentially causing issues later. This scenario occurs with
> SEV-SNP guests, which must update all IOMMU mappings during
> initialization.
> [...]

Applied to vfio-next.

Thanks,

C.
On 2025/8/14 23:34, Cédric Le Goater wrote:
> The VFIO IOMMU Type1 kernel driver enforces a default IOMMU mapping
> limit of 65535, which is configurable via the 'dma_max_mappings'
@Alex, I've a long standing question, could you share why 65535 is the
magic number? :)
> module parameter. When this limit is reached, QEMU issues a warning
> and fails the mapping operation, but allows the VM to continue
> running, potentially causing issues later. This scenario occurs with
> SEV-SNP guests, which must update all IOMMU mappings during
> initialization.
>
> To address this, update vfio_ram_discard_register_listener() to accept
> an 'Error **' parameter and propagate the error to the caller. This
> change will halt the VM immediately, at init time, with the same error
> message.
>
> Additionally, the same behavior will be enforced at runtime. While
> this might be considered too brutal, the rarity of this case and the
> planned removal of the dma_max_mappings module parameter make it a
> reasonable approach.
>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Cédric Le Goater <clg@redhat.com>
> ---
> hw/vfio/listener.c | 13 +++++++++----
> 1 file changed, 9 insertions(+), 4 deletions(-)
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index 184c15e05fcb388cf0848e97e1eb283f17a50ad4..bc40ec8613c71a12b8c0dfdea497a14a446ac1fd 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -250,8 +250,9 @@ int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
> return 0;
> }
>
> -static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
> - MemoryRegionSection *section)
> +static bool vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
> + MemoryRegionSection *section,
> + Error **errp)
> {
> RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
> int target_page_size = qemu_target_page_size();
> @@ -316,13 +317,15 @@ static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
>
> if (vrdl_mappings + max_memslots - vrdl_count >
> bcontainer->dma_max_mappings) {
> - warn_report("%s: possibly running out of DMA mappings. E.g., try"
> + error_setg(errp, "%s: possibly running out of DMA mappings. E.g., try"
> " increasing the 'block-size' of virtio-mem devies."
> " Maximum possible DMA mappings: %d, Maximum possible"
> " memslots: %d", __func__, bcontainer->dma_max_mappings,
> max_memslots);
> + return false;
> }
> }
> + return true;
> }
>
> static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer,
> @@ -576,7 +579,9 @@ void vfio_container_region_add(VFIOContainerBase *bcontainer,
> */
> if (memory_region_has_ram_discard_manager(section->mr)) {
> if (!cpr_remap) {
> - vfio_ram_discard_register_listener(bcontainer, section);
> + if (!vfio_ram_discard_register_listener(bcontainer, section, &err)) {
> + goto fail;
> + }
> } else if (!vfio_cpr_ram_discard_replay_populated(bcontainer,
> section)) {
> error_setg(&err,
On Fri, 15 Aug 2025 18:49:22 +0800
Yi Liu <yi.l.liu@intel.com> wrote:
> On 2025/8/14 23:34, Cédric Le Goater wrote:
> > The VFIO IOMMU Type1 kernel driver enforces a default IOMMU mapping
> > limit of 65535, which is configurable via the 'dma_max_mappings'
>
> @Alex, I've a long standing question, could you share why 65535 is the
> magic number? :)
640^Hk is enough for anyone, right? ;)
We added this in response to a security issue where a user could
allocate an unlimited number of vfio_dma objects and, iirc, the thought
was that 64k entries was an absurdly high number for all typical cases
where we're making relatively few, relatively static DMA mappings,
which is effectively what the type1 interface is designed for. It
would be insanely inefficient to map the entire VM with 4K pages with
type1, right?! Enter confidential device assignment...
It's still a bad idea to use type1 this way, I'm just waiting for the
reports of slow VM startup with large memory VMs, however we might be
able to mitigate the security issue if we allocated the vfio_dma
objects with GFP_KERNEL_ACCOUNT. However, I think we also compounded
the problem in QEMU: when looking for the number of available mapping
entries, it assumes 64k if the limit capability isn't found, rather than
unlimited. So to unwind ourselves out of this jam, we might choose to
report UINT32_MAX and some additional mechanism to report unlimited, or
let QEMU fix itself, or we just advise that type1 is a bad interface
for this, and needing to adjust the limit is an indication of that, and
such use cases should migrate to better interfaces in IOMMUFD. Thanks,
Alex
> > module parameter. When this limit is reached, QEMU issues a warning
> > and fails the mapping operation, but allows the VM to continue
> > running, potentially causing issues later. This scenario occurs with
> > SEV-SNP guests, which must update all IOMMU mappings during
> > initialization.
> >
> > To address this, update vfio_ram_discard_register_listener() to accept
> > an 'Error **' parameter and propagate the error to the caller. This
> > change will halt the VM immediately, at init time, with the same error
> > message.
> >
> > Additionally, the same behavior will be enforced at runtime. While
> > this might be considered too brutal, the rarity of this case and the
> > planned removal of the dma_max_mappings module parameter make it a
> > reasonable approach.
> >
> > Cc: Alex Williamson <alex.williamson@redhat.com>
> > Signed-off-by: Cédric Le Goater <clg@redhat.com>
> > ---
> > hw/vfio/listener.c | 13 +++++++++----
> > 1 file changed, 9 insertions(+), 4 deletions(-)
>
> Reviewed-by: Yi Liu <yi.l.liu@intel.com>
>
> > diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> > index 184c15e05fcb388cf0848e97e1eb283f17a50ad4..bc40ec8613c71a12b8c0dfdea497a14a446ac1fd 100644
> > --- a/hw/vfio/listener.c
> > +++ b/hw/vfio/listener.c
> > @@ -250,8 +250,9 @@ int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
> > return 0;
> > }
> >
> > -static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
> > - MemoryRegionSection *section)
> > +static bool vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
> > + MemoryRegionSection *section,
> > + Error **errp)
> > {
> > RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
> > int target_page_size = qemu_target_page_size();
> > @@ -316,13 +317,15 @@ static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
> >
> > if (vrdl_mappings + max_memslots - vrdl_count >
> > bcontainer->dma_max_mappings) {
> > - warn_report("%s: possibly running out of DMA mappings. E.g., try"
> > + error_setg(errp, "%s: possibly running out of DMA mappings. E.g., try"
> > " increasing the 'block-size' of virtio-mem devies."
> > " Maximum possible DMA mappings: %d, Maximum possible"
> > " memslots: %d", __func__, bcontainer->dma_max_mappings,
> > max_memslots);
> > + return false;
> > }
> > }
> > + return true;
> > }
> >
> > static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer,
> > @@ -576,7 +579,9 @@ void vfio_container_region_add(VFIOContainerBase *bcontainer,
> > */
> > if (memory_region_has_ram_discard_manager(section->mr)) {
> > if (!cpr_remap) {
> > - vfio_ram_discard_register_listener(bcontainer, section);
> > + if (!vfio_ram_discard_register_listener(bcontainer, section, &err)) {
> > + goto fail;
> > + }
> > } else if (!vfio_cpr_ram_discard_replay_populated(bcontainer,
> > section)) {
> > error_setg(&err,
>
On 2025/8/15 22:20, Alex Williamson wrote:
> On Fri, 15 Aug 2025 18:49:22 +0800
> Yi Liu <yi.l.liu@intel.com> wrote:
>> On 2025/8/14 23:34, Cédric Le Goater wrote:
>>> The VFIO IOMMU Type1 kernel driver enforces a default IOMMU mapping
>>> limit of 65535, which is configurable via the 'dma_max_mappings'
>>
>> @Alex, I've a long standing question, could you share why 65535 is the
>> magic number? :)
>
> 640^Hk is enough for anyone, right? ;)
>
> We added this in response to a security issue where a user could
> allocate an unlimited number of vfio_dma objects and, iirc, the thought
> was that 64k entries was an absurdly high number for all typical cases
> where we're making relatively few, relatively static DMA mappings,
> which is effectively what the type1 interface is designed for. It
> would be insanely inefficient to map the entire VM with 4K pages with
> type1, right?! Enter confidential device assignment...

Yes, I remember some NIC passthrough scenarios hit the 65535 limit in
the past and passed after opting for a larger dma_max_mappings value.

> It's still a bad idea to use type1 this way, I'm just waiting for the
> reports of slow VM startup with large memory VMs, however we might be
> able to mitigate the security issue if we allocated the vfio_dma
> objects with GFP_KERNEL_ACCOUNT. However, I think we also compounded
> the problem in QEMU: when looking for the number of available mapping
> entries, it assumes 64k if the limit capability isn't found, rather than
> unlimited.

Yeah, an admin can program a smaller dma_max_mappings value on an old
kernel (a version before the dma_avail capability is reported). If so,
an existing QEMU may hit the dma_max_mappings limit while it believes it
has not.

> So to unwind ourselves out of this jam, we might choose to
> report UINT32_MAX and some additional mechanism to report unlimited, or
> let QEMU fix itself, or we just advise that type1 is a bad interface
> for this, and needing to adjust the limit is an indication of that, and
> such use cases should migrate to better interfaces in IOMMUFD. Thanks,

Thanks for the explanation. :)

Regards,
Yi Liu
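
For reference, below is a minimal userspace sketch of querying the remaining Type1 mapping count through the VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL capability discussed above. container_fd is assumed to be an open VFIO container with an IOMMU type already set; on kernels that predate the capability the query simply fails, which, per the discussion above, should not be read as a 64k limit.

/*
 * Sketch only: query the Type1 "remaining DMA mappings" count via
 * the VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL capability.
 */
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int query_dma_avail(int container_fd, uint32_t *avail)
{
    struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
    struct vfio_iommu_type1_info *info;
    struct vfio_info_cap_header *hdr;
    uint32_t off;

    /* First call only reports how large a buffer the cap chain needs. */
    if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, &probe) < 0) {
        return -1;
    }

    info = calloc(1, probe.argsz);
    if (!info) {
        return -1;
    }
    info->argsz = probe.argsz;

    if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info) < 0 ||
        !(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        free(info);
        return -1;
    }

    /* Walk the capability chain; offsets are relative to 'info'. */
    for (off = info->cap_offset; off; off = hdr->next) {
        hdr = (struct vfio_info_cap_header *)((char *)info + off);
        if (hdr->id == VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL) {
            struct vfio_iommu_type1_info_dma_avail *cap = (void *)hdr;

            *avail = cap->avail;
            free(info);
            return 0;
        }
    }

    free(info);
    return -1;    /* capability not reported: do not assume 64k */
}

QEMU performs an equivalent walk when it initializes bcontainer->dma_max_mappings, falling back to 65535 when the capability is absent, which is the assumption Alex questions above.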
On Thu, 14 Aug 2025 17:34:19 +0200
Cédric Le Goater <clg@redhat.com> wrote:
> The VFIO IOMMU Type1 kernel driver enforces a default IOMMU mapping
> limit of 65535, which is configurable via the 'dma_max_mappings'
> module parameter. When this limit is reached, QEMU issues a warning
> and fails the mapping operation, but allows the VM to continue
> running, potentially causing issues later. This scenario occurs with
> SEV-SNP guests, which must update all IOMMU mappings during
> initialization.
>
> To address this, update vfio_ram_discard_register_listener() to accept
> an 'Error **' parameter and propagate the error to the caller. This
> change will halt the VM immediately, at init time, with the same error
> message.
>
> Additionally, the same behavior will be enforced at runtime. While
> this might be considered too brutal, the rarity of this case and the
> planned removal of the dma_max_mappings module parameter make it a
> reasonable approach.
>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Cédric Le Goater <clg@redhat.com>
> ---
> hw/vfio/listener.c | 13 +++++++++----
> 1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c
> index 184c15e05fcb388cf0848e97e1eb283f17a50ad4..bc40ec8613c71a12b8c0dfdea497a14a446ac1fd 100644
> --- a/hw/vfio/listener.c
> +++ b/hw/vfio/listener.c
> @@ -250,8 +250,9 @@ int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
> return 0;
> }
>
> -static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
> - MemoryRegionSection *section)
> +static bool vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
> + MemoryRegionSection *section,
> + Error **errp)
> {
> RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
> int target_page_size = qemu_target_page_size();
> @@ -316,13 +317,15 @@ static void vfio_ram_discard_register_listener(VFIOContainerBase *bcontainer,
>
> if (vrdl_mappings + max_memslots - vrdl_count >
> bcontainer->dma_max_mappings) {
> - warn_report("%s: possibly running out of DMA mappings. E.g., try"
> + error_setg(errp, "%s: possibly running out of DMA mappings. E.g., try"
> " increasing the 'block-size' of virtio-mem devies."
> " Maximum possible DMA mappings: %d, Maximum possible"
> " memslots: %d", __func__, bcontainer->dma_max_mappings,
> max_memslots);
> + return false;
> }
> }
> + return true;
> }
>
> static void vfio_ram_discard_unregister_listener(VFIOContainerBase *bcontainer,
> @@ -576,7 +579,9 @@ void vfio_container_region_add(VFIOContainerBase *bcontainer,
> */
> if (memory_region_has_ram_discard_manager(section->mr)) {
> if (!cpr_remap) {
> - vfio_ram_discard_register_listener(bcontainer, section);
> + if (!vfio_ram_discard_register_listener(bcontainer, section, &err)) {
> + goto fail;
> + }
> } else if (!vfio_cpr_ram_discard_replay_populated(bcontainer,
> section)) {
> error_setg(&err,
LGTM
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>