From: Leon Romanovsky <leonro@nvidia.com>
dma-buf invalidation is handled asynchronously by the hardware, so VFIO
must wait until all affected objects have been fully invalidated.
In addition, the dma-buf exporter expects all importers to unmap any
buffers they previously mapped.
Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/vfio/pci/vfio_pci_dmabuf.c | 71 ++++++++++++++++++++++++++++++++++++--
1 file changed, 68 insertions(+), 3 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index d8ceafabef48..485515629fe4 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -17,6 +17,8 @@ struct vfio_pci_dma_buf {
struct dma_buf_phys_vec *phys_vec;
struct p2pdma_provider *provider;
u32 nr_ranges;
+ struct kref kref;
+ struct completion comp;
u8 revoked : 1;
};
@@ -44,27 +46,46 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
return 0;
}
+static void vfio_pci_dma_buf_done(struct kref *kref)
+{
+ struct vfio_pci_dma_buf *priv =
+ container_of(kref, struct vfio_pci_dma_buf, kref);
+
+ complete(&priv->comp);
+}
+
static struct sg_table *
vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
enum dma_data_direction dir)
{
struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+ struct sg_table *ret;
dma_resv_assert_held(priv->dmabuf->resv);
if (priv->revoked)
return ERR_PTR(-ENODEV);
- return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
- priv->phys_vec, priv->nr_ranges,
- priv->size, dir);
+ ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
+ priv->phys_vec, priv->nr_ranges,
+ priv->size, dir);
+ if (IS_ERR(ret))
+ return ret;
+
+ kref_get(&priv->kref);
+ return ret;
}
static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
struct sg_table *sgt,
enum dma_data_direction dir)
{
+ struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+
+ dma_resv_assert_held(priv->dmabuf->resv);
+
dma_buf_free_sgt(attachment, sgt, dir);
+ kref_put(&priv->kref, vfio_pci_dma_buf_done);
}
static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
@@ -287,6 +308,9 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
goto err_dev_put;
}
+ kref_init(&priv->kref);
+ init_completion(&priv->comp);
+
/* dma_buf_put() now frees priv */
INIT_LIST_HEAD(&priv->dmabufs_elm);
down_write(&vdev->memory_lock);
@@ -326,6 +350,8 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
lockdep_assert_held_write(&vdev->memory_lock);
list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ unsigned long wait;
+
if (!get_file_active(&priv->dmabuf->file))
continue;
@@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
dma_resv_lock(priv->dmabuf->resv, NULL);
priv->revoked = revoked;
dma_buf_invalidate_mappings(priv->dmabuf);
+ dma_resv_wait_timeout(priv->dmabuf->resv,
+ DMA_RESV_USAGE_BOOKKEEP, false,
+ MAX_SCHEDULE_TIMEOUT);
dma_resv_unlock(priv->dmabuf->resv);
+ if (revoked) {
+ kref_put(&priv->kref, vfio_pci_dma_buf_done);
+ /* Let's wait until all DMA unmaps have completed. */
+ wait = wait_for_completion_timeout(
+ &priv->comp, secs_to_jiffies(1));
+ /*
+ * If this WARN fires, it means the importer
+ * didn't call unmap in response to
+ * dma_buf_invalidate_mappings(), which is not
+ * allowed.
+ */
+ WARN(!wait,
+ "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
+ } else {
+ /*
+ * The kref is initialized again because, when revoke
+ * was performed, the reference counter was decreased
+ * to zero to trigger the completion.
+ */
+ kref_init(&priv->kref);
+ /*
+ * There is no need to wait as no mapping was
+ * performed when the previous status was
+ * priv->revoked == true.
+ */
+ reinit_completion(&priv->comp);
+ }
}
fput(priv->dmabuf->file);
}
@@ -346,6 +402,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
down_write(&vdev->memory_lock);
list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ unsigned long wait;
+
if (!get_file_active(&priv->dmabuf->file))
continue;
@@ -354,7 +412,14 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
priv->vdev = NULL;
priv->revoked = true;
dma_buf_invalidate_mappings(priv->dmabuf);
+ dma_resv_wait_timeout(priv->dmabuf->resv,
+ DMA_RESV_USAGE_BOOKKEEP, false,
+ MAX_SCHEDULE_TIMEOUT);
dma_resv_unlock(priv->dmabuf->resv);
+ kref_put(&priv->kref, vfio_pci_dma_buf_done);
+ wait = wait_for_completion_timeout(&priv->comp,
+ secs_to_jiffies(1));
+ WARN_ON(!wait);
vfio_device_put_registration(&vdev->vdev);
fput(priv->dmabuf->file);
}
--
2.52.0
On 1/24/26 20:14, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> dma-buf invalidation is handled asynchronously by the hardware, so VFIO
> must wait until all affected objects have been fully invalidated.
>
> In addition, the dma-buf exporter is expecting that all importers unmap any
> buffers they previously mapped.
>
> Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions")
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> drivers/vfio/pci/vfio_pci_dmabuf.c | 71 ++++++++++++++++++++++++++++++++++++--
> 1 file changed, 68 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index d8ceafabef48..485515629fe4 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -17,6 +17,8 @@ struct vfio_pci_dma_buf {
> struct dma_buf_phys_vec *phys_vec;
> struct p2pdma_provider *provider;
> u32 nr_ranges;
> + struct kref kref;
> + struct completion comp;
> u8 revoked : 1;
> };
>
> @@ -44,27 +46,46 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> return 0;
> }
>
> +static void vfio_pci_dma_buf_done(struct kref *kref)
> +{
> + struct vfio_pci_dma_buf *priv =
> + container_of(kref, struct vfio_pci_dma_buf, kref);
> +
> + complete(&priv->comp);
> +}
> +
> static struct sg_table *
> vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> enum dma_data_direction dir)
> {
> struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> + struct sg_table *ret;
>
> dma_resv_assert_held(priv->dmabuf->resv);
>
> if (priv->revoked)
> return ERR_PTR(-ENODEV);
>
> - return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> - priv->phys_vec, priv->nr_ranges,
> - priv->size, dir);
> + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> + priv->phys_vec, priv->nr_ranges,
> + priv->size, dir);
> + if (IS_ERR(ret))
> + return ret;
> +
> + kref_get(&priv->kref);
> + return ret;
> }
>
> static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> struct sg_table *sgt,
> enum dma_data_direction dir)
> {
> + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +
> + dma_resv_assert_held(priv->dmabuf->resv);
> +
> dma_buf_free_sgt(attachment, sgt, dir);
> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> }
>
> static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> @@ -287,6 +308,9 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> goto err_dev_put;
> }
>
> + kref_init(&priv->kref);
> + init_completion(&priv->comp);
> +
> /* dma_buf_put() now frees priv */
> INIT_LIST_HEAD(&priv->dmabufs_elm);
> down_write(&vdev->memory_lock);
> @@ -326,6 +350,8 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> lockdep_assert_held_write(&vdev->memory_lock);
>
> list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> + unsigned long wait;
> +
> if (!get_file_active(&priv->dmabuf->file))
> continue;
>
> @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> dma_resv_lock(priv->dmabuf->resv, NULL);
> priv->revoked = revoked;
> dma_buf_invalidate_mappings(priv->dmabuf);
> + dma_resv_wait_timeout(priv->dmabuf->resv,
> + DMA_RESV_USAGE_BOOKKEEP, false,
> + MAX_SCHEDULE_TIMEOUT);
> dma_resv_unlock(priv->dmabuf->resv);
> + if (revoked) {
> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> + /* Let's wait till all DMA unmap are completed. */
> + wait = wait_for_completion_timeout(
> + &priv->comp, secs_to_jiffies(1));
> + /*
> + * If you see this WARN_ON, it means that
> + * importer didn't call unmap in response to
> + * dma_buf_invalidate_mappings() which is not
> + * allowed.
> + */
> + WARN(!wait,
> + "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
You can do the revoke to do your resource management, for example re-use the backing store for something else.
But it is mandatory that you keep the mapping around indefinitely until the importer closes it.
Before that you can't do things like runtime PM or remove or anything which would make the DMA addresses invalid.
As far as I can see vfio_pci_dma_buf_move() is used exactly for that use case so this here is an absolutely clear NAK from my side for this approach.
You can either split up the functionality of vfio_pci_dma_buf_move() into vfio_pci_dma_buf_invalidate_mappings() and vfio_pci_dma_buf_flush() and then call the later whenever necessary or you keep it in one function and block everybody until the importer has dropped all mappings.
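For illustration, a rough sketch of such a split; the two helper names are
hypothetical and the bodies elide the get_file_active()/kref bookkeeping of
the real function:

static void vfio_pci_dma_buf_invalidate_mappings(struct vfio_pci_core_device *vdev,
						 bool revoked)
{
	struct vfio_pci_dma_buf *priv;

	list_for_each_entry(priv, &vdev->dmabufs, dmabufs_elm) {
		dma_resv_lock(priv->dmabuf->resv, NULL);
		priv->revoked = revoked;
		dma_buf_invalidate_mappings(priv->dmabuf);
		dma_resv_unlock(priv->dmabuf->resv);
	}
}

static void vfio_pci_dma_buf_flush(struct vfio_pci_core_device *vdev)
{
	struct vfio_pci_dma_buf *priv;

	list_for_each_entry(priv, &vdev->dmabufs, dmabufs_elm) {
		/* Block until every importer has dropped its mappings. */
		wait_for_completion(&priv->comp);
	}
}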
> + } else {
> + /*
> + * Kref is initialize again, because when revoke
> + * was performed the reference counter was decreased
> + * to zero to trigger completion.
> + */
> + kref_init(&priv->kref);
> + /*
> + * There is no need to wait as no mapping was
> + * performed when the previous status was
> + * priv->revoked == true.
> + */
> + reinit_completion(&priv->comp);
> + }
> }
> fput(priv->dmabuf->file);
This is also extremely questionable. Why doesn't the dmabuf have a reference while on the linked list?
Regards,
Christian.
> }
> @@ -346,6 +402,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
>
> down_write(&vdev->memory_lock);
> list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> + unsigned long wait;
> +
> if (!get_file_active(&priv->dmabuf->file))
> continue;
>
> @@ -354,7 +412,14 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> priv->vdev = NULL;
> priv->revoked = true;
> dma_buf_invalidate_mappings(priv->dmabuf);
> + dma_resv_wait_timeout(priv->dmabuf->resv,
> + DMA_RESV_USAGE_BOOKKEEP, false,
> + MAX_SCHEDULE_TIMEOUT);
> dma_resv_unlock(priv->dmabuf->resv);
> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> + wait = wait_for_completion_timeout(&priv->comp,
> + secs_to_jiffies(1));
> + WARN_ON(!wait);
> vfio_device_put_registration(&vdev->vdev);
> fput(priv->dmabuf->file);
> }
>
On Fri, Jan 30, 2026 at 09:30:59AM +0100, Christian König wrote:
> On 1/24/26 20:14, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > dma-buf invalidation is handled asynchronously by the hardware, so VFIO
> > must wait until all affected objects have been fully invalidated.
> >
> > In addition, the dma-buf exporter is expecting that all importers unmap any
> > buffers they previously mapped.
> >
> > Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions")
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > drivers/vfio/pci/vfio_pci_dmabuf.c | 71 ++++++++++++++++++++++++++++++++++++--
> > 1 file changed, 68 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > index d8ceafabef48..485515629fe4 100644
> > --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > @@ -17,6 +17,8 @@ struct vfio_pci_dma_buf {
> > struct dma_buf_phys_vec *phys_vec;
> > struct p2pdma_provider *provider;
> > u32 nr_ranges;
> > + struct kref kref;
> > + struct completion comp;
> > u8 revoked : 1;
> > };
> >
> > @@ -44,27 +46,46 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> > return 0;
> > }
> >
> > +static void vfio_pci_dma_buf_done(struct kref *kref)
> > +{
> > + struct vfio_pci_dma_buf *priv =
> > + container_of(kref, struct vfio_pci_dma_buf, kref);
> > +
> > + complete(&priv->comp);
> > +}
> > +
> > static struct sg_table *
> > vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> > enum dma_data_direction dir)
> > {
> > struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> > + struct sg_table *ret;
> >
> > dma_resv_assert_held(priv->dmabuf->resv);
> >
> > if (priv->revoked)
> > return ERR_PTR(-ENODEV);
> >
> > - return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> > - priv->phys_vec, priv->nr_ranges,
> > - priv->size, dir);
> > + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> > + priv->phys_vec, priv->nr_ranges,
> > + priv->size, dir);
> > + if (IS_ERR(ret))
> > + return ret;
> > +
> > + kref_get(&priv->kref);
> > + return ret;
> > }
> >
> > static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> > struct sg_table *sgt,
> > enum dma_data_direction dir)
> > {
> > + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> > +
> > + dma_resv_assert_held(priv->dmabuf->resv);
> > +
> > dma_buf_free_sgt(attachment, sgt, dir);
> > + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> > }
> >
> > static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> > @@ -287,6 +308,9 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> > goto err_dev_put;
> > }
> >
> > + kref_init(&priv->kref);
> > + init_completion(&priv->comp);
> > +
> > /* dma_buf_put() now frees priv */
> > INIT_LIST_HEAD(&priv->dmabufs_elm);
> > down_write(&vdev->memory_lock);
> > @@ -326,6 +350,8 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > lockdep_assert_held_write(&vdev->memory_lock);
> >
> > list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> > + unsigned long wait;
> > +
> > if (!get_file_active(&priv->dmabuf->file))
> > continue;
> >
> > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > dma_resv_lock(priv->dmabuf->resv, NULL);
> > priv->revoked = revoked;
> > dma_buf_invalidate_mappings(priv->dmabuf);
> > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > + DMA_RESV_USAGE_BOOKKEEP, false,
> > + MAX_SCHEDULE_TIMEOUT);
> > dma_resv_unlock(priv->dmabuf->resv);
> > + if (revoked) {
> > + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> > + /* Let's wait till all DMA unmap are completed. */
> > + wait = wait_for_completion_timeout(
> > + &priv->comp, secs_to_jiffies(1));
> > + /*
> > + * If you see this WARN_ON, it means that
> > + * importer didn't call unmap in response to
> > + * dma_buf_invalidate_mappings() which is not
> > + * allowed.
> > + */
> > + WARN(!wait,
> > + "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
>
> You can do the revoke to do your resource management, for example re-use the backing store for something else.
>
> But it is mandatory that you keep the mapping around indefinitely until the importer closes it.
>
> Before that you can't do things like runtime PM or remove or anything which would make the DMA addresses invalid.
>
> As far as I can see vfio_pci_dma_buf_move() is used exactly for that use case so this here is an absolutely clear NAK from my side for this approach.
>
> You can either split up the functionality of vfio_pci_dma_buf_move() into vfio_pci_dma_buf_invalidate_mappings() and vfio_pci_dma_buf_flush() and then call the later whenever necessary or you keep it in one function and block everybody until the importer has dropped all mappings.
No problem, I can change it to be:
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index d087d018d547..53772a84c93b 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -357,23 +357,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
dma_resv_unlock(priv->dmabuf->resv);
if (revoked) {
kref_put(&priv->kref, vfio_pci_dma_buf_done);
- /*
- * Let's wait for 1 second till all DMA unmap
- * are completed. It is supposed to catch dma-buf
- * importers which lied about their support
- * of dmabuf revoke. See dma_buf_invalidate_mappings()
- * for the expected behaviour.
- */
- wait = wait_for_completion_timeout(
- &priv->comp, secs_to_jiffies(1));
- /*
- * If you see this WARN_ON, it means that
- * importer didn't call unmap in response to
- * dma_buf_invalidate_mappings() which is not
- * allowed.
- */
- WARN(!wait,
- "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
+ wait_for_completion(&priv->comp);
} else {
/*
* Kref is initialize again, because when revoke
Do you want me to send v6?
Thanks
>
> > + } else {
> > + /*
> > + * Kref is initialize again, because when revoke
> > + * was performed the reference counter was decreased
> > + * to zero to trigger completion.
> > + */
> > + kref_init(&priv->kref);
> > + /*
> > + * There is no need to wait as no mapping was
> > + * performed when the previous status was
> > + * priv->revoked == true.
> > + */
> > + reinit_completion(&priv->comp);
> > + }
> > }
> > fput(priv->dmabuf->file);
>
> This is also extremely questionable. Why doesn't the dmabuf have a reference while on the linked list?
>
> Regards,
> Christian.
>
> > }
> > @@ -346,6 +402,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> >
> > down_write(&vdev->memory_lock);
> > list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> > + unsigned long wait;
> > +
> > if (!get_file_active(&priv->dmabuf->file))
> > continue;
> >
> > @@ -354,7 +412,14 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> > priv->vdev = NULL;
> > priv->revoked = true;
> > dma_buf_invalidate_mappings(priv->dmabuf);
> > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > + DMA_RESV_USAGE_BOOKKEEP, false,
> > + MAX_SCHEDULE_TIMEOUT);
> > dma_resv_unlock(priv->dmabuf->resv);
> > + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> > + wait = wait_for_completion_timeout(&priv->comp,
> > + secs_to_jiffies(1));
> > + WARN_ON(!wait);
> > vfio_device_put_registration(&vdev->vdev);
> > fput(priv->dmabuf->file);
> > }
> >
>
>
On 1/30/26 14:01, Leon Romanovsky wrote:
> On Fri, Jan 30, 2026 at 09:30:59AM +0100, Christian König wrote:
>> On 1/24/26 20:14, Leon Romanovsky wrote:
>>> From: Leon Romanovsky <leonro@nvidia.com>
>>>
>>> dma-buf invalidation is handled asynchronously by the hardware, so VFIO
>>> must wait until all affected objects have been fully invalidated.
>>>
>>> In addition, the dma-buf exporter is expecting that all importers unmap any
>>> buffers they previously mapped.
>>>
>>> Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions")
>>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
>>> ---
>>> drivers/vfio/pci/vfio_pci_dmabuf.c | 71 ++++++++++++++++++++++++++++++++++++--
>>> 1 file changed, 68 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
>>> index d8ceafabef48..485515629fe4 100644
>>> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
>>> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
>>> @@ -17,6 +17,8 @@ struct vfio_pci_dma_buf {
>>> struct dma_buf_phys_vec *phys_vec;
>>> struct p2pdma_provider *provider;
>>> u32 nr_ranges;
>>> + struct kref kref;
>>> + struct completion comp;
>>> u8 revoked : 1;
>>> };
>>>
>>> @@ -44,27 +46,46 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
>>> return 0;
>>> }
>>>
>>> +static void vfio_pci_dma_buf_done(struct kref *kref)
>>> +{
>>> + struct vfio_pci_dma_buf *priv =
>>> + container_of(kref, struct vfio_pci_dma_buf, kref);
>>> +
>>> + complete(&priv->comp);
>>> +}
>>> +
>>> static struct sg_table *
>>> vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
>>> enum dma_data_direction dir)
>>> {
>>> struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
>>> + struct sg_table *ret;
>>>
>>> dma_resv_assert_held(priv->dmabuf->resv);
>>>
>>> if (priv->revoked)
>>> return ERR_PTR(-ENODEV);
>>>
>>> - return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
>>> - priv->phys_vec, priv->nr_ranges,
>>> - priv->size, dir);
>>> + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
>>> + priv->phys_vec, priv->nr_ranges,
>>> + priv->size, dir);
>>> + if (IS_ERR(ret))
>>> + return ret;
>>> +
>>> + kref_get(&priv->kref);
>>> + return ret;
>>> }
>>>
>>> static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
>>> struct sg_table *sgt,
>>> enum dma_data_direction dir)
>>> {
>>> + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
>>> +
>>> + dma_resv_assert_held(priv->dmabuf->resv);
>>> +
>>> dma_buf_free_sgt(attachment, sgt, dir);
>>> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
>>> }
>>>
>>> static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
>>> @@ -287,6 +308,9 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
>>> goto err_dev_put;
>>> }
>>>
>>> + kref_init(&priv->kref);
>>> + init_completion(&priv->comp);
>>> +
>>> /* dma_buf_put() now frees priv */
>>> INIT_LIST_HEAD(&priv->dmabufs_elm);
>>> down_write(&vdev->memory_lock);
>>> @@ -326,6 +350,8 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
>>> lockdep_assert_held_write(&vdev->memory_lock);
>>>
>>> list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
>>> + unsigned long wait;
>>> +
>>> if (!get_file_active(&priv->dmabuf->file))
>>> continue;
>>>
>>> @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
>>> dma_resv_lock(priv->dmabuf->resv, NULL);
>>> priv->revoked = revoked;
>>> dma_buf_invalidate_mappings(priv->dmabuf);
>>> + dma_resv_wait_timeout(priv->dmabuf->resv,
>>> + DMA_RESV_USAGE_BOOKKEEP, false,
>>> + MAX_SCHEDULE_TIMEOUT);
>>> dma_resv_unlock(priv->dmabuf->resv);
>>> + if (revoked) {
>>> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
>>> + /* Let's wait till all DMA unmap are completed. */
>>> + wait = wait_for_completion_timeout(
>>> + &priv->comp, secs_to_jiffies(1));
>>> + /*
>>> + * If you see this WARN_ON, it means that
>>> + * importer didn't call unmap in response to
>>> + * dma_buf_invalidate_mappings() which is not
>>> + * allowed.
>>> + */
>>> + WARN(!wait,
>>> + "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
>>
>> You can do the revoke to do your resource management, for example re-use the backing store for something else.
>>
>> But it is mandatory that you keep the mapping around indefinitely until the importer closes it.
>>
>> Before that you can't do things like runtime PM or remove or anything which would make the DMA addresses invalid.
>>
>> As far as I can see vfio_pci_dma_buf_move() is used exactly for that use case so this here is an absolutely clear NAK from my side for this approach.
>>
>> You can either split up the functionality of vfio_pci_dma_buf_move() into vfio_pci_dma_buf_invalidate_mappings() and vfio_pci_dma_buf_flush() and then call the later whenever necessary or you keep it in one function and block everybody until the importer has dropped all mappings.
>
> No problem, I can change it to be:
>
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index d087d018d547..53772a84c93b 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -357,23 +357,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> dma_resv_unlock(priv->dmabuf->resv);
> if (revoked) {
> kref_put(&priv->kref, vfio_pci_dma_buf_done);
> - /*
> - * Let's wait for 1 second till all DMA unmap
> - * are completed. It is supposed to catch dma-buf
> - * importers which lied about their support
> - * of dmabuf revoke. See dma_buf_invalidate_mappings()
> - * for the expected behaviour.
> - */
> - wait = wait_for_completion_timeout(
> - &priv->comp, secs_to_jiffies(1));
> - /*
> - * If you see this WARN_ON, it means that
> - * importer didn't call unmap in response to
> - * dma_buf_invalidate_mappings() which is not
> - * allowed.
> - */
> - WARN(!wait,
> - "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
> + wait_for_completion(&priv->comp);
> } else {
> /*
> * Kref is initialize again, because when revoke
>
> Do you want me to send v6?
That would work for me.
Question is if you really want to do it this way? See usually exporters try to avoid blocking such functions.
What exporters usually do instead is to grab references, e.g. call pm_runtime_get_sync() when either a DMA-buf, a DMA-buf attachment or in your case here a mapping of this attachment is made.
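For instance, a minimal sketch of that reference pattern; everything named
example_* is hypothetical, only the runtime-PM calls are real:

#include <linux/dma-buf.h>
#include <linux/pm_runtime.h>

static struct sg_table *
example_map(struct dma_buf_attachment *attachment, enum dma_data_direction dir)
{
	struct example_priv *priv = attachment->dmabuf->priv;
	struct sg_table *sgt;
	int ret;

	/* Keep the device powered for as long as this mapping exists. */
	ret = pm_runtime_get_sync(priv->dev);
	if (ret < 0) {
		pm_runtime_put_noidle(priv->dev);
		return ERR_PTR(ret);
	}

	sgt = example_build_sgt(attachment, dir);
	if (IS_ERR(sgt))
		pm_runtime_put(priv->dev);
	return sgt;
}

static void example_unmap(struct dma_buf_attachment *attachment,
			  struct sg_table *sgt, enum dma_data_direction dir)
{
	struct example_priv *priv = attachment->dmabuf->priv;

	example_free_sgt(attachment, sgt, dir);
	/* Drop the runtime PM reference taken at map time. */
	pm_runtime_put(priv->dev);
}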
But all of this is just a suggestion, if you are fine with blocking then feel free to add my rb.
Regards,
Christian.
>
> Thanks
>
>>
>>> + } else {
>>> + /*
>>> + * Kref is initialize again, because when revoke
>>> + * was performed the reference counter was decreased
>>> + * to zero to trigger completion.
>>> + */
>>> + kref_init(&priv->kref);
>>> + /*
>>> + * There is no need to wait as no mapping was
>>> + * performed when the previous status was
>>> + * priv->revoked == true.
>>> + */
>>> + reinit_completion(&priv->comp);
>>> + }
>>> }
>>> fput(priv->dmabuf->file);
>>
>> This is also extremely questionable. Why doesn't the dmabuf have a reference while on the linked list?
>>
>> Regards,
>> Christian.
>>
>>> }
>>> @@ -346,6 +402,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
>>>
>>> down_write(&vdev->memory_lock);
>>> list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
>>> + unsigned long wait;
>>> +
>>> if (!get_file_active(&priv->dmabuf->file))
>>> continue;
>>>
>>> @@ -354,7 +412,14 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
>>> priv->vdev = NULL;
>>> priv->revoked = true;
>>> dma_buf_invalidate_mappings(priv->dmabuf);
>>> + dma_resv_wait_timeout(priv->dmabuf->resv,
>>> + DMA_RESV_USAGE_BOOKKEEP, false,
>>> + MAX_SCHEDULE_TIMEOUT);
>>> dma_resv_unlock(priv->dmabuf->resv);
>>> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
>>> + wait = wait_for_completion_timeout(&priv->comp,
>>> + secs_to_jiffies(1));
>>> + WARN_ON(!wait);
>>> vfio_device_put_registration(&vdev->vdev);
>>> fput(priv->dmabuf->file);
>>> }
>>>
>>
>>
On Fri, Jan 30, 2026 at 02:21:08PM +0100, Christian König wrote:
> That would work for me.
>
> Question is if you really want to do it this way? See usually
> exporters try to avoid blocking such functions.

Yes, it has to be this way, revoke is a synchronous user space
triggered operation around things like FLR or device close. We can't
defer it into some background operation like pm.

> >>> }
> >>> fput(priv->dmabuf->file);
> >>
> >> This is also extremely questionable. Why doesn't the dmabuf have
> >> a reference while on the linked list?

If we hold a refcount while on the list then the FD can never be
closed.

There is locking protecting the list so that it is safe and close
continues to work right.

Jason
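In code, the pattern being described looks roughly like this (condensed from
the patch above, not a verbatim copy):

	/*
	 * The list itself is protected by memory_lock, not by a
	 * long-lived file reference.
	 */
	down_write(&vdev->memory_lock);
	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
		/*
		 * Take a temporary reference only if the dmabuf file is
		 * still alive; if its last fd was already closed, skip it
		 * and let the release path drop it from the list.
		 */
		if (!get_file_active(&priv->dmabuf->file))
			continue;

		/* ... revoke/invalidate under the dmabuf's resv lock ... */

		fput(priv->dmabuf->file);
	}
	up_write(&vdev->memory_lock);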
On 1/30/26 14:56, Jason Gunthorpe wrote:
> On Fri, Jan 30, 2026 at 02:21:08PM +0100, Christian König wrote:
>
>> That would work for me.
>>
>> Question is if you really want to do it this way? See usually
>> exporters try to avoid blocking such functions.
>
> Yes, it has to be this way, revoke is a synchronous user space
> triggered operation around things like FLR or device close. We can't
> defer it into some background operation like pm.

Yeah, but you only need that in a couple of use cases and not all.

Especially when the device is idle exporters usually don't want runtime
PM to kick in and try to suspend the device while importers are still
using it.

>
>>>>> }
>>>>> fput(priv->dmabuf->file);
>>>>
>>>> This is also extremely questionable. Why doesn't the dmabuf have
>>>> a reference while on the linked list?
>
> If we hold a refcount while on the list then the FD can never be
> closed.
>
> There is locking protecting the list so that it is safe and close
> continues to work right.

Ah! You use get_file_active(), yeah that should work.

Thanks,
Christian.

>
> Jason
On Fri, Jan 30, 2026 at 03:11:48PM +0100, Christian König wrote:
> On 1/30/26 14:56, Jason Gunthorpe wrote:
> > On Fri, Jan 30, 2026 at 02:21:08PM +0100, Christian König wrote:
> >
> >> That would work for me.
> >>
> >> Question is if you really want to do it this way? See usually
> >> exporters try to avoid blocking such functions.
> >
> > Yes, it has to be this way, revoke is a synchronous user space
> > triggered operation around things like FLR or device close. We can't
> > defer it into some background operation like pm.
>
> Yeah, but you only need that in a couple of use cases and not all.

Not all, that is why the dma_buf_attach_revocable() is there to
distinguish this case from others.

Jason
On 1/30/26 15:44, Jason Gunthorpe wrote:
> On Fri, Jan 30, 2026 at 03:11:48PM +0100, Christian König wrote:
>> On 1/30/26 14:56, Jason Gunthorpe wrote:
>>> On Fri, Jan 30, 2026 at 02:21:08PM +0100, Christian König wrote:
>>>
>>>> That would work for me.
>>>>
>>>> Question is if you really want to do it this way? See usually
>>>> exporters try to avoid blocking such functions.
>>>
>>> Yes, it has to be this way, revoke is a synchronous user space
>>> triggered operation around things like FLR or device close. We can't
>>> defer it into some background operation like pm.
>>
>> Yeah, but you only need that in a couple of use cases and not all.
>
> Not all, that is why the dma_buf_attach_revocable() is there to
> distinguish this case from others.

No, no that's not what I mean.

See on the one hand you have runtime PM which automatically shuts
down your device after some time when the last user stops using it.

Then on the other hand you have an intentional revoke triggered by
userspace.

As far as I've read up on the code currently both are handled the
same way, and that doesn't sound correct to me.

Runtime PM should *not* trigger automatically when there are still
mappings or even DMA-bufs in existence for the VFIO device.

Regards,
Christian.

>
> Jason
On Mon, Feb 02, 2026 at 09:42:22AM +0100, Christian König wrote:
> On 1/30/26 15:44, Jason Gunthorpe wrote:
> > On Fri, Jan 30, 2026 at 03:11:48PM +0100, Christian König wrote:
> >> On 1/30/26 14:56, Jason Gunthorpe wrote:
> >>> On Fri, Jan 30, 2026 at 02:21:08PM +0100, Christian König wrote:
> >>>
> >>>> That would work for me.
> >>>>
> >>>> Question is if you really want to do it this way? See usually
> >>>> exporters try to avoid blocking such functions.
> >>>
> >>> Yes, it has to be this way, revoke is a synchronous user space
> >>> triggered operation around things like FLR or device close. We can't
> >>> defer it into some background operation like pm.
> >>
> >> Yeah, but you only need that in a couple of use cases and not all.
> >
> > Not all, that is why the dma_buf_attach_revocable() is there to
> > distinguish this case from others.
>
> No, no that's not what I mean.
>
> See on the one hand you have runtime PM which automatically shuts
> down your device after some time when the last user stops using it.
>
> Then on the other hand you have an intentional revoke triggered by
> userspace.
>
> As far as I've read up on the code currently both are handled the
> same way, and that doesn't sound correct to me.
>
> Runtime PM should *not* trigger automatically when there are still
> mappings or even DMA-bufs in existence for the VFIO device.

I'm a little confused why we are talking about runtime PM - are you
pointing out an issue in VFIO today where its PM support is not good?

I admit I don't know a lot about VFIO PM support.. Though I thought in
the VFIO case PM was actually under userspace control as generally the
PM control is delegated to the VM.

Through that lens, what is happening here is correct. If the VM
requests to shut down VFIO PM (through a hypervisor vfio ioctl) then
we do want to revoke the DMABUF so that the VM can't trigger an AER/etc
by trying to access the sleeping PCI device.

I don't think VFIO uses automatic PM on a timer, that doesn't make
sense for its programming model.

Jason
On 2/2/26 16:12, Jason Gunthorpe wrote:
> On Mon, Feb 02, 2026 at 09:42:22AM +0100, Christian König wrote:
>> On 1/30/26 15:44, Jason Gunthorpe wrote:
>>> On Fri, Jan 30, 2026 at 03:11:48PM +0100, Christian König wrote:
>>>> On 1/30/26 14:56, Jason Gunthorpe wrote:
>>>>> On Fri, Jan 30, 2026 at 02:21:08PM +0100, Christian König wrote:
>>>>>
>>>>>> That would work for me.
>>>>>>
>>>>>> Question is if you really want to do it this way? See usually
>>>>>> exporters try to avoid blocking such functions.
>>>>>
>>>>> Yes, it has to be this way, revoke is a synchronous user space
>>>>> triggered operation around things like FLR or device close. We can't
>>>>> defer it into some background operation like pm.
>>>>
>>>> Yeah, but you only need that in a couple of use cases and not all.
>>>
>>> Not all, that is why the dma_buf_attach_revocable() is there to
>>> distinguish this case from others.
>>
>> No, no that's not what I mean.
>>
>> See on the one hand you have runtime PM which automatically shuts
>> down your device after some time when the last user stops using it.
>>
>> Then on the other hand you have an intentional revoke triggered by
>> userspace.
>>
>> As far as I've read up on the code currently both are handled the
>> same way, and that doesn't sound correct to me.
>>
>> Runtime PM should *not* trigger automatically when there are still
>> mappings or even DMA-bufs in existence for the VFIO device.
>
> I'm a little confused why we are talking about runtime PM - are you
> pointing out an issue in VFIO today where its PM support is not good?

Exactly that, yes. This patch set here doesn't break it, but most likely
makes the effect quite a bit worse.

> I admit I don't know a lot about VFIO PM support.. Though I thought in
> the VFIO case PM was actually under userspace control as generally the
> PM control is delegated to the VM.
>
> Through that lens, what is happening here is correct. If the VM
> requests to shut down VFIO PM (through a hypervisor vfio ioctl) then
> we do want to revoke the DMABUF so that the VM can't trigger an AER/etc
> by trying to access the sleeping PCI device.
>
> I don't think VFIO uses automatic PM on a timer, that doesn't make
> sense for its programming model.

From your description I agree that this doesn't make sense, but from
the code it looks like exactly that is done.

Grep for pm_runtime_* on drivers/vfio/pci, but could be that I
misunderstood the functionality, e.g. didn't spend too much time on it.

Just keep it in the back of your mind and maybe double check if that
is actually the desired behavior.

Regards,
Christian.

> Jason
On Mon, Feb 02, 2026 at 04:21:50PM +0100, Christian König wrote:
> > I admit I don't know a lot about VFIO PM support.. Though I thought in
> > the VFIO case PM was actually under userspace control as generally the
> > PM control is delegated to the VM.
> >
> > Through that lens, what is happening here is correct. If the VM
> > requests to shut down VFIO PM (through a hypervisor vfio ioctl) then
> > we do want to revoke the DMABUF so that the VM can't trigger an AER/etc
> > by trying to access the sleeping PCI device.
> >
> > I don't think VFIO uses automatic PM on a timer, that doesn't make
> > sense for its programming model.
>
> From your description I agree that this doesn't make sense, but from
> the code it looks like exactly that is done.
>
> Grep for pm_runtime_* on drivers/vfio/pci, but could be that I
> misunderstood the functionality, e.g. didn't spend too much time on it.
>
> Just keep it in the back of your mind and maybe double check if that
> is actually the desired behavior.

I had a small conversation with AlexW and we think VFIO is OK (bugs
excluded). The use of the PM timer is still under userspace control,
even though a timer is still involved.

Basically there are a series of IOCTLs defined in VFIO, like
LOW_POWER_ENTRY, that all isolate the PCI device from userspace. The
mmap is blocked with SIGBUS and the DMABUFs are revoked. The VFIO uAPI
contract requires userspace to stop touching the device immediately
when using these IOCTLs. The PM timer may still be involved, but is an
implementation detail.

Effectively VFIO has a device state "isolated" meaning that userspace
cannot access the MMIO, and it enters this state based on various
IOCTLs from userspace. It ties mmap and DMABUF together so that if
mmap SIGBUS's the DMABUF is unmapped.

I understand your remarks, and this use of PM is certainly nothing that
any other driver should copy, but it does make sense for VFIO. If there
are bugs/issues we would continue to keep the overall property that
SIGBUS == DMABUF unmapped and only adjust when that happens.

TBH, I don't think people use the VFIO PM feature very much.

Jason
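As a rough sketch of that flow (the two wrapper names are illustrative; only
vfio_pci_dma_buf_move() and memory_lock come from the actual code):

static void example_enter_isolated_state(struct vfio_pci_core_device *vdev)
{
	/* mmap faults now SIGBUS; importers are told to unmap. */
	down_write(&vdev->memory_lock);
	vfio_pci_dma_buf_move(vdev, true);
	up_write(&vdev->memory_lock);
}

static void example_exit_isolated_state(struct vfio_pci_core_device *vdev)
{
	/* MMIO is reachable again; importers may re-map on next use. */
	down_write(&vdev->memory_lock);
	vfio_pci_dma_buf_move(vdev, false);
	up_write(&vdev->memory_lock);
}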
On Fri, Jan 30, 2026 at 02:21:08PM +0100, Christian König wrote:
> On 1/30/26 14:01, Leon Romanovsky wrote:
> > On Fri, Jan 30, 2026 at 09:30:59AM +0100, Christian König wrote:
> >> On 1/24/26 20:14, Leon Romanovsky wrote:
> >>> From: Leon Romanovsky <leonro@nvidia.com>
> >>>
> >>> dma-buf invalidation is handled asynchronously by the hardware, so VFIO
> >>> must wait until all affected objects have been fully invalidated.
> >>>
> >>> In addition, the dma-buf exporter is expecting that all importers unmap any
> >>> buffers they previously mapped.
> >>>
> >>> Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions")
> >>> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> >>> ---
> >>> drivers/vfio/pci/vfio_pci_dmabuf.c | 71 ++++++++++++++++++++++++++++++++++++--
> >>> 1 file changed, 68 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> >>> index d8ceafabef48..485515629fe4 100644
> >>> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> >>> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> >>> @@ -17,6 +17,8 @@ struct vfio_pci_dma_buf {
> >>> struct dma_buf_phys_vec *phys_vec;
> >>> struct p2pdma_provider *provider;
> >>> u32 nr_ranges;
> >>> + struct kref kref;
> >>> + struct completion comp;
> >>> u8 revoked : 1;
> >>> };
> >>>
> >>> @@ -44,27 +46,46 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> >>> return 0;
> >>> }
> >>>
> >>> +static void vfio_pci_dma_buf_done(struct kref *kref)
> >>> +{
> >>> + struct vfio_pci_dma_buf *priv =
> >>> + container_of(kref, struct vfio_pci_dma_buf, kref);
> >>> +
> >>> + complete(&priv->comp);
> >>> +}
> >>> +
> >>> static struct sg_table *
> >>> vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> >>> enum dma_data_direction dir)
> >>> {
> >>> struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> >>> + struct sg_table *ret;
> >>>
> >>> dma_resv_assert_held(priv->dmabuf->resv);
> >>>
> >>> if (priv->revoked)
> >>> return ERR_PTR(-ENODEV);
> >>>
> >>> - return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> >>> - priv->phys_vec, priv->nr_ranges,
> >>> - priv->size, dir);
> >>> + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> >>> + priv->phys_vec, priv->nr_ranges,
> >>> + priv->size, dir);
> >>> + if (IS_ERR(ret))
> >>> + return ret;
> >>> +
> >>> + kref_get(&priv->kref);
> >>> + return ret;
> >>> }
> >>>
> >>> static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> >>> struct sg_table *sgt,
> >>> enum dma_data_direction dir)
> >>> {
> >>> + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> >>> +
> >>> + dma_resv_assert_held(priv->dmabuf->resv);
> >>> +
> >>> dma_buf_free_sgt(attachment, sgt, dir);
> >>> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> >>> }
> >>>
> >>> static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> >>> @@ -287,6 +308,9 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> >>> goto err_dev_put;
> >>> }
> >>>
> >>> + kref_init(&priv->kref);
> >>> + init_completion(&priv->comp);
> >>> +
> >>> /* dma_buf_put() now frees priv */
> >>> INIT_LIST_HEAD(&priv->dmabufs_elm);
> >>> down_write(&vdev->memory_lock);
> >>> @@ -326,6 +350,8 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> >>> lockdep_assert_held_write(&vdev->memory_lock);
> >>>
> >>> list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> >>> + unsigned long wait;
> >>> +
> >>> if (!get_file_active(&priv->dmabuf->file))
> >>> continue;
> >>>
> >>> @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> >>> dma_resv_lock(priv->dmabuf->resv, NULL);
> >>> priv->revoked = revoked;
> >>> dma_buf_invalidate_mappings(priv->dmabuf);
> >>> + dma_resv_wait_timeout(priv->dmabuf->resv,
> >>> + DMA_RESV_USAGE_BOOKKEEP, false,
> >>> + MAX_SCHEDULE_TIMEOUT);
> >>> dma_resv_unlock(priv->dmabuf->resv);
> >>> + if (revoked) {
> >>> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> >>> + /* Let's wait till all DMA unmap are completed. */
> >>> + wait = wait_for_completion_timeout(
> >>> + &priv->comp, secs_to_jiffies(1));
> >>> + /*
> >>> + * If you see this WARN_ON, it means that
> >>> + * importer didn't call unmap in response to
> >>> + * dma_buf_invalidate_mappings() which is not
> >>> + * allowed.
> >>> + */
> >>> + WARN(!wait,
> >>> + "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
> >>
> >> You can do the revoke to do your resource management, for example re-use the backing store for something else.
> >>
> >> But it is mandatory that you keep the mapping around indefinitely until the importer closes it.
> >>
> >> Before that you can't do things like runtime PM or remove or anything which would make the DMA addresses invalid.
> >>
> >> As far as I can see vfio_pci_dma_buf_move() is used exactly for that use case so this here is an absolutely clear NAK from my side for this approach.
> >>
> >> You can either split up the functionality of vfio_pci_dma_buf_move() into vfio_pci_dma_buf_invalidate_mappings() and vfio_pci_dma_buf_flush() and then call the later whenever necessary or you keep it in one function and block everybody until the importer has dropped all mappings.
> >
> > No problem, I can change it to be:
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > index d087d018d547..53772a84c93b 100644
> > --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> > +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> > @@ -357,23 +357,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > dma_resv_unlock(priv->dmabuf->resv);
> > if (revoked) {
> > kref_put(&priv->kref, vfio_pci_dma_buf_done);
> > - /*
> > - * Let's wait for 1 second till all DMA unmap
> > - * are completed. It is supposed to catch dma-buf
> > - * importers which lied about their support
> > - * of dmabuf revoke. See dma_buf_invalidate_mappings()
> > - * for the expected behaviour.
> > - */
> > - wait = wait_for_completion_timeout(
> > - &priv->comp, secs_to_jiffies(1));
> > - /*
> > - * If you see this WARN_ON, it means that
> > - * importer didn't call unmap in response to
> > - * dma_buf_invalidate_mappings() which is not
> > - * allowed.
> > - */
> > - WARN(!wait,
> > - "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
> > + wait_for_completion(&priv->comp);
> > } else {
> > /*
> > * Kref is initialize again, because when revoke
> >
> > Do you want me to send v6?
>
> That would work for me.
>
> Question is if you really want to do it this way? See usually exporters try to avoid blocking such functions.
>
> What exporters usually do instead is to grab references, e.g. call pm_runtime_get_sync() when either a DMA-buf, a DMA-buf attachment or in your case here a mapping of this attachment is made.
I view this as an enhancement that can be addressed later down the road.
>
> But all of this is just a suggestion, if you are fine with blocking then feel free to add my rb.
It is fine for the initial version. We need to start somewhere.
Thanks
>
> Regards,
> Christian.
>
> >
> > Thanks
> >
> >>
> >>> + } else {
> >>> + /*
> >>> + * Kref is initialize again, because when revoke
> >>> + * was performed the reference counter was decreased
> >>> + * to zero to trigger completion.
> >>> + */
> >>> + kref_init(&priv->kref);
> >>> + /*
> >>> + * There is no need to wait as no mapping was
> >>> + * performed when the previous status was
> >>> + * priv->revoked == true.
> >>> + */
> >>> + reinit_completion(&priv->comp);
> >>> + }
> >>> }
> >>> fput(priv->dmabuf->file);
> >>
> >> This is also extremely questionable. Why doesn't the dmabuf have a reference while on the linked list?
> >>
> >> Regards,
> >> Christian.
> >>
> >>> }
> >>> @@ -346,6 +402,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> >>>
> >>> down_write(&vdev->memory_lock);
> >>> list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> >>> + unsigned long wait;
> >>> +
> >>> if (!get_file_active(&priv->dmabuf->file))
> >>> continue;
> >>>
> >>> @@ -354,7 +412,14 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> >>> priv->vdev = NULL;
> >>> priv->revoked = true;
> >>> dma_buf_invalidate_mappings(priv->dmabuf);
> >>> + dma_resv_wait_timeout(priv->dmabuf->resv,
> >>> + DMA_RESV_USAGE_BOOKKEEP, false,
> >>> + MAX_SCHEDULE_TIMEOUT);
> >>> dma_resv_unlock(priv->dmabuf->resv);
> >>> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> >>> + wait = wait_for_completion_timeout(&priv->comp,
> >>> + secs_to_jiffies(1));
> >>> + WARN_ON(!wait);
> >>> vfio_device_put_registration(&vdev->vdev);
> >>> fput(priv->dmabuf->file);
> >>> }
> >>>
> >>
> >>
>
>
On Sat, Jan 24, 2026 at 09:14:16PM +0200, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> dma-buf invalidation is handled asynchronously by the hardware, so VFIO
> must wait until all affected objects have been fully invalidated.
>
> In addition, the dma-buf exporter is expecting that all importers unmap any
> buffers they previously mapped.
>
> Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions")
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
> drivers/vfio/pci/vfio_pci_dmabuf.c | 71 ++++++++++++++++++++++++++++++++++++--
> 1 file changed, 68 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index d8ceafabef48..485515629fe4 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -17,6 +17,8 @@ struct vfio_pci_dma_buf {
> struct dma_buf_phys_vec *phys_vec;
> struct p2pdma_provider *provider;
> u32 nr_ranges;
> + struct kref kref;
> + struct completion comp;
> u8 revoked : 1;
> };
>
> @@ -44,27 +46,46 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> return 0;
> }
>
> +static void vfio_pci_dma_buf_done(struct kref *kref)
> +{
> + struct vfio_pci_dma_buf *priv =
> + container_of(kref, struct vfio_pci_dma_buf, kref);
> +
> + complete(&priv->comp);
> +}
> +
> static struct sg_table *
> vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> enum dma_data_direction dir)
> {
> struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> + struct sg_table *ret;
>
> dma_resv_assert_held(priv->dmabuf->resv);
>
> if (priv->revoked)
> return ERR_PTR(-ENODEV);
>
> - return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> - priv->phys_vec, priv->nr_ranges,
> - priv->size, dir);
> + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider,
> + priv->phys_vec, priv->nr_ranges,
> + priv->size, dir);
> + if (IS_ERR(ret))
> + return ret;
> +
> + kref_get(&priv->kref);
> + return ret;
> }
>
> static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
> struct sg_table *sgt,
> enum dma_data_direction dir)
> {
> + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +
> + dma_resv_assert_held(priv->dmabuf->resv);
> +
> dma_buf_free_sgt(attachment, sgt, dir);
> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> }
>
> static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> @@ -287,6 +308,9 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
> goto err_dev_put;
> }
>
> + kref_init(&priv->kref);
> + init_completion(&priv->comp);
> +
> /* dma_buf_put() now frees priv */
> INIT_LIST_HEAD(&priv->dmabufs_elm);
> down_write(&vdev->memory_lock);
> @@ -326,6 +350,8 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> lockdep_assert_held_write(&vdev->memory_lock);
>
> list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> + unsigned long wait;
> +
> if (!get_file_active(&priv->dmabuf->file))
> continue;
>
> @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> dma_resv_lock(priv->dmabuf->resv, NULL);
> priv->revoked = revoked;
> dma_buf_invalidate_mappings(priv->dmabuf);
> + dma_resv_wait_timeout(priv->dmabuf->resv,
> + DMA_RESV_USAGE_BOOKKEEP, false,
> + MAX_SCHEDULE_TIMEOUT);
> dma_resv_unlock(priv->dmabuf->resv);
> + if (revoked) {
> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> + /* Let's wait till all DMA unmap are completed. */
> + wait = wait_for_completion_timeout(
> + &priv->comp, secs_to_jiffies(1));
Is the 1-second constant sufficient for all hardware, or should the
invalidate_mappings() contract require the callback to block until
speculative reads are strictly fenced? I'm wondering about a case where
a device's firmware has a high response latency, perhaps due to internal
management tasks like error recovery or thermal handling, and exceeds
the 1s timeout.
If the device is in the middle of a large DMA burst and the firmware is
slow to flush the internal pipelines to a fully "quiesced"
read-and-discard state, reclaiming the memory at exactly 1.001 seconds
risks triggering platform-level faults.
Since we explicitly permit these speculative reads until unmap is
complete, relying on a hardcoded timeout in the exporter seems to
introduce a hardware-dependent race condition that could compromise
system stability via IOMMU errors or AER faults.
Should the importer instead be required to guarantee that all
speculative access has ceased before the invalidation call returns?
Thanks
Praan
> + /*
> + * If you see this WARN_ON, it means that
> + * importer didn't call unmap in response to
> + * dma_buf_invalidate_mappings() which is not
> + * allowed.
> + */
> + WARN(!wait,
> + "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
> + } else {
> + /*
> + * Kref is initialize again, because when revoke
> + * was performed the reference counter was decreased
> + * to zero to trigger completion.
> + */
> + kref_init(&priv->kref);
> + /*
> + * There is no need to wait as no mapping was
> + * performed when the previous status was
> + * priv->revoked == true.
> + */
> + reinit_completion(&priv->comp);
> + }
> }
> fput(priv->dmabuf->file);
> }
> @@ -346,6 +402,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
>
> down_write(&vdev->memory_lock);
> list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> + unsigned long wait;
> +
> if (!get_file_active(&priv->dmabuf->file))
> continue;
>
> @@ -354,7 +412,14 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> priv->vdev = NULL;
> priv->revoked = true;
> dma_buf_invalidate_mappings(priv->dmabuf);
> + dma_resv_wait_timeout(priv->dmabuf->resv,
> + DMA_RESV_USAGE_BOOKKEEP, false,
> + MAX_SCHEDULE_TIMEOUT);
> dma_resv_unlock(priv->dmabuf->resv);
> + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> + wait = wait_for_completion_timeout(&priv->comp,
> + secs_to_jiffies(1));
> + WARN_ON(!wait);
> vfio_device_put_registration(&vdev->vdev);
> fput(priv->dmabuf->file);
> }
>
> --
> 2.52.0
>
>
On Mon, Jan 26, 2026 at 08:53:57PM +0000, Pranjal Shrivastava wrote:
> On Sat, Jan 24, 2026 at 09:14:16PM +0200, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > dma-buf invalidation is handled asynchronously by the hardware, so VFIO
> > must wait until all affected objects have been fully invalidated.
> >
> > In addition, the dma-buf exporter is expecting that all importers unmap any
> > buffers they previously mapped.
> >
> > Fixes: 5d74781ebc86 ("vfio/pci: Add dma-buf export support for MMIO regions")
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> > drivers/vfio/pci/vfio_pci_dmabuf.c | 71 ++++++++++++++++++++++++++++++++++++--
> > 1 file changed, 68 insertions(+), 3 deletions(-)
<...>
> > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > dma_resv_lock(priv->dmabuf->resv, NULL);
> > priv->revoked = revoked;
> > dma_buf_invalidate_mappings(priv->dmabuf);
> > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > + DMA_RESV_USAGE_BOOKKEEP, false,
> > + MAX_SCHEDULE_TIMEOUT);
> > dma_resv_unlock(priv->dmabuf->resv);
> > + if (revoked) {
> > + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> > + /* Let's wait till all DMA unmap are completed. */
> > + wait = wait_for_completion_timeout(
> > + &priv->comp, secs_to_jiffies(1));
>
> Is the 1-second constant sufficient for all hardware, or should the
> invalidate_mappings() contract require the callback to block until
> speculative reads are strictly fenced? I'm wondering about a case where
> a device's firmware has a high response latency, perhaps due to internal
> management tasks like error recovery or thermal and it exceeds the 1s
> timeout.
>
> If the device is in the middle of a large DMA burst and the firmware is
> slow to flush the internal pipelines to a fully "quiesced"
> read-and-discard state, reclaiming the memory at exactly 1.001 seconds
> risks triggering platform-level faults..
>
> Since the wen explicitly permit these speculative reads until unmap is
> complete, relying on a hardcoded timeout in the exporter seems to
> introduce a hardware-dependent race condition that could compromise
> system stability via IOMMU errors or AER faults.
>
> Should the importer instead be required to guarantee that all
> speculative access has ceased before the invalidation call returns?
It is guaranteed by the dma_resv_wait_timeout() call above. That call ensures
that the hardware has completed all pending operations. The 1‑second delay is
meant to catch cases where an in-kernel DMA unmap call is missing, which should
not trigger any DMA activity at that point.
So yes, one second is more than sufficient.
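To make that concrete, an importer whose teardown is asynchronous is expected
to publish a fence on the shared reservation object so the BOOKKEEP wait above
covers it. A sketch only; example_hw_begin_teardown() is hypothetical and the
callback shape is assumed:

static void example_importer_invalidate(struct dma_buf_attachment *attach)
{
	struct dma_fence *fence;

	dma_resv_assert_held(attach->dmabuf->resv);

	/*
	 * example_hw_begin_teardown() is hypothetical: it kicks off the
	 * asynchronous device-side unmap and returns a fence that signals
	 * once the hardware has quiesced.
	 */
	fence = example_hw_begin_teardown(attach);

	/*
	 * Publish the fence on the reservation object so the exporter's
	 * dma_resv_wait_timeout(..., DMA_RESV_USAGE_BOOKKEEP, ...) does
	 * not return before the hardware is done.
	 */
	if (dma_resv_reserve_fences(attach->dmabuf->resv, 1) == 0)
		dma_resv_add_fence(attach->dmabuf->resv, fence,
				   DMA_RESV_USAGE_BOOKKEEP);
	dma_fence_put(fence);
}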
Thanks
>
> Thanks
> Praan
>
> > + /*
> > + * If you see this WARN_ON, it means that
> > + * importer didn't call unmap in response to
> > + * dma_buf_invalidate_mappings() which is not
> > + * allowed.
> > + */
> > + WARN(!wait,
> > + "Timed out waiting for DMABUF unmap, importer has a broken invalidate_mapping()");
> > + } else {
> > + /*
> > + * Kref is initialize again, because when revoke
> > + * was performed the reference counter was decreased
> > + * to zero to trigger completion.
> > + */
> > + kref_init(&priv->kref);
> > + /*
> > + * There is no need to wait as no mapping was
> > + * performed when the previous status was
> > + * priv->revoked == true.
> > + */
> > + reinit_completion(&priv->comp);
> > + }
> > }
> > fput(priv->dmabuf->file);
> > }
> > @@ -346,6 +402,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> >
> > down_write(&vdev->memory_lock);
> > list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> > + unsigned long wait;
> > +
> > if (!get_file_active(&priv->dmabuf->file))
> > continue;
> >
> > @@ -354,7 +412,14 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> > priv->vdev = NULL;
> > priv->revoked = true;
> > dma_buf_invalidate_mappings(priv->dmabuf);
> > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > + DMA_RESV_USAGE_BOOKKEEP, false,
> > + MAX_SCHEDULE_TIMEOUT);
> > dma_resv_unlock(priv->dmabuf->resv);
> > + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> > + wait = wait_for_completion_timeout(&priv->comp,
> > + secs_to_jiffies(1));
> > + WARN_ON(!wait);
> > vfio_device_put_registration(&vdev->vdev);
> > fput(priv->dmabuf->file);
> > }
> >
> > --
> > 2.52.0
> >
> >
>
On Tue, Jan 27, 2026 at 10:58:35AM +0200, Leon Romanovsky wrote:
> > > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> > > dma_resv_lock(priv->dmabuf->resv, NULL);
> > > priv->revoked = revoked;
> > > dma_buf_invalidate_mappings(priv->dmabuf);
> > > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > > + DMA_RESV_USAGE_BOOKKEEP, false,
> > > + MAX_SCHEDULE_TIMEOUT);
> > > dma_resv_unlock(priv->dmabuf->resv);
> > > + if (revoked) {
> > > + kref_put(&priv->kref, vfio_pci_dma_buf_done);
> > > + /* Let's wait till all DMA unmap are completed. */
> > > + wait = wait_for_completion_timeout(
> > > + &priv->comp, secs_to_jiffies(1));
> >
> > Is the 1-second constant sufficient for all hardware, or should the
> > invalidate_mappings() contract require the callback to block until
> > speculative reads are strictly fenced? I'm wondering about a case where
> > a device's firmware has a high response latency, perhaps due to internal
> > management tasks like error recovery or thermal and it exceeds the 1s
> > timeout.
> >
> > If the device is in the middle of a large DMA burst and the firmware is
> > slow to flush the internal pipelines to a fully "quiesced"
> > read-and-discard state, reclaiming the memory at exactly 1.001 seconds
> > risks triggering platform-level faults..
> >
> > Since the wen explicitly permit these speculative reads until unmap is
> > complete, relying on a hardcoded timeout in the exporter seems to
> > introduce a hardware-dependent race condition that could compromise
> > system stability via IOMMU errors or AER faults.
> >
> > Should the importer instead be required to guarantee that all
> > speculative access has ceased before the invalidation call returns?
>
> It is guaranteed by the dma_resv_wait_timeout() call above. That call ensures
> that the hardware has completed all pending operations. The 1‑second delay is
> meant to catch cases where an in-kernel DMA unmap call is missing, which should
> not trigger any DMA activity at that point.
Christian may know actual examples, but my general feeling is he was
worrying about drivers that have pushed the DMABUF to visibility on
the GPU and the move notify & fences only shoot down some access. So
it has to wait until the DMABUF is finally unmapped.
Pranjal's example should be covered by the driver adding a fence and
then the unbounded fence wait will complete it.
I think the open question here is whether drivers that can't rely on their
fences should return dma_buf_attach_revocable() = false? It depends
on how long they will leave the buffers mapped, and whether that is "bounded
time".
The converse is that we want to detect bugs where drivers have wrongly set
dma_buf_attach_revocable() = true and this turns into an infinite
sleep, so the logging is necessary, IMHO.
At worst the code should sleep 1s, print, then keep sleeping.
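Something along these lines, reusing the kref/completion from the patch (a
sketch only; the warning text is made up):
	wait = wait_for_completion_timeout(&priv->comp, secs_to_jiffies(1));
	if (!wait) {
		/* Complain, but don't reclaim the MMIO underneath the importer. */
		dev_warn(&vdev->pdev->dev,
			 "importer has not unmapped the revoked DMABUF, still waiting\n");
		wait_for_completion(&priv->comp);
	}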
Jason
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Wednesday, January 28, 2026 12:28 AM
>
> On Tue, Jan 27, 2026 at 10:58:35AM +0200, Leon Romanovsky wrote:
> > > > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct
> vfio_pci_core_device *vdev, bool revoked)
> > > > dma_resv_lock(priv->dmabuf->resv, NULL);
> > > > priv->revoked = revoked;
> > > > dma_buf_invalidate_mappings(priv->dmabuf);
> > > > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > > > + DMA_RESV_USAGE_BOOKKEEP,
> false,
> > > > + MAX_SCHEDULE_TIMEOUT);
> > > > dma_resv_unlock(priv->dmabuf->resv);
> > > > + if (revoked) {
> > > > + kref_put(&priv->kref,
> vfio_pci_dma_buf_done);
> > > > + /* Let's wait till all DMA unmap are
> completed. */
> > > > + wait = wait_for_completion_timeout(
> > > > + &priv->comp, secs_to_jiffies(1));
> > >
> > > Is the 1-second constant sufficient for all hardware, or should the
> > > invalidate_mappings() contract require the callback to block until
> > > speculative reads are strictly fenced? I'm wondering about a case where
> > > a device's firmware has a high response latency, perhaps due to internal
> > > management tasks like error recovery or thermal and it exceeds the 1s
> > > timeout.
> > >
> > > If the device is in the middle of a large DMA burst and the firmware is
> > > slow to flush the internal pipelines to a fully "quiesced"
> > > read-and-discard state, reclaiming the memory at exactly 1.001 seconds
> > > risks triggering platform-level faults..
> > >
> > > Since the wen explicitly permit these speculative reads until unmap is
> > > complete, relying on a hardcoded timeout in the exporter seems to
> > > introduce a hardware-dependent race condition that could compromise
> > > system stability via IOMMU errors or AER faults.
> > >
> > > Should the importer instead be required to guarantee that all
> > > speculative access has ceased before the invalidation call returns?
> >
> > It is guaranteed by the dma_resv_wait_timeout() call above. That call
> ensures
> > that the hardware has completed all pending operations. The 1‑second
> delay is
> > meant to catch cases where an in-kernel DMA unmap call is missing, which
> should
> > not trigger any DMA activity at that point.
>
> Christian may know actual examples, but my general feeling is he was
> worrying about drivers that have pushed the DMABUF to visibility on
> the GPU and the move notify & fences only shoot down some access. So
> it has to wait until the DMABUF is finally unmapped.
>
> Pranjal's example should be covered by the driver adding a fence and
> then the unbounded fence wait will complete it.
>
Bear with me if it's an ignorant question.
The commit msg of patch 6 says that VFIO doesn't tolerate an unbounded
wait, which is the reason behind the 2nd timeout wait here.
Then why is "the unbounded fence wait" not a problem in the same
code path? The use of MAX_SCHEDULE_TIMEOUT implies a worst-case
timeout of hundreds of years...
And it'd be helpful to put some words in the code based on what's
discussed here.
On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> Bear me if it's an ignorant question.
>
> The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> wait, which is the reason behind the 2nd timeout wait here.
As far as I understand dmabuf design a fence wait should complete
eventually under kernel control, because these sleeps are
sprinkled all around the kernel today.
I suspect that is not actually true for every HW, probably something
like "shader programs can run forever technically".
We can argue if those cases should not report revocable either, but at
least this will work "correctly" even if it takes a huge amount of
time.
I wouldn't mind seeing a shorter timeout and print on the fence too
just in case.
Jason
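(For reference, bounding the fence wait the same way could look roughly like
the snippet below -- an illustrative sketch only; the 10-second period is an
arbitrary choice and the message text is made up:)
	while (dma_resv_wait_timeout(priv->dmabuf->resv,
				     DMA_RESV_USAGE_BOOKKEEP, false,
				     secs_to_jiffies(10)) == 0)
		dev_warn(&vdev->pdev->dev,
			 "still waiting on DMA fences before revoking the DMABUF\n");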
On 1/29/26 15:58, Jason Gunthorpe wrote:
> On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
>> Bear me if it's an ignorant question.
>>
>> The commit msg of patch6 says that VFIO doesn't tolerate unbounded
>> wait, which is the reason behind the 2nd timeout wait here.
>
> As far as I understand dmabuf design a fence wait should complete
> eventually under kernel control, because these sleeps are
> sprinkled all around the kernel today.
Well it's a bit different, but we indeed guarantee that dma_fences
complete in finite time.
> I suspect that is not actually true for every HW, probably something
> like "shader programs can run forever technically".
Nope, stuff like that is strictly forbidden.
Regards,
Christian.
>
> We can argue if those cases should not report revocable either, but at
> least this will work "correctly" even if it takes a huge amount of
> time.
>
> I wouldn't mind seeing a shorter timeout and print on the fence too
> just in case.
>
> Jason
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Thursday, January 29, 2026 10:59 PM
>
> On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> > Bear me if it's an ignorant question.
> >
> > The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> > wait, which is the reason behind the 2nd timeout wait here.
>
> As far as I understand dmabuf design a fence wait should complete
> eventually under kernel control, because these sleeps are
> sprinkled all around the kernel today.
>
> I suspect that is not actually true for every HW, probably something
> like "shader programs can run forever technically".
>
> We can argue if those cases should not report revocable either, but at
> least this will work "correctly" even if it takes a huge amount of
> time.
good to know those background.
>
> I wouldn't mind seeing a shorter timeout and print on the fence too
> just in case.
>
either way is OK. It's not difficult to figure out a long wait anyway. 😊
On Fri, 30 Jan 2026 03:12:02 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Thursday, January 29, 2026 10:59 PM
> >
> > On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> > > Bear me if it's an ignorant question.
> > >
> > > The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> > > wait, which is the reason behind the 2nd timeout wait here.
> >
> > As far as I understand dmabuf design a fence wait should complete
> > eventually under kernel control, because these sleeps are
> > sprinkled all around the kernel today.
> >
> > I suspect that is not actually true for every HW, probably something
> > like "shader programs can run forever technically".
> >
> > We can argue if those cases should not report revocable either, but at
> > least this will work "correctly" even if it takes a huge amount of
> > time.
>
> good to know those background.
>
> > I wouldn't mind seeing a shorter timeout and print on the fence too
> > just in case.
> >
>
> either way is OK. It's not difficult to figure out a long wait anyway. 😊
Please don't use Outlook when answering to patches - or ensure that
it is properly patched to only send plain text - which I don't
think it is possible.
If you look on this message source code, it is not in plain text:
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: base64
Your message content is:
[base64-encoded message body snipped]
which is something that patch tools - in special patchwork - won't handle.
Thanks,
Mauro
> From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
> Sent: Friday, January 30, 2026 1:43 PM
>
> On Fri, 30 Jan 2026 03:12:02 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
>
> > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > Sent: Thursday, January 29, 2026 10:59 PM
> > >
> > > On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> > > > Bear me if it's an ignorant question.
> > > >
> > > > The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> > > > wait, which is the reason behind the 2nd timeout wait here.
> > >
> > > As far as I understand dmabuf design a fence wait should complete
> > > eventually under kernel control, because these sleeps are
> > > sprinkled all around the kernel today.
> > >
> > > I suspect that is not actually true for every HW, probably something
> > > like "shader programs can run forever technically".
> > >
> > > We can argue if those cases should not report revocable either, but at
> > > least this will work "correctly" even if it takes a huge amount of
> > > time.
> >
> > good to know those background.
> >
> > > I wouldn't mind seeing a shorter timeout and print on the fence too
> > > just in case.
> > >
> >
> > either way is OK. It's not difficult to figure out a long wait anyway. 😊
>
> Please don't use Outlook when answering to patches - or ensure that
> it is properly patched to only send plain text - which I don't
> think it is possible.
>
> If you look on this message source code, it is not in plain text:
>
> Content-Type: text/plain; charset="utf-8"
> Content-Transfer-Encoding: base64
it's likely caused by the trailing smile icon. I'll pay attention to it.
>
> Your message content is:
>
> [base64-encoded message body snipped]
>
> which is something that patch tools - in special patchwork - won't handle.
>
> Thanks,
> Mauro
On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <jgg@ziepe.ca>
> > Sent: Wednesday, January 28, 2026 12:28 AM
> >
> > On Tue, Jan 27, 2026 at 10:58:35AM +0200, Leon Romanovsky wrote:
> > > > > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct
> > vfio_pci_core_device *vdev, bool revoked)
> > > > > dma_resv_lock(priv->dmabuf->resv, NULL);
> > > > > priv->revoked = revoked;
> > > > > dma_buf_invalidate_mappings(priv->dmabuf);
> > > > > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > > > > + DMA_RESV_USAGE_BOOKKEEP,
> > false,
> > > > > + MAX_SCHEDULE_TIMEOUT);
> > > > > dma_resv_unlock(priv->dmabuf->resv);
> > > > > + if (revoked) {
> > > > > + kref_put(&priv->kref,
> > vfio_pci_dma_buf_done);
> > > > > + /* Let's wait till all DMA unmap are
> > completed. */
> > > > > + wait = wait_for_completion_timeout(
> > > > > + &priv->comp, secs_to_jiffies(1));
> > > >
> > > > Is the 1-second constant sufficient for all hardware, or should the
> > > > invalidate_mappings() contract require the callback to block until
> > > > speculative reads are strictly fenced? I'm wondering about a case where
> > > > a device's firmware has a high response latency, perhaps due to internal
> > > > management tasks like error recovery or thermal and it exceeds the 1s
> > > > timeout.
> > > >
> > > > If the device is in the middle of a large DMA burst and the firmware is
> > > > slow to flush the internal pipelines to a fully "quiesced"
> > > > read-and-discard state, reclaiming the memory at exactly 1.001 seconds
> > > > risks triggering platform-level faults..
> > > >
> > > > Since the wen explicitly permit these speculative reads until unmap is
> > > > complete, relying on a hardcoded timeout in the exporter seems to
> > > > introduce a hardware-dependent race condition that could compromise
> > > > system stability via IOMMU errors or AER faults.
> > > >
> > > > Should the importer instead be required to guarantee that all
> > > > speculative access has ceased before the invalidation call returns?
> > >
> > > It is guaranteed by the dma_resv_wait_timeout() call above. That call
> > ensures
> > > that the hardware has completed all pending operations. The 1‑second
> > delay is
> > > meant to catch cases where an in-kernel DMA unmap call is missing, which
> > should
> > > not trigger any DMA activity at that point.
> >
> > Christian may know actual examples, but my general feeling is he was
> > worrying about drivers that have pushed the DMABUF to visibility on
> > the GPU and the move notify & fences only shoot down some access. So
> > it has to wait until the DMABUF is finally unmapped.
> >
> > Pranjal's example should be covered by the driver adding a fence and
> > then the unbounded fence wait will complete it.
> >
>
> Bear me if it's an ignorant question.
>
> The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> wait, which is the reason behind the 2nd timeout wait here.
It is not accurate. The second timeout is present both in the
description of patch 6 and in the VFIO implementation. The difference is
that the timeout is enforced within VFIO.
>
> Then why is "the unbounded fence wait" not a problem in the same
> code path? the use of MAX_SCHEDULE_TIMEOUT imply a worst-case
> timeout in hundreds of years...
"An unbounded fence wait" is a different class of wait. It indicates broken
hardware that continues to issue DMA transactions even after it has been told to
stop.
The second wait exists to catch software bugs or misuse, where the dma-buf
importer has misrepresented its capabilities.
>
> and it'd be helpful to put some words in the code based on what's
> discussed here.
We've documented as much as we can in dma_buf_attach_revocable() and
dma_buf_invalidate_mappings(). Do you have any suggestions on what else
should be added here?
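For the record, the importer side of that contract is expected to look
roughly like the following move_notify-style callback (a rough sketch; the
names my_importer, my_importer_stop_dma and quiesce_fence are assumptions for
illustration and are not taken from any existing driver):
	struct my_importer {
		struct sg_table *sgt;
		enum dma_data_direction dir;
		struct dma_fence *quiesce_fence;
	};

	static void my_importer_move_notify(struct dma_buf_attachment *attach)
	{
		struct my_importer *imp = attach->importer_priv;

		dma_resv_assert_held(attach->dmabuf->resv);
		/* Stop issuing new DMA and fence whatever is already in flight. */
		my_importer_stop_dma(imp);
		dma_resv_add_fence(attach->dmabuf->resv, imp->quiesce_fence,
				   DMA_RESV_USAGE_BOOKKEEP);
		/*
		 * Unmap promptly (here or from a worker) so the exporter's
		 * completion fires well inside the 1s window.
		 */
		dma_buf_unmap_attachment(attach, imp->sgt, imp->dir);
		imp->sgt = NULL;
	}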
Thanks
> From: Leon Romanovsky <leon@kernel.org>
> Sent: Thursday, January 29, 2026 3:34 PM
>
> On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > Sent: Wednesday, January 28, 2026 12:28 AM
> > >
> > > On Tue, Jan 27, 2026 at 10:58:35AM +0200, Leon Romanovsky wrote:
> > > > > > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct
> > > vfio_pci_core_device *vdev, bool revoked)
> > > > > > dma_resv_lock(priv->dmabuf->resv, NULL);
> > > > > > priv->revoked = revoked;
> > > > > > dma_buf_invalidate_mappings(priv-
> >dmabuf);
> > > > > > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > > > > > +
> DMA_RESV_USAGE_BOOKKEEP,
> > > false,
> > > > > > +
> MAX_SCHEDULE_TIMEOUT);
> > > > > > dma_resv_unlock(priv->dmabuf->resv);
> > > > > > + if (revoked) {
> > > > > > + kref_put(&priv->kref,
> > > vfio_pci_dma_buf_done);
> > > > > > + /* Let's wait till all DMA unmap are
> > > completed. */
> > > > > > + wait = wait_for_completion_timeout(
> > > > > > + &priv->comp,
> secs_to_jiffies(1));
> > > > >
> > > > > Is the 1-second constant sufficient for all hardware, or should the
> > > > > invalidate_mappings() contract require the callback to block until
> > > > > speculative reads are strictly fenced? I'm wondering about a case
> where
> > > > > a device's firmware has a high response latency, perhaps due to
> internal
> > > > > management tasks like error recovery or thermal and it exceeds the
> 1s
> > > > > timeout.
> > > > >
> > > > > If the device is in the middle of a large DMA burst and the firmware is
> > > > > slow to flush the internal pipelines to a fully "quiesced"
> > > > > read-and-discard state, reclaiming the memory at exactly 1.001
> seconds
> > > > > risks triggering platform-level faults..
> > > > >
> > > > > Since the wen explicitly permit these speculative reads until unmap is
> > > > > complete, relying on a hardcoded timeout in the exporter seems to
> > > > > introduce a hardware-dependent race condition that could
> compromise
> > > > > system stability via IOMMU errors or AER faults.
> > > > >
> > > > > Should the importer instead be required to guarantee that all
> > > > > speculative access has ceased before the invalidation call returns?
> > > >
> > > > It is guaranteed by the dma_resv_wait_timeout() call above. That call
> > > ensures
> > > > that the hardware has completed all pending operations. The 1‑second
> > > delay is
> > > > meant to catch cases where an in-kernel DMA unmap call is missing,
> which
> > > should
> > > > not trigger any DMA activity at that point.
> > >
> > > Christian may know actual examples, but my general feeling is he was
> > > worrying about drivers that have pushed the DMABUF to visibility on
> > > the GPU and the move notify & fences only shoot down some access. So
> > > it has to wait until the DMABUF is finally unmapped.
> > >
> > > Pranjal's example should be covered by the driver adding a fence and
> > > then the unbounded fence wait will complete it.
> > >
> >
> > Bear me if it's an ignorant question.
> >
> > The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> > wait, which is the reason behind the 2nd timeout wait here.
>
> It is not accurate. A second timeout is present both in the
> description of patch 6 and in VFIO implementation. The difference is
> that the timeout is enforced within VFIO.
>
> >
> > Then why is "the unbounded fence wait" not a problem in the same
> > code path? the use of MAX_SCHEDULE_TIMEOUT imply a worst-case
> > timeout in hundreds of years...
>
> "An unbounded fence wait" is a different class of wait. It indicates broken
> hardware that continues to issue DMA transactions even after it has been
> told to
> stop.
>
> The second wait exists to catch software bugs or misuse, where the dma-buf
> importer has misrepresented its capabilities.
>
Okay I see.
> >
> > and it'd be helpful to put some words in the code based on what's
> > discussed here.
>
> We've documented as much as we can in dma_buf_attach_revocable() and
> dma_buf_invalidate_mappings(). Do you have any suggestions on what else
> should be added here?
>
the selection of 1s?
then,
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
On Thu, Jan 29, 2026 at 08:13:18AM +0000, Tian, Kevin wrote:
> > From: Leon Romanovsky <leon@kernel.org>
> > Sent: Thursday, January 29, 2026 3:34 PM
> >
> > On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> > > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > > Sent: Wednesday, January 28, 2026 12:28 AM
> > > >
> > > > On Tue, Jan 27, 2026 at 10:58:35AM +0200, Leon Romanovsky wrote:
> > > > > > > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct
> > > > vfio_pci_core_device *vdev, bool revoked)
> > > > > > > dma_resv_lock(priv->dmabuf->resv, NULL);
> > > > > > > priv->revoked = revoked;
> > > > > > > dma_buf_invalidate_mappings(priv-
> > >dmabuf);
> > > > > > > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > > > > > > +
> > DMA_RESV_USAGE_BOOKKEEP,
> > > > false,
> > > > > > > +
> > MAX_SCHEDULE_TIMEOUT);
> > > > > > > dma_resv_unlock(priv->dmabuf->resv);
> > > > > > > + if (revoked) {
> > > > > > > + kref_put(&priv->kref,
> > > > vfio_pci_dma_buf_done);
> > > > > > > + /* Let's wait till all DMA unmap are
> > > > completed. */
> > > > > > > + wait = wait_for_completion_timeout(
> > > > > > > + &priv->comp,
> > secs_to_jiffies(1));
> > > > > >
> > > > > > Is the 1-second constant sufficient for all hardware, or should the
> > > > > > invalidate_mappings() contract require the callback to block until
> > > > > > speculative reads are strictly fenced? I'm wondering about a case
> > where
> > > > > > a device's firmware has a high response latency, perhaps due to
> > internal
> > > > > > management tasks like error recovery or thermal and it exceeds the
> > 1s
> > > > > > timeout.
> > > > > >
> > > > > > If the device is in the middle of a large DMA burst and the firmware is
> > > > > > slow to flush the internal pipelines to a fully "quiesced"
> > > > > > read-and-discard state, reclaiming the memory at exactly 1.001
> > seconds
> > > > > > risks triggering platform-level faults..
> > > > > >
> > > > > > Since the wen explicitly permit these speculative reads until unmap is
> > > > > > complete, relying on a hardcoded timeout in the exporter seems to
> > > > > > introduce a hardware-dependent race condition that could
> > compromise
> > > > > > system stability via IOMMU errors or AER faults.
> > > > > >
> > > > > > Should the importer instead be required to guarantee that all
> > > > > > speculative access has ceased before the invalidation call returns?
> > > > >
> > > > > It is guaranteed by the dma_resv_wait_timeout() call above. That call
> > > > ensures
> > > > > that the hardware has completed all pending operations. The 1‑second
> > > > delay is
> > > > > meant to catch cases where an in-kernel DMA unmap call is missing,
> > which
> > > > should
> > > > > not trigger any DMA activity at that point.
> > > >
> > > > Christian may know actual examples, but my general feeling is he was
> > > > worrying about drivers that have pushed the DMABUF to visibility on
> > > > the GPU and the move notify & fences only shoot down some access. So
> > > > it has to wait until the DMABUF is finally unmapped.
> > > >
> > > > Pranjal's example should be covered by the driver adding a fence and
> > > > then the unbounded fence wait will complete it.
> > > >
> > >
> > > Bear me if it's an ignorant question.
> > >
> > > The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> > > wait, which is the reason behind the 2nd timeout wait here.
> >
> > It is not accurate. A second timeout is present both in the
> > description of patch 6 and in VFIO implementation. The difference is
> > that the timeout is enforced within VFIO.
> >
> > >
> > > Then why is "the unbounded fence wait" not a problem in the same
> > > code path? the use of MAX_SCHEDULE_TIMEOUT imply a worst-case
> > > timeout in hundreds of years...
> >
> > "An unbounded fence wait" is a different class of wait. It indicates broken
> > hardware that continues to issue DMA transactions even after it has been
> > told to
> > stop.
> >
> > The second wait exists to catch software bugs or misuse, where the dma-buf
> > importer has misrepresented its capabilities.
> >
>
> Okay I see.
>
> > >
> > > and it'd be helpful to put some words in the code based on what's
> > > discussed here.
> >
> > We've documented as much as we can in dma_buf_attach_revocable() and
> > dma_buf_invalidate_mappings(). Do you have any suggestions on what else
> > should be added here?
> >
>
> the selection of 1s?
It is indirectly written in the description of the WARN_ON(), but let's add
more. What about the following?
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index 93795ad2e025..948ba75288c6 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -357,7 +357,13 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
dma_resv_unlock(priv->dmabuf->resv);
if (revoked) {
kref_put(&priv->kref, vfio_pci_dma_buf_done);
- /* Let's wait till all DMA unmap are completed. */
+ /*
+ * Let's wait for 1 second till all DMA unmap
+ * are completed. It is supposed to catch dma-buf
+ * importers which lied about their support
+ * of dmabuf revoke. See dma_buf_invalidate_mappings()
+ * for the expected behaviour,
+ */
wait = wait_for_completion_timeout(
&priv->comp, secs_to_jiffies(1));
/*
>
> then,
>
> Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Thanks
> From: Leon Romanovsky <leon@kernel.org>
> Sent: Thursday, January 29, 2026 4:42 PM
>
> On Thu, Jan 29, 2026 at 08:13:18AM +0000, Tian, Kevin wrote:
> > > From: Leon Romanovsky <leon@kernel.org>
> > > Sent: Thursday, January 29, 2026 3:34 PM
> > >
> > > On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> > > > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > > > Sent: Wednesday, January 28, 2026 12:28 AM
> > > > >
> > > > > On Tue, Jan 27, 2026 at 10:58:35AM +0200, Leon Romanovsky wrote:
> > > > > > > > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct
> > > > > vfio_pci_core_device *vdev, bool revoked)
> > > > > > > > dma_resv_lock(priv->dmabuf->resv, NULL);
> > > > > > > > priv->revoked = revoked;
> > > > > > > > dma_buf_invalidate_mappings(priv-
> > > >dmabuf);
> > > > > > > > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > > > > > > > +
> > > DMA_RESV_USAGE_BOOKKEEP,
> > > > > false,
> > > > > > > > +
> > > MAX_SCHEDULE_TIMEOUT);
> > > > > > > > dma_resv_unlock(priv->dmabuf->resv);
> > > > > > > > + if (revoked) {
> > > > > > > > + kref_put(&priv->kref,
> > > > > vfio_pci_dma_buf_done);
> > > > > > > > + /* Let's wait till all DMA unmap are
> > > > > completed. */
> > > > > > > > + wait = wait_for_completion_timeout(
> > > > > > > > + &priv->comp,
> > > secs_to_jiffies(1));
> > > > > > >
> > > > > > > Is the 1-second constant sufficient for all hardware, or should the
> > > > > > > invalidate_mappings() contract require the callback to block until
> > > > > > > speculative reads are strictly fenced? I'm wondering about a case
> > > where
> > > > > > > a device's firmware has a high response latency, perhaps due to
> > > internal
> > > > > > > management tasks like error recovery or thermal and it exceeds
> the
> > > 1s
> > > > > > > timeout.
> > > > > > >
> > > > > > > If the device is in the middle of a large DMA burst and the
> firmware is
> > > > > > > slow to flush the internal pipelines to a fully "quiesced"
> > > > > > > read-and-discard state, reclaiming the memory at exactly 1.001
> > > seconds
> > > > > > > risks triggering platform-level faults..
> > > > > > >
> > > > > > > Since the wen explicitly permit these speculative reads until
> unmap is
> > > > > > > complete, relying on a hardcoded timeout in the exporter seems
> to
> > > > > > > introduce a hardware-dependent race condition that could
> > > compromise
> > > > > > > system stability via IOMMU errors or AER faults.
> > > > > > >
> > > > > > > Should the importer instead be required to guarantee that all
> > > > > > > speculative access has ceased before the invalidation call returns?
> > > > > >
> > > > > > It is guaranteed by the dma_resv_wait_timeout() call above. That
> call
> > > > > ensures
> > > > > > that the hardware has completed all pending operations. The
> 1‑second
> > > > > delay is
> > > > > > meant to catch cases where an in-kernel DMA unmap call is missing,
> > > which
> > > > > should
> > > > > > not trigger any DMA activity at that point.
> > > > >
> > > > > Christian may know actual examples, but my general feeling is he was
> > > > > worrying about drivers that have pushed the DMABUF to visibility on
> > > > > the GPU and the move notify & fences only shoot down some access.
> So
> > > > > it has to wait until the DMABUF is finally unmapped.
> > > > >
> > > > > Pranjal's example should be covered by the driver adding a fence and
> > > > > then the unbounded fence wait will complete it.
> > > > >
> > > >
> > > > Bear me if it's an ignorant question.
> > > >
> > > > The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> > > > wait, which is the reason behind the 2nd timeout wait here.
> > >
> > > It is not accurate. A second timeout is present both in the
> > > description of patch 6 and in VFIO implementation. The difference is
> > > that the timeout is enforced within VFIO.
> > >
> > > >
> > > > Then why is "the unbounded fence wait" not a problem in the same
> > > > code path? the use of MAX_SCHEDULE_TIMEOUT imply a worst-case
> > > > timeout in hundreds of years...
> > >
> > > "An unbounded fence wait" is a different class of wait. It indicates broken
> > > hardware that continues to issue DMA transactions even after it has been
> > > told to
> > > stop.
> > >
> > > The second wait exists to catch software bugs or misuse, where the dma-
> buf
> > > importer has misrepresented its capabilities.
> > >
> >
> > Okay I see.
> >
> > > >
> > > > and it'd be helpful to put some words in the code based on what's
> > > > discussed here.
> > >
> > > We've documented as much as we can in dma_buf_attach_revocable()
> and
> > > dma_buf_invalidate_mappings(). Do you have any suggestions on what
> else
> > > should be added here?
> > >
> >
> > the selection of 1s?
>
> It is indirectly written in description of WARN_ON(), but let's add
> more. What about the following?
>
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c
> b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index 93795ad2e025..948ba75288c6 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -357,7 +357,13 @@ void vfio_pci_dma_buf_move(struct
> vfio_pci_core_device *vdev, bool revoked)
> dma_resv_unlock(priv->dmabuf->resv);
> if (revoked) {
> kref_put(&priv->kref, vfio_pci_dma_buf_done);
> - /* Let's wait till all DMA unmap are completed. */
> + /*
> + * Let's wait for 1 second till all DMA unmap
> + * are completed. It is supposed to catch dma-buf
> + * importers which lied about their support
> + * of dmabuf revoke. See dma_buf_invalidate_mappings()
> + * for the expected behaviour,
> + */
> wait = wait_for_completion_timeout(
> &priv->comp, secs_to_jiffies(1));
> /*
>
looks good. Just replace the trailing "," with "."
On Thu, 29 Jan 2026 10:41:56 +0200
Leon Romanovsky <leon@kernel.org> wrote:
> On Thu, Jan 29, 2026 at 08:13:18AM +0000, Tian, Kevin wrote:
> > > From: Leon Romanovsky <leon@kernel.org>
> > > Sent: Thursday, January 29, 2026 3:34 PM
> > >
> > > On Thu, Jan 29, 2026 at 07:06:37AM +0000, Tian, Kevin wrote:
> > > > > From: Jason Gunthorpe <jgg@ziepe.ca>
> > > > > Sent: Wednesday, January 28, 2026 12:28 AM
> > > > >
> > > > > On Tue, Jan 27, 2026 at 10:58:35AM +0200, Leon Romanovsky wrote:
> > > > > > > > @@ -333,7 +359,37 @@ void vfio_pci_dma_buf_move(struct
> > > > > vfio_pci_core_device *vdev, bool revoked)
> > > > > > > > dma_resv_lock(priv->dmabuf->resv, NULL);
> > > > > > > > priv->revoked = revoked;
> > > > > > > > dma_buf_invalidate_mappings(priv-
> > > >dmabuf);
> > > > > > > > + dma_resv_wait_timeout(priv->dmabuf->resv,
> > > > > > > > +
> > > DMA_RESV_USAGE_BOOKKEEP,
> > > > > false,
> > > > > > > > +
> > > MAX_SCHEDULE_TIMEOUT);
> > > > > > > > dma_resv_unlock(priv->dmabuf->resv);
> > > > > > > > + if (revoked) {
> > > > > > > > + kref_put(&priv->kref,
> > > > > vfio_pci_dma_buf_done);
> > > > > > > > + /* Let's wait till all DMA unmap are
> > > > > completed. */
> > > > > > > > + wait = wait_for_completion_timeout(
> > > > > > > > + &priv->comp,
> > > secs_to_jiffies(1));
> > > > > > >
> > > > > > > Is the 1-second constant sufficient for all hardware, or should the
> > > > > > > invalidate_mappings() contract require the callback to block until
> > > > > > > speculative reads are strictly fenced? I'm wondering about a case
> > > where
> > > > > > > a device's firmware has a high response latency, perhaps due to
> > > internal
> > > > > > > management tasks like error recovery or thermal and it exceeds the
> > > 1s
> > > > > > > timeout.
> > > > > > >
> > > > > > > If the device is in the middle of a large DMA burst and the firmware is
> > > > > > > slow to flush the internal pipelines to a fully "quiesced"
> > > > > > > read-and-discard state, reclaiming the memory at exactly 1.001
> > > seconds
> > > > > > > risks triggering platform-level faults..
> > > > > > >
> > > > > > > Since the wen explicitly permit these speculative reads until unmap is
> > > > > > > complete, relying on a hardcoded timeout in the exporter seems to
> > > > > > > introduce a hardware-dependent race condition that could
> > > compromise
> > > > > > > system stability via IOMMU errors or AER faults.
> > > > > > >
> > > > > > > Should the importer instead be required to guarantee that all
> > > > > > > speculative access has ceased before the invalidation call returns?
> > > > > >
> > > > > > It is guaranteed by the dma_resv_wait_timeout() call above. That call
> > > > > ensures
> > > > > > that the hardware has completed all pending operations. The 1‑second
> > > > > delay is
> > > > > > meant to catch cases where an in-kernel DMA unmap call is missing,
> > > which
> > > > > should
> > > > > > not trigger any DMA activity at that point.
> > > > >
> > > > > Christian may know actual examples, but my general feeling is he was
> > > > > worrying about drivers that have pushed the DMABUF to visibility on
> > > > > the GPU and the move notify & fences only shoot down some access. So
> > > > > it has to wait until the DMABUF is finally unmapped.
> > > > >
> > > > > Pranjal's example should be covered by the driver adding a fence and
> > > > > then the unbounded fence wait will complete it.
> > > > >
> > > >
> > > > Bear me if it's an ignorant question.
> > > >
> > > > The commit msg of patch6 says that VFIO doesn't tolerate unbounded
> > > > wait, which is the reason behind the 2nd timeout wait here.
> > >
> > > It is not accurate. A second timeout is present both in the
> > > description of patch 6 and in VFIO implementation. The difference is
> > > that the timeout is enforced within VFIO.
> > >
> > > >
> > > > Then why is "the unbounded fence wait" not a problem in the same
> > > > code path? the use of MAX_SCHEDULE_TIMEOUT imply a worst-case
> > > > timeout in hundreds of years...
> > >
> > > "An unbounded fence wait" is a different class of wait. It indicates broken
> > > hardware that continues to issue DMA transactions even after it has been
> > > told to
> > > stop.
> > >
> > > The second wait exists to catch software bugs or misuse, where the dma-buf
> > > importer has misrepresented its capabilities.
> > >
> >
> > Okay I see.
> >
> > > >
> > > > and it'd be helpful to put some words in the code based on what's
> > > > discussed here.
> > >
> > > We've documented as much as we can in dma_buf_attach_revocable() and
> > > dma_buf_invalidate_mappings(). Do you have any suggestions on what else
> > > should be added here?
> > >
> >
> > the selection of 1s?
>
> It is indirectly written in description of WARN_ON(), but let's add
> more. What about the following?
>
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
> index 93795ad2e025..948ba75288c6 100644
> --- a/drivers/vfio/pci/vfio_pci_dmabuf.c
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -357,7 +357,13 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
> dma_resv_unlock(priv->dmabuf->resv);
> if (revoked) {
> kref_put(&priv->kref, vfio_pci_dma_buf_done);
> - /* Let's wait till all DMA unmap are completed. */
> + /*
> + * Let's wait for 1 second till all DMA unmap
> + * are completed. It is supposed to catch dma-buf
> + * importers which lied about their support
> + * of dmabuf revoke. See dma_buf_invalidate_mappings()
> + * for the expected behaviour,
> + */
> wait = wait_for_completion_timeout(
> &priv->comp, secs_to_jiffies(1));
> /*
>
> >
> > then,
> >
> > Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Alex Williamson <alex@shazbot.org>