[PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host vIOMMU events

Shameer Kolothum posted 5 patches 1 month, 1 week ago
Maintainers: Yi Liu <yi.l.liu@intel.com>, Eric Auger <eric.auger@redhat.com>, Zhenzhong Duan <zhenzhong.duan@intel.com>, Peter Maydell <peter.maydell@linaro.org>
[PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host vIOMMU events
Posted by Shameer Kolothum 1 month, 1 week ago
Install an event handler on the vEVENTQ fd to read and propagate host
generated vIOMMU events to the guest.

The handler runs in QEMU's main loop, using a non-blocking fd registered
via qemu_set_fd_handler().

Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
 hw/arm/smmuv3-accel.c | 64 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index f703ea1aac..17306cd04b 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -390,6 +390,50 @@ bool smmuv3_accel_issue_inv_cmd(SMMUv3State *bs, void *cmd, SMMUDevice *sdev,
                    sizeof(Cmd), &entry_num, cmd, errp);
 }
 
+static void smmuv3_accel_event_read(void *opaque)
+{
+    SMMUv3State *s = opaque;
+    IOMMUFDVeventq *veventq = s->s_accel->veventq;
+    struct {
+        struct iommufd_vevent_header hdr;
+        struct iommu_vevent_arm_smmuv3 vevent;
+    } buf;
+    enum iommu_veventq_type type = IOMMU_VEVENTQ_TYPE_ARM_SMMUV3;
+    uint32_t id = veventq->veventq_id;
+    uint32_t last_seq = veventq->last_event_seq;
+    ssize_t bytes;
+
+    bytes = read(veventq->veventq_fd, &buf, sizeof(buf));
+    if (bytes <= 0) {
+        if (errno == EAGAIN || errno == EINTR) {
+            return;
+        }
+        error_report_once("vEVENTQ(type %u id %u): read failed (%m)", type, id);
+        return;
+    }
+
+    if (bytes == sizeof(buf.hdr) &&
+        (buf.hdr.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS)) {
+        error_report_once("vEVENTQ(type %u id %u): overflowed", type, id);
+        veventq->event_start = false;
+        return;
+    }
+    if (bytes < sizeof(buf)) {
+        error_report_once("vEVENTQ(type %u id %u): short read(%zd/%zd bytes)",
+                          type, id, bytes, sizeof(buf));
+        return;
+    }
+
+    /* Check sequence in hdr for lost events if any */
+    if (veventq->event_start && (buf.hdr.sequence - last_seq != 1)) {
+        error_report_once("vEVENTQ(type %u id %u): lost %u event(s)",
+                          type, id, buf.hdr.sequence - last_seq - 1);
+    }
+    veventq->last_event_seq = buf.hdr.sequence;
+    veventq->event_start = true;
+    smmuv3_propagate_event(s, (Evt *)&buf.vevent);
+}
+
 static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
 {
     IOMMUFDVeventq *veventq = accel->veventq;
@@ -397,6 +441,7 @@ static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
     if (!veventq) {
         return;
     }
+    qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
     close(veventq->veventq_fd);
     iommufd_backend_free_id(accel->viommu->iommufd, veventq->veventq_id);
     g_free(veventq);
@@ -424,6 +469,7 @@ bool smmuv3_accel_alloc_veventq(SMMUv3State *s, Error **errp)
     IOMMUFDVeventq *veventq;
     uint32_t veventq_id;
     uint32_t veventq_fd;
+    int flags;
 
     if (!accel || !accel->viommu) {
         return true;
@@ -445,12 +491,30 @@ bool smmuv3_accel_alloc_veventq(SMMUv3State *s, Error **errp)
         return false;
     }
 
+    flags = fcntl(veventq_fd, F_GETFL);
+    if (flags < 0) {
+        error_setg_errno(errp, errno, "Failed to get flags for vEVENTQ fd");
+        goto free_veventq;
+    }
+    if (fcntl(veventq_fd, F_SETFL, flags | O_NONBLOCK) < 0) {
+        error_setg_errno(errp, errno, "Failed to set O_NONBLOCK on vEVENTQ fd");
+        goto free_veventq;
+    }
+
     veventq = g_new0(IOMMUFDVeventq, 1);
     veventq->veventq_id = veventq_id;
     veventq->veventq_fd = veventq_fd;
     veventq->viommu = accel->viommu;
     accel->veventq = veventq;
+
+    /* Set up event handler for veventq fd */
+    qemu_set_fd_handler(veventq_fd, smmuv3_accel_event_read, NULL, s);
     return true;
+
+free_veventq:
+    close(veventq_fd);
+    iommufd_backend_free_id(accel->viommu->iommufd, veventq_id);
+    return false;
 }
 
 static bool
-- 
2.43.0
RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host vIOMMU events
Posted by Duan, Zhenzhong 1 month, 1 week ago
Hi Shameer,

>-----Original Message-----
>From: Shameer Kolothum <skolothumtho@nvidia.com>
>Subject: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>vIOMMU events
>
>Install an event handler on the vEVENTQ fd to read and propagate host
>generated vIOMMU events to the guest.
>
>The handler runs in QEMU's main loop, using a non-blocking fd registered
>via qemu_set_fd_handler().
>
>Tested-by: Nicolin Chen <nicolinc@nvidia.com>
>Reviewed-by: Eric Auger <eric.auger@redhat.com>
>Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
>Tested-by: Eric Auger <eric.auger@redhat.com>
>Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
>Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
>---
> hw/arm/smmuv3-accel.c | 64
>+++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 64 insertions(+)
>
>diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
>index f703ea1aac..17306cd04b 100644
>--- a/hw/arm/smmuv3-accel.c
>+++ b/hw/arm/smmuv3-accel.c
>@@ -390,6 +390,50 @@ bool smmuv3_accel_issue_inv_cmd(SMMUv3State *bs,
>void *cmd, SMMUDevice *sdev,
>                    sizeof(Cmd), &entry_num, cmd, errp);
> }
>
>+static void smmuv3_accel_event_read(void *opaque)
>+{
>+    SMMUv3State *s = opaque;
>+    IOMMUFDVeventq *veventq = s->s_accel->veventq;
>+    struct {
>+        struct iommufd_vevent_header hdr;
>+        struct iommu_vevent_arm_smmuv3 vevent;
>+    } buf;
>+    enum iommu_veventq_type type = IOMMU_VEVENTQ_TYPE_ARM_SMMUV3;
>+    uint32_t id = veventq->veventq_id;
>+    uint32_t last_seq = veventq->last_event_seq;
>+    ssize_t bytes;
>+
>+    bytes = read(veventq->veventq_fd, &buf, sizeof(buf));
>+    if (bytes <= 0) {
>+        if (errno == EAGAIN || errno == EINTR) {
>+            return;
>+        }
>+        error_report_once("vEVENTQ(type %u id %u): read failed (%m)", type, id);
>+        return;
>+    }
>+
>+    if (bytes == sizeof(buf.hdr) &&
>+        (buf.hdr.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS)) {
>+        error_report_once("vEVENTQ(type %u id %u): overflowed", type, id);
>+        veventq->event_start = false;
>+        return;
>+    }
>+    if (bytes < sizeof(buf)) {
>+        error_report_once("vEVENTQ(type %u id %u): short read(%zd/%zd bytes)",
>+                          type, id, bytes, sizeof(buf));
>+        return;
>+    }
>+
>+    /* Check sequence in hdr for lost events if any */
>+    if (veventq->event_start && (buf.hdr.sequence - last_seq != 1)) {
>+        error_report_once("vEVENTQ(type %u id %u): lost %u event(s)",
>+                          type, id, buf.hdr.sequence - last_seq - 1);
>+    }
>+    veventq->last_event_seq = buf.hdr.sequence;
>+    veventq->event_start = true;
>+    smmuv3_propagate_event(s, (Evt *)&buf.vevent);
>+}
>+
> static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> {
>     IOMMUFDVeventq *veventq = accel->veventq;
>@@ -397,6 +441,7 @@ static void
>smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>     if (!veventq) {
>         return;
>     }
>+    qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
>     close(veventq->veventq_fd);
>     iommufd_backend_free_id(accel->viommu->iommufd, veventq->veventq_id);

Did you ever see free_id failure here? I'm working on PRQ using FAULTQ in intel_iommu
and find a race condition, I suspect the same for vEVENTQ because both have similar
implementation in kernel and qemu.

qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's async,
this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
during veventq_fd releasing, it drops reference of veventq_id but it's too late
and free_id(veventq_id) already failed.

It's not easy to reproduce, we had a test case in guest to trigger FAULTQ allocation
and freeing sequence in loops.

Thanks
Zhenzhong

>     g_free(veventq);
>@@ -424,6 +469,7 @@ bool smmuv3_accel_alloc_veventq(SMMUv3State *s,
>Error **errp)
>     IOMMUFDVeventq *veventq;
>     uint32_t veventq_id;
>     uint32_t veventq_fd;
>+    int flags;
>
>     if (!accel || !accel->viommu) {
>         return true;
>@@ -445,12 +491,30 @@ bool smmuv3_accel_alloc_veventq(SMMUv3State *s,
>Error **errp)
>         return false;
>     }
>
>+    flags = fcntl(veventq_fd, F_GETFL);
>+    if (flags < 0) {
>+        error_setg_errno(errp, errno, "Failed to get flags for vEVENTQ fd");
>+        goto free_veventq;
>+    }
>+    if (fcntl(veventq_fd, F_SETFL, flags | O_NONBLOCK) < 0) {
>+        error_setg_errno(errp, errno, "Failed to set O_NONBLOCK on vEVENTQ fd");
>+        goto free_veventq;
>+    }
>+
>     veventq = g_new0(IOMMUFDVeventq, 1);
>     veventq->veventq_id = veventq_id;
>     veventq->veventq_fd = veventq_fd;
>     veventq->viommu = accel->viommu;
>     accel->veventq = veventq;
>+
>+    /* Set up event handler for veventq fd */
>+    qemu_set_fd_handler(veventq_fd, smmuv3_accel_event_read, NULL, s);
>     return true;
>+
>+free_veventq:
>+    close(veventq_fd);
>+    iommufd_backend_free_id(accel->viommu->iommufd, veventq_id);
>+    return false;
> }
>
> static bool
>--
>2.43.0
RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host vIOMMU events
Posted by Shameer Kolothum Thodi 1 month, 1 week ago
Hi Zhenzhong,

> -----Original Message-----
> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
> Sent: 28 February 2026 06:28
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
> <kjaju@nvidia.com>
> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
> vIOMMU events

[...]

> > static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> > {
> >     IOMMUFDVeventq *veventq = accel->veventq;
> >@@ -397,6 +441,7 @@ static void
> >smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> >     if (!veventq) {
> >         return;
> >     }
> >+    qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
> >     close(veventq->veventq_fd);
> >     iommufd_backend_free_id(accel->viommu->iommufd, veventq-
> >veventq_id);
> 
> Did you ever see free_id failure here?

So far I have not seen free_id() failure in my test setup.

 I'm working on PRQ using FAULTQ in
> intel_iommu
> and find a race condition, I suspect the same for vEVENTQ because both have
> similar
> implementation in kernel and qemu.
> 
> qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's
> async,
> this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
> during veventq_fd releasing, it drops reference of veventq_id but it's too late
> and free_id(veventq_id) already failed.
> 
> It's not easy to reproduce, we had a test case in guest to trigger FAULTQ
> allocation
> and freeing sequence in loops.

From a quick look:

qemu_set_fd_handler(fd, NULL, NULL, NULL) -> aio_set_fd_handler()

aio_set_fd_handler() takes ctx->list_lock. For the delete case it sets
node->pfd.events = 0 and then calls:

ctx->fdmon_ops->update(ctx, node, new_node); where new_node = NULL

In the epoll backend (util/fdmon-epoll.c) this results in:

epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, ...);

As I understand it, epoll_ctl() is a synchronous syscall. When it returns,
the fd has been removed from the epoll interest list in the kernel.

Also, I think, aio_set_fd_handler() and aio_dispatch() run in the same thread for
a given AioContext, and dispatch runs with ctx->list_lock held (via
lockcnt). So looks like handler removal and dispatch are serialized at the QEMU
level.

Do you see a specific window where the fd could still trigger after
EPOLL_CTL_DEL returns?

Could you confirm which fdmon backend you are using..poll, epoll, or
io_uring?

One more question: is the FAULTQ register/unregister model identical to
vEVENTQ in terms of allocation and teardown ordering? For vEVENTQ we only
unregister when the last device is removed from the vIOMMU, typically via
device_del hot unplug.

Please let me know if you see a specific race case here that I may be
missing.

Thanks,
Shameer
RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host vIOMMU events
Posted by Duan, Zhenzhong 1 month, 1 week ago
Hi Shameer,

>-----Original Message-----
>From: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
>Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>vIOMMU events
>
>Hi Zhenzhong,
>
>> -----Original Message-----
>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>> Sent: 28 February 2026 06:28
>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
>> arm@nongnu.org; qemu-devel@nongnu.org
>> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
>> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
>> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
>> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
>> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
>> <kjaju@nvidia.com>
>> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>> vIOMMU events
>
>[...]
>
>> > static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>> > {
>> >     IOMMUFDVeventq *veventq = accel->veventq;
>> >@@ -397,6 +441,7 @@ static void
>> >smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>> >     if (!veventq) {
>> >         return;
>> >     }
>> >+    qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
>> >     close(veventq->veventq_fd);
>> >     iommufd_backend_free_id(accel->viommu->iommufd, veventq-
>> >veventq_id);
>>
>> Did you ever see free_id failure here?
>
>So far I have not seen free_id() failure in my test setup.
>
> I'm working on PRQ using FAULTQ in
>> intel_iommu
>> and find a race condition, I suspect the same for vEVENTQ because both have
>> similar
>> implementation in kernel and qemu.
>>
>> qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's
>> async,
>> this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
>> during veventq_fd releasing, it drops reference of veventq_id but it's too late
>> and free_id(veventq_id) already failed.
>>
>> It's not easy to reproduce, we had a test case in guest to trigger FAULTQ
>> allocation
>> and freeing sequence in loops.
>
>From a quick look:
>
>qemu_set_fd_handler(fd, NULL, NULL, NULL) -> aio_set_fd_handler()
>
>aio_set_fd_handler() takes ctx->list_lock. For the delete case it sets
>node->pfd.events = 0 and then calls:
>
>ctx->fdmon_ops->update(ctx, node, new_node); where new_node = NULL
>
>In the epoll backend (util/fdmon-epoll.c) this results in:
>
>epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, ...);
>
>As I understand it, epoll_ctl() is a synchronous syscall. When it returns,
>the fd has been removed from the epoll interest list in the kernel.

OK, I think I understood it wrongly.

>
>Also, I think, aio_set_fd_handler() and aio_dispatch() run in the same thread for
>a given AioContext, and dispatch runs with ctx->list_lock held (via
>lockcnt). So looks like handler removal and dispatch are serialized at the QEMU
>level.

I just found my scenario is a bit different from yours, aio_set_fd_handler()
is called in vcpu thread and aio_dispatch() is called in main thread in my scenario.

>
>Do you see a specific window where the fd could still trigger after
>EPOLL_CTL_DEL returns?

No, just speculation.

>
>Could you confirm which fdmon backend you are using..poll, epoll, or
>io_uring?

There is no CONFIG_LINUX_IO_URING defined in my env.
GDB printed fdmon_poll_ops  for iohandler_ctx->fdmon_ops, I also set
breakpoint on fdmon_epoll_try_upgrade(), no catch, I think it was
using poll.

>
>One more question: is the FAULTQ register/unregister model identical to
>vEVENTQ in terms of allocation and teardown ordering?

register/unregister model are similar, see https://github.com/yiliu1765/qemu/commit/28b72d6b372555494d0e6a1ccfc97c80bc34423a
the full branch: https://github.com/yiliu1765/qemu/commits/zhenzhong/iommufd_prq_wip/

> For vEVENTQ we only
>unregister when the last device is removed from the vIOMMU, typically via
>device_del hot unplug.

OK, this is different, register/unregister model is triggered in vcpu thread
when guest send PASID table invalidation request in my scenario.

>
>Please let me know if you see a specific race case here that I may be
>missing.

Maybe this is a race case only in my scenario, I need to dig further.
Thanks for your analysis.

BRs,
Zhenzhong 
RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host vIOMMU events
Posted by Duan, Zhenzhong 1 month ago


>-----Original Message-----
>From: Duan, Zhenzhong
>Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>vIOMMU events
>
>Hi Shameer,
>
>>-----Original Message-----
>>From: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
>>Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>>vIOMMU events
>>
>>Hi Zhenzhong,
>>
>>> -----Original Message-----
>>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>>> Sent: 28 February 2026 06:28
>>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
>>> arm@nongnu.org; qemu-devel@nongnu.org
>>> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
>>> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
>>> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
>>> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
>>> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
>>> <kjaju@nvidia.com>
>>> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>>> vIOMMU events
>>
>>[...]
>>
>>> > static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>>> > {
>>> >     IOMMUFDVeventq *veventq = accel->veventq;
>>> >@@ -397,6 +441,7 @@ static void
>>> >smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>>> >     if (!veventq) {
>>> >         return;
>>> >     }
>>> >+    qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
>>> >     close(veventq->veventq_fd);
>>> >     iommufd_backend_free_id(accel->viommu->iommufd, veventq-
>>> >veventq_id);
>>>
>>> Did you ever see free_id failure here?
>>
>>So far I have not seen free_id() failure in my test setup.
>>
>> I'm working on PRQ using FAULTQ in
>>> intel_iommu
>>> and find a race condition, I suspect the same for vEVENTQ because both have
>>> similar
>>> implementation in kernel and qemu.
>>>
>>> qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's
>>> async,
>>> this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
>>> during veventq_fd releasing, it drops reference of veventq_id but it's too late
>>> and free_id(veventq_id) already failed.
>>>
>>> It's not easy to reproduce, we had a test case in guest to trigger FAULTQ
>>> allocation
>>> and freeing sequence in loops.
>>
>>From a quick look:
>>
>>qemu_set_fd_handler(fd, NULL, NULL, NULL) -> aio_set_fd_handler()
>>
>>aio_set_fd_handler() takes ctx->list_lock. For the delete case it sets
>>node->pfd.events = 0 and then calls:
>>
>>ctx->fdmon_ops->update(ctx, node, new_node); where new_node = NULL
>>
>>In the epoll backend (util/fdmon-epoll.c) this results in:
>>
>>epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, ...);
>>
>>As I understand it, epoll_ctl() is a synchronous syscall. When it returns,
>>the fd has been removed from the epoll interest list in the kernel.
>
>OK, I think I understand wrongly.
>
>>
>>Also, I think, aio_set_fd_handler() and aio_dispatch() run in the same thread for
>>a given AioContext, and dispatch runs with ctx->list_lock held (via
>>lockcnt). So looks like handler removal and dispatch are serialized at the QEMU
>>level.
>
>I just found my scenario is a bit different from yours, aio_set_fd_handler()
>is called in vcpu thread and aio_dispatch() is called in main thread in my scenario.
>
>>
>>Do you see a specific window where the fd could still trigger after
>>EPOLL_CTL_DEL returns?
>
>No, just speculation.
>
>>
>>Could you confirm which fdmon backend you are using..poll, epoll, or
>>io_uring?
>
>There is no CONFIG_LINUX_IO_URING defined in my env.
>GDB printed fdmon_poll_ops  for iohandler_ctx->fdmon_ops, I also set
>breakpoint on fdmon_epoll_try_upgrade(), no catch, I think it was
>using poll.
>
>>
>>One more question: is the FAULTQ register/unregister model identical to
>>vEVENTQ in terms of allocation and teardown ordering?
>
>register/unregister model are similar, see
>https://github.com/yiliu1765/qemu/commit/28b72d6b372555494d0e6a1ccfc97c8
>0bc34423a
>the full branch:
>https://github.com/yiliu1765/qemu/commits/zhenzhong/iommufd_prq_wip/
>
>> For vEVENTQ we only
>>unregister when the last device is removed from the vIOMMU, typically via
>>device_del hot unplug.
>
>OK, this is different, register/unregister model is triggered in vcpu thread
>when guest send PASID table invalidation request in my scenario.
>
>>
>>Please let me know if you see a specific race case here that I may be
>>missing.
>
>Maybe this is a race case only in my scenario, I need to dig further.
>Thanks for your analysis.

Sorry for the noise — I found the root cause. This is indeed a race case only in my
scenario, because I need to delete fault_fd from the polling fds in the vcpu thread.

While aio_set_fd_handler() does wake up the main thread from poll(),
there could still be a small window where we call free_id(fault_id) before
poll() exits and releases the fault_id file reference.

I wrote the change below to make it work; in the worst case I saw free_id(fault_id) fail once
and then succeed on the second try.

--- a/hw/i386/intel_iommu_accel.c
+++ b/hw/i386/intel_iommu_accel.c
@@ -206,13 +206,20 @@ static void vtd_destroy_fs_faultq(VTDHostIOMMUDevice *vtd_hiod,
                                   uint32_t fault_id, uint32_t fault_fd)
 {
     HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+    int ret, try_cnt = 5;

     if (!fault_id) {
         return;
     }

     close(fault_fd);
-    iommufd_backend_free_id(idev->iommufd, fault_id);
+
+    while(try_cnt--) {
+        ret = iommufd_backend_free_id(idev->iommufd, fault_id);
+        if (!ret || errno != EBUSY ) {
+            break;
+        }
+    }
 }

BRs,
Zhenzhong
RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host vIOMMU events
Posted by Shameer Kolothum Thodi 1 month ago

> -----Original Message-----
> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
> Sent: 04 March 2026 08:56
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
> <kjaju@nvidia.com>
> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
> vIOMMU events
> 
> External email: Use caution opening links or attachments
> 
> 
> >-----Original Message-----
> >From: Duan, Zhenzhong
> >Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate
> host
> >vIOMMU events
> >
> >Hi Shameer,
> >
> >>-----Original Message-----
> >>From: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
> >>Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate
> host
> >>vIOMMU events
> >>
> >>Hi Zhenzhong,
> >>
> >>> -----Original Message-----
> >>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
> >>> Sent: 28 February 2026 06:28
> >>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> >>> arm@nongnu.org; qemu-devel@nongnu.org
> >>> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
> >>> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> >>> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
> >>> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
> >>> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
> >>> <kjaju@nvidia.com>
> >>> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate
> host
> >>> vIOMMU events
> >>
> >>[...]
> >>
> >>> > static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> >>> > {
> >>> >     IOMMUFDVeventq *veventq = accel->veventq;
> >>> >@@ -397,6 +441,7 @@ static void
> >>> >smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> >>> >     if (!veventq) {
> >>> >         return;
> >>> >     }
> >>> >+    qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
> >>> >     close(veventq->veventq_fd);
> >>> >     iommufd_backend_free_id(accel->viommu->iommufd, veventq-
> >>> >veventq_id);
> >>>
> >>> Did you ever see free_id failure here?
> >>
> >>So far I have not seen free_id() failure in my test setup.
> >>
> >> I'm working on PRQ using FAULTQ in
> >>> intel_iommu
> >>> and find a race condition, I suspect the same for vEVENTQ because both
> have
> >>> similar
> >>> implementation in kernel and qemu.
> >>>
> >>> qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's
> >>> async,
> >>> this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
> >>> during veventq_fd releasing, it drops reference of veventq_id but it's too
> late
> >>> and free_id(veventq_id) already failed.
> >>>
> >>> It's not easy to reproduce, we had a test case in guest to trigger FAULTQ
> >>> allocation
> >>> and freeing sequence in loops.
> >>
> >>From a quick look:
> >>
> >>qemu_set_fd_handler(fd, NULL, NULL, NULL) -> aio_set_fd_handler()
> >>
> >>aio_set_fd_handler() takes ctx->list_lock. For the delete case it sets
> >>node->pfd.events = 0 and then calls:
> >>
> >>ctx->fdmon_ops->update(ctx, node, new_node); where new_node = NULL
> >>
> >>In the epoll backend (util/fdmon-epoll.c) this results in:
> >>
> >>epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, ...);
> >>
> >>As I understand it, epoll_ctl() is a synchronous syscall. When it returns,
> >>the fd has been removed from the epoll interest list in the kernel.
> >
> >OK, I think I understand wrongly.
> >
> >>
> >>Also, I think, aio_set_fd_handler() and aio_dispatch() run in the same
> thread for
> >>a given AioContext, and dispatch runs with ctx->list_lock held (via
> >>lockcnt). So looks like handler removal and dispatch are serialized at the
> QEMU
> >>level.
> >
> >I just found my scenario is a bit different from yours, aio_set_fd_handler()
> >is called in vcpu thread and aio_dispatch() is called in main thread in my
> scenario.
> >
> >>
> >>Do you see a specific window where the fd could still trigger after
> >>EPOLL_CTL_DEL returns?
> >
> >No, just speculation.
> >
> >>
> >>Could you confirm which fdmon backend you are using..poll, epoll, or
> >>io_uring?
> >
> >There is no CONFIG_LINUX_IO_URING defined in my env.
> >GDB printed fdmon_poll_ops  for iohandler_ctx->fdmon_ops, I also set
> >breakpoint on fdmon_epoll_try_upgrade(), no catch, I think it was
> >using poll.
> >
> >>
> >>One more question: is the FAULTQ register/unregister model identical to
> >>vEVENTQ in terms of allocation and teardown ordering?
> >
> >register/unregister model are similar, see
> >https://github.com/yiliu1765/qemu/commit/28b72d6b372555494d0e6a1
> ccfc97c8
> >0bc34423a
> >the full branch:
> >https://github.com/yiliu1765/qemu/commits/zhenzhong/iommufd_prq_wi
> p/
> >
> >> For vEVENTQ we only
> >>unregister when the last device is removed from the vIOMMU, typically via
> >>device_del hot unplug.
> >
> >OK, this is different, register/unregister model is triggered in vcpu thread
> >when guest send PASID table invalidation request in my scenario.
> >
> >>
> >>Please let me know if you see a specific race case here that I may be
> >>missing.
> >
> >Maybe this is a race case only in my scenario, I need to dig further.
> >Thanks for your analysis.
> 
> Sorry for the noise, find root cause, this is indeed a race case only in my
> scenario, because I need to delete fault_fd from polling fds in vcpu thread.
> 
> While aio_set_fd_handler() indeed wake up the main thread from poll(),
> but there still could be a small window we call free_id(fault_id) before
> poll() exit and release fault_id file reference.
> 
> I wrote below to make it work, in worst case I see free_id(fault_id) failed once
> and could succeed in second try.
> 
> --- a/hw/i386/intel_iommu_accel.c
> +++ b/hw/i386/intel_iommu_accel.c
> @@ -206,13 +206,20 @@ static void
> vtd_destroy_fs_faultq(VTDHostIOMMUDevice *vtd_hiod,
>                                    uint32_t fault_id, uint32_t fault_fd)
>  {
>      HostIOMMUDeviceIOMMUFD *idev =
> HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> +    int ret, try_cnt = 5;
> 
>      if (!fault_id) {
>          return;
>      }
> 
>      close(fault_fd);
> -    iommufd_backend_free_id(idev->iommufd, fault_id);
> +
> +    while(try_cnt--) {
> +        ret = iommufd_backend_free_id(idev->iommufd, fault_id);
> +        if (!ret || errno != EBUSY ) {
> +            break;
> +        }
> +    }
>  }

Is it possible for you to use aio_bh_schedule_oneshot() here to force the
unregister to run in the QEMU I/O handler AioContext instead? I think that is
what this function does (though I have not used it before).

Something like,

aio_bh_schedule_oneshot(iohandler_get_aio_context(),
                        faultq_teardown_bh,
                        data);
 
and 

static void faultq_teardown_bh(void *opaque)
{
    data *fq = opaque;

    qemu_set_fd_handler(fq->fd, NULL, NULL, NULL);

    close(fq->fd);

    iommufd_backend_free_id(fq->iommufd, fq->id);
}

Thanks,
Shameer
RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host vIOMMU events
Posted by Duan, Zhenzhong 1 month ago
>> >Maybe this is a race case only in my scenario, I need to dig further.
>> >Thanks for your analysis.
>>
>> Sorry for the noise, find root cause, this is indeed a race case only in my
>> scenario, because I need to delete fault_fd from polling fds in vcpu thread.
>>
>> While aio_set_fd_handler() indeed wake up the main thread from poll(),
>> but there still could be a small window we call free_id(fault_id) before
>> poll() exit and release fault_id file reference.
>>
>> I wrote below to make it work, in worst case I see free_id(fault_id) failed once
>> and could succeed in second try.
>>
>> --- a/hw/i386/intel_iommu_accel.c
>> +++ b/hw/i386/intel_iommu_accel.c
>> @@ -206,13 +206,20 @@ static void
>> vtd_destroy_fs_faultq(VTDHostIOMMUDevice *vtd_hiod,
>>                                    uint32_t fault_id, uint32_t fault_fd)
>>  {
>>      HostIOMMUDeviceIOMMUFD *idev =
>> HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> +    int ret, try_cnt = 5;
>>
>>      if (!fault_id) {
>>          return;
>>      }
>>
>>      close(fault_fd);
>> -    iommufd_backend_free_id(idev->iommufd, fault_id);
>> +
>> +    while(try_cnt--) {
>> +        ret = iommufd_backend_free_id(idev->iommufd, fault_id);
>> +        if (!ret || errno != EBUSY ) {
>> +            break;
>> +        }
>> +    }
>>  }
>
>Is it possible for you to use aio_bh_schedule_oneshot() here to force the
>unregister run in QEMU I/O handler AioContext instead. I think that is
>what this fn does( though I have not used this before).
>
>Something like,
>
>aio_bh_schedule_oneshot(iohandler_get_aio_context(),
>                        faultq_teardown_bh,
>                        data);
>
>and
>
>static void faultq_teardown_bh(void *opaque)
>{
>    data *fq = opaque;
>
>    qemu_set_fd_handler(fq->fd, NULL, NULL, NULL);
>
>    close(fq->fd);
>
>    iommufd_backend_free_id(fq->iommufd, fq->id);
>}

Good suggestion, I just confirmed this works fine for my scenario.

Thanks
Zhenzhong