Install an event handler on the vEVENTQ fd to read and propagate host
generated vIOMMU events to the guest.
The handler runs in QEMU's main loop, using a non-blocking fd registered
via qemu_set_fd_handler().
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
---
hw/arm/smmuv3-accel.c | 64 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
index f703ea1aac..17306cd04b 100644
--- a/hw/arm/smmuv3-accel.c
+++ b/hw/arm/smmuv3-accel.c
@@ -390,6 +390,50 @@ bool smmuv3_accel_issue_inv_cmd(SMMUv3State *bs, void *cmd, SMMUDevice *sdev,
sizeof(Cmd), &entry_num, cmd, errp);
}
+static void smmuv3_accel_event_read(void *opaque)
+{
+ SMMUv3State *s = opaque;
+ IOMMUFDVeventq *veventq = s->s_accel->veventq;
+ struct {
+ struct iommufd_vevent_header hdr;
+ struct iommu_vevent_arm_smmuv3 vevent;
+ } buf;
+ enum iommu_veventq_type type = IOMMU_VEVENTQ_TYPE_ARM_SMMUV3;
+ uint32_t id = veventq->veventq_id;
+ uint32_t last_seq = veventq->last_event_seq;
+ ssize_t bytes;
+
+ bytes = read(veventq->veventq_fd, &buf, sizeof(buf));
+ if (bytes <= 0) {
+ if (errno == EAGAIN || errno == EINTR) {
+ return;
+ }
+ error_report_once("vEVENTQ(type %u id %u): read failed (%m)", type, id);
+ return;
+ }
+
+ if (bytes == sizeof(buf.hdr) &&
+ (buf.hdr.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS)) {
+ error_report_once("vEVENTQ(type %u id %u): overflowed", type, id);
+ veventq->event_start = false;
+ return;
+ }
+ if (bytes < sizeof(buf)) {
+ error_report_once("vEVENTQ(type %u id %u): short read(%zd/%zd bytes)",
+ type, id, bytes, sizeof(buf));
+ return;
+ }
+
+ /* Check sequence in hdr for lost events if any */
+ if (veventq->event_start && (buf.hdr.sequence - last_seq != 1)) {
+ error_report_once("vEVENTQ(type %u id %u): lost %u event(s)",
+ type, id, buf.hdr.sequence - last_seq - 1);
+ }
+ veventq->last_event_seq = buf.hdr.sequence;
+ veventq->event_start = true;
+ smmuv3_propagate_event(s, (Evt *)&buf.vevent);
+}
+
static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
{
IOMMUFDVeventq *veventq = accel->veventq;
@@ -397,6 +441,7 @@ static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
if (!veventq) {
return;
}
+ qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
close(veventq->veventq_fd);
iommufd_backend_free_id(accel->viommu->iommufd, veventq->veventq_id);
g_free(veventq);
@@ -424,6 +469,7 @@ bool smmuv3_accel_alloc_veventq(SMMUv3State *s, Error **errp)
IOMMUFDVeventq *veventq;
uint32_t veventq_id;
uint32_t veventq_fd;
+ int flags;
if (!accel || !accel->viommu) {
return true;
@@ -445,12 +491,30 @@ bool smmuv3_accel_alloc_veventq(SMMUv3State *s, Error **errp)
return false;
}
+ flags = fcntl(veventq_fd, F_GETFL);
+ if (flags < 0) {
+ error_setg_errno(errp, errno, "Failed to get flags for vEVENTQ fd");
+ goto free_veventq;
+ }
+ if (fcntl(veventq_fd, F_SETFL, flags | O_NONBLOCK) < 0) {
+ error_setg_errno(errp, errno, "Failed to set O_NONBLOCK on vEVENTQ fd");
+ goto free_veventq;
+ }
+
veventq = g_new0(IOMMUFDVeventq, 1);
veventq->veventq_id = veventq_id;
veventq->veventq_fd = veventq_fd;
veventq->viommu = accel->viommu;
accel->veventq = veventq;
+
+ /* Set up event handler for veventq fd */
+ qemu_set_fd_handler(veventq_fd, smmuv3_accel_event_read, NULL, s);
return true;
+
+free_veventq:
+ close(veventq_fd);
+ iommufd_backend_free_id(accel->viommu->iommufd, veventq_id);
+ return false;
}
static bool
--
2.43.0
Hi Shameer,
>-----Original Message-----
>From: Shameer Kolothum <skolothumtho@nvidia.com>
>Subject: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>vIOMMU events
>
>Install an event handler on the vEVENTQ fd to read and propagate host
>generated vIOMMU events to the guest.
>
>The handler runs in QEMU's main loop, using a non-blocking fd registered
>via qemu_set_fd_handler().
>
>Tested-by: Nicolin Chen <nicolinc@nvidia.com>
>Reviewed-by: Eric Auger <eric.auger@redhat.com>
>Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
>Tested-by: Eric Auger <eric.auger@redhat.com>
>Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
>Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
>---
> hw/arm/smmuv3-accel.c | 64
>+++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 64 insertions(+)
>
>diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c
>index f703ea1aac..17306cd04b 100644
>--- a/hw/arm/smmuv3-accel.c
>+++ b/hw/arm/smmuv3-accel.c
>@@ -390,6 +390,50 @@ bool smmuv3_accel_issue_inv_cmd(SMMUv3State *bs,
>void *cmd, SMMUDevice *sdev,
> sizeof(Cmd), &entry_num, cmd, errp);
> }
>
>+static void smmuv3_accel_event_read(void *opaque)
>+{
>+ SMMUv3State *s = opaque;
>+ IOMMUFDVeventq *veventq = s->s_accel->veventq;
>+ struct {
>+ struct iommufd_vevent_header hdr;
>+ struct iommu_vevent_arm_smmuv3 vevent;
>+ } buf;
>+ enum iommu_veventq_type type = IOMMU_VEVENTQ_TYPE_ARM_SMMUV3;
>+ uint32_t id = veventq->veventq_id;
>+ uint32_t last_seq = veventq->last_event_seq;
>+ ssize_t bytes;
>+
>+ bytes = read(veventq->veventq_fd, &buf, sizeof(buf));
>+ if (bytes <= 0) {
>+ if (errno == EAGAIN || errno == EINTR) {
>+ return;
>+ }
>+ error_report_once("vEVENTQ(type %u id %u): read failed (%m)", type, id);
>+ return;
>+ }
>+
>+ if (bytes == sizeof(buf.hdr) &&
>+ (buf.hdr.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS)) {
>+ error_report_once("vEVENTQ(type %u id %u): overflowed", type, id);
>+ veventq->event_start = false;
>+ return;
>+ }
>+ if (bytes < sizeof(buf)) {
>+ error_report_once("vEVENTQ(type %u id %u): short read(%zd/%zd bytes)",
>+ type, id, bytes, sizeof(buf));
>+ return;
>+ }
>+
>+ /* Check sequence in hdr for lost events if any */
>+ if (veventq->event_start && (buf.hdr.sequence - last_seq != 1)) {
>+ error_report_once("vEVENTQ(type %u id %u): lost %u event(s)",
>+ type, id, buf.hdr.sequence - last_seq - 1);
>+ }
>+ veventq->last_event_seq = buf.hdr.sequence;
>+ veventq->event_start = true;
>+ smmuv3_propagate_event(s, (Evt *)&buf.vevent);
>+}
>+
> static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> {
> IOMMUFDVeventq *veventq = accel->veventq;
>@@ -397,6 +441,7 @@ static void
>smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> if (!veventq) {
> return;
> }
>+ qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
> close(veventq->veventq_fd);
> iommufd_backend_free_id(accel->viommu->iommufd, veventq->veventq_id);
Did you ever see a free_id() failure here? I'm working on PRQ using FAULTQ in intel_iommu
and found a race condition; I suspect the same for vEVENTQ because both have similar
implementations in the kernel and QEMU.
qemu_set_fd_handler() is called to remove veventq_fd from epoll(), but it's async;
this can delay the release of veventq_fd until after free_id(veventq_id) has run.
When veventq_fd is finally released it drops its reference on veventq_id, but by then
it is too late — free_id(veventq_id) has already failed.
It's not easy to reproduce; we had a test case in the guest that triggers the FAULTQ
allocation and freeing sequence in a loop.
Thanks
Zhenzhong
> g_free(veventq);
>@@ -424,6 +469,7 @@ bool smmuv3_accel_alloc_veventq(SMMUv3State *s,
>Error **errp)
> IOMMUFDVeventq *veventq;
> uint32_t veventq_id;
> uint32_t veventq_fd;
>+ int flags;
>
> if (!accel || !accel->viommu) {
> return true;
>@@ -445,12 +491,30 @@ bool smmuv3_accel_alloc_veventq(SMMUv3State *s,
>Error **errp)
> return false;
> }
>
>+ flags = fcntl(veventq_fd, F_GETFL);
>+ if (flags < 0) {
>+ error_setg_errno(errp, errno, "Failed to get flags for vEVENTQ fd");
>+ goto free_veventq;
>+ }
>+ if (fcntl(veventq_fd, F_SETFL, flags | O_NONBLOCK) < 0) {
>+ error_setg_errno(errp, errno, "Failed to set O_NONBLOCK on vEVENTQ fd");
>+ goto free_veventq;
>+ }
>+
> veventq = g_new0(IOMMUFDVeventq, 1);
> veventq->veventq_id = veventq_id;
> veventq->veventq_fd = veventq_fd;
> veventq->viommu = accel->viommu;
> accel->veventq = veventq;
>+
>+ /* Set up event handler for veventq fd */
>+ qemu_set_fd_handler(veventq_fd, smmuv3_accel_event_read, NULL, s);
> return true;
>+
>+free_veventq:
>+ close(veventq_fd);
>+ iommufd_backend_free_id(accel->viommu->iommufd, veventq_id);
>+ return false;
> }
>
> static bool
>--
>2.43.0
Hi Zhenzhong,
> -----Original Message-----
> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
> Sent: 28 February 2026 06:28
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
> <kjaju@nvidia.com>
> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
> vIOMMU events
[...]
> > static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> > {
> > IOMMUFDVeventq *veventq = accel->veventq;
> >@@ -397,6 +441,7 @@ static void
> >smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> > if (!veventq) {
> > return;
> > }
> >+ qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
> > close(veventq->veventq_fd);
> > iommufd_backend_free_id(accel->viommu->iommufd, veventq-
> >veventq_id);
>
> Did you ever see free_id failure here?
So far I have not seen free_id() failure in my test setup.
I'm working on PRQ using FAULTQ in
> intel_iommu
> and find a race condition, I suspect the same for vEVENTQ because both have
> similar
> implementation in kernel and qemu.
>
> qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's
> async,
> this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
> during veventq_fd releasing, it drops reference of veventq_id but it's too late
> and free_id(veventq_id) already failed.
>
> It's not easy to reproduce, we had a test case in guest to trigger FAULTQ
> allocation
> and freeing sequence in loops.
From a quick look:
qemu_set_fd_handler(fd, NULL, NULL, NULL) -> aio_set_fd_handler()
aio_set_fd_handler() takes ctx->list_lock. For the delete case it sets
node->pfd.events = 0 and then calls:
ctx->fdmon_ops->update(ctx, node, new_node); where new_node = NULL
In the epoll backend (util/fdmon-epoll.c) this results in:
epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, ...);
As I understand it, epoll_ctl() is a synchronous syscall. When it returns,
the fd has been removed from the epoll interest list in the kernel.
Also, I think, aio_set_fd_handler() and aio_dispatch() run in the same thread for
a given AioContext, and dispatch runs with ctx->list_lock held (via
lockcnt). So looks like handler removal and dispatch are serialized at the QEMU
level.
Do you see a specific window where the fd could still trigger after
EPOLL_CTL_DEL returns?
Could you confirm which fdmon backend you are using..poll, epoll, or
io_uring?
One more question: is the FAULTQ register/unregister model identical to
vEVENTQ in terms of allocation and teardown ordering? For vEVENTQ we only
unregister when the last device is removed from the vIOMMU, typically via
device_del hot unplug.
Please let me know if you see a specific race case here that I may be
missing.
Thanks,
Shameer
Hi Shameer,
>-----Original Message-----
>From: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
>Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>vIOMMU events
>
>Hi Zhenzhong,
>
>> -----Original Message-----
>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>> Sent: 28 February 2026 06:28
>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
>> arm@nongnu.org; qemu-devel@nongnu.org
>> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
>> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
>> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
>> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
>> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
>> <kjaju@nvidia.com>
>> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>> vIOMMU events
>
>[...]
>
>> > static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>> > {
>> > IOMMUFDVeventq *veventq = accel->veventq;
>> >@@ -397,6 +441,7 @@ static void
>> >smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>> > if (!veventq) {
>> > return;
>> > }
>> >+ qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
>> > close(veventq->veventq_fd);
>> > iommufd_backend_free_id(accel->viommu->iommufd, veventq-
>> >veventq_id);
>>
>> Did you ever see free_id failure here?
>
>So far I have not seen free_id() failure in my test setup.
>
> I'm working on PRQ using FAULTQ in
>> intel_iommu
>> and find a race condition, I suspect the same for vEVENTQ because both have
>> similar
>> implementation in kernel and qemu.
>>
>> qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's
>> async,
>> this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
>> during veventq_fd releasing, it drops reference of veventq_id but it's too late
>> and free_id(veventq_id) already failed.
>>
>> It's not easy to reproduce, we had a test case in guest to trigger FAULTQ
>> allocation
>> and freeing sequence in loops.
>
>From a quick look:
>
>qemu_set_fd_handler(fd, NULL, NULL, NULL) -> aio_set_fd_handler()
>
>aio_set_fd_handler() takes ctx->list_lock. For the delete case it sets
>node->pfd.events = 0 and then calls:
>
>ctx->fdmon_ops->update(ctx, node, new_node); where new_node = NULL
>
>In the epoll backend (util/fdmon-epoll.c) this results in:
>
>epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, ...);
>
>As I understand it, epoll_ctl() is a synchronous syscall. When it returns,
>the fd has been removed from the epoll interest list in the kernel.
OK, I think I understood it wrongly.
>
>Also, I think, aio_set_fd_handler() and aio_dispatch() run in the same thread for
>a given AioContext, and dispatch runs with ctx->list_lock held (via
>lockcnt). So looks like handler removal and dispatch are serialized at the QEMU
>level.
I just found that my scenario is a bit different from yours: aio_set_fd_handler()
is called in a vCPU thread and aio_dispatch() is called in the main thread in my scenario.
>
>Do you see a specific window where the fd could still trigger after
>EPOLL_CTL_DEL returns?
No, just speculation.
>
>Could you confirm which fdmon backend you are using..poll, epoll, or
>io_uring?
There is no CONFIG_LINUX_IO_URING defined in my env.
GDB printed fdmon_poll_ops for iohandler_ctx->fdmon_ops, I also set
breakpoint on fdmon_epoll_try_upgrade(), no catch, I think it was
using poll.
>
>One more question: is the FAULTQ register/unregister model identical to
>vEVENTQ in terms of allocation and teardown ordering?
register/unregister model are similar, see https://github.com/yiliu1765/qemu/commit/28b72d6b372555494d0e6a1ccfc97c80bc34423a
the full branch: https://github.com/yiliu1765/qemu/commits/zhenzhong/iommufd_prq_wip/
> For vEVENTQ we only
>unregister when the last device is removed from the vIOMMU, typically via
>device_del hot unplug.
OK, this is different; in my scenario register/unregister is triggered in a vCPU thread
when the guest sends a PASID table invalidation request.
>
>Please let me know if you see a specific race case here that I may be
>missing.
Maybe this is a race case only in my scenario, I need to dig further.
Thanks for your analysis.
BRs,
Zhenzhong
>-----Original Message-----
>From: Duan, Zhenzhong
>Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>vIOMMU events
>
>Hi Shameer,
>
>>-----Original Message-----
>>From: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
>>Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>>vIOMMU events
>>
>>Hi Zhenzhong,
>>
>>> -----Original Message-----
>>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
>>> Sent: 28 February 2026 06:28
>>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
>>> arm@nongnu.org; qemu-devel@nongnu.org
>>> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
>>> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
>>> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
>>> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
>>> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
>>> <kjaju@nvidia.com>
>>> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
>>> vIOMMU events
>>
>>[...]
>>
>>> > static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>>> > {
>>> > IOMMUFDVeventq *veventq = accel->veventq;
>>> >@@ -397,6 +441,7 @@ static void
>>> >smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
>>> > if (!veventq) {
>>> > return;
>>> > }
>>> >+ qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
>>> > close(veventq->veventq_fd);
>>> > iommufd_backend_free_id(accel->viommu->iommufd, veventq-
>>> >veventq_id);
>>>
>>> Did you ever see free_id failure here?
>>
>>So far I have not seen free_id() failure in my test setup.
>>
>> I'm working on PRQ using FAULTQ in
>>> intel_iommu
>>> and find a race condition, I suspect the same for vEVENTQ because both have
>>> similar
>>> implementation in kernel and qemu.
>>>
>>> qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's
>>> async,
>>> this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
>>> during veventq_fd releasing, it drops reference of veventq_id but it's too late
>>> and free_id(veventq_id) already failed.
>>>
>>> It's not easy to reproduce, we had a test case in guest to trigger FAULTQ
>>> allocation
>>> and freeing sequence in loops.
>>
>>From a quick look:
>>
>>qemu_set_fd_handler(fd, NULL, NULL, NULL) -> aio_set_fd_handler()
>>
>>aio_set_fd_handler() takes ctx->list_lock. For the delete case it sets
>>node->pfd.events = 0 and then calls:
>>
>>ctx->fdmon_ops->update(ctx, node, new_node); where new_node = NULL
>>
>>In the epoll backend (util/fdmon-epoll.c) this results in:
>>
>>epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, ...);
>>
>>As I understand it, epoll_ctl() is a synchronous syscall. When it returns,
>>the fd has been removed from the epoll interest list in the kernel.
>
>OK, I think I understand wrongly.
>
>>
>>Also, I think, aio_set_fd_handler() and aio_dispatch() run in the same thread for
>>a given AioContext, and dispatch runs with ctx->list_lock held (via
>>lockcnt). So looks like handler removal and dispatch are serialized at the QEMU
>>level.
>
>I just found my scenario is a bit different from yours, aio_set_fd_handler()
>is called in vcpu thread and aio_dispatch() is called in main thread in my scenario.
>
>>
>>Do you see a specific window where the fd could still trigger after
>>EPOLL_CTL_DEL returns?
>
>No, just speculation.
>
>>
>>Could you confirm which fdmon backend you are using..poll, epoll, or
>>io_uring?
>
>There is no CONFIG_LINUX_IO_URING defined in my env.
>GDB printed fdmon_poll_ops for iohandler_ctx->fdmon_ops, I also set
>breakpoint on fdmon_epoll_try_upgrade(), no catch, I think it was
>using poll.
>
>>
>>One more question: is the FAULTQ register/unregister model identical to
>>vEVENTQ in terms of allocation and teardown ordering?
>
>register/unregister model are similar, see
>https://github.com/yiliu1765/qemu/commit/28b72d6b372555494d0e6a1ccfc97c8
>0bc34423a
>the full branch:
>https://github.com/yiliu1765/qemu/commits/zhenzhong/iommufd_prq_wip/
>
>> For vEVENTQ we only
>>unregister when the last device is removed from the vIOMMU, typically via
>>device_del hot unplug.
>
>OK, this is different, register/unregister model is triggered in vcpu thread
>when guest send PASID table invalidation request in my scenario.
>
>>
>>Please let me know if you see a specific race case here that I may be
>>missing.
>
>Maybe this is a race case only in my scenario, I need to dig further.
>Thanks for your analysis.
Sorry for the noise — I found the root cause. This is indeed a race case only in my
scenario, because I need to delete fault_fd from the polling fds in a vCPU thread.
While aio_set_fd_handler() does wake up the main thread from poll(), there could still
be a small window in which we call free_id(fault_id) before poll() exits and the
fault_id file reference is released.
I wrote the change below to make it work; in the worst case I saw free_id(fault_id)
fail once and then succeed on the second try.
--- a/hw/i386/intel_iommu_accel.c
+++ b/hw/i386/intel_iommu_accel.c
@@ -206,13 +206,20 @@ static void vtd_destroy_fs_faultq(VTDHostIOMMUDevice *vtd_hiod,
uint32_t fault_id, uint32_t fault_fd)
{
HostIOMMUDeviceIOMMUFD *idev = HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
+ int ret, try_cnt = 5;
if (!fault_id) {
return;
}
close(fault_fd);
- iommufd_backend_free_id(idev->iommufd, fault_id);
+
+ while(try_cnt--) {
+ ret = iommufd_backend_free_id(idev->iommufd, fault_id);
+ if (!ret || errno != EBUSY ) {
+ break;
+ }
+ }
}
BRs,
Zhenzhong
> -----Original Message-----
> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
> Sent: 04 March 2026 08:56
> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> arm@nongnu.org; qemu-devel@nongnu.org
> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
> <kjaju@nvidia.com>
> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate host
> vIOMMU events
>
> External email: Use caution opening links or attachments
>
>
> >-----Original Message-----
> >From: Duan, Zhenzhong
> >Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate
> host
> >vIOMMU events
> >
> >Hi Shameer,
> >
> >>-----Original Message-----
> >>From: Shameer Kolothum Thodi <skolothumtho@nvidia.com>
> >>Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate
> host
> >>vIOMMU events
> >>
> >>Hi Zhenzhong,
> >>
> >>> -----Original Message-----
> >>> From: Duan, Zhenzhong <zhenzhong.duan@intel.com>
> >>> Sent: 28 February 2026 06:28
> >>> To: Shameer Kolothum Thodi <skolothumtho@nvidia.com>; qemu-
> >>> arm@nongnu.org; qemu-devel@nongnu.org
> >>> Cc: eric.auger@redhat.com; peter.maydell@linaro.org; Nicolin Chen
> >>> <nicolinc@nvidia.com>; Nathan Chen <nathanc@nvidia.com>; Matt Ochs
> >>> <mochs@nvidia.com>; Jiandi An <jan@nvidia.com>; Jason Gunthorpe
> >>> <jgg@nvidia.com>; jonathan.cameron@huawei.com;
> >>> zhangfei.gao@linaro.org; philmd@linaro.org; Krishnakant Jaju
> >>> <kjaju@nvidia.com>
> >>> Subject: RE: [PATCH v8 5/5] hw/arm/smmuv3-accel: Read and propagate
> host
> >>> vIOMMU events
> >>
> >>[...]
> >>
> >>> > static void smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> >>> > {
> >>> > IOMMUFDVeventq *veventq = accel->veventq;
> >>> >@@ -397,6 +441,7 @@ static void
> >>> >smmuv3_accel_free_veventq(SMMUv3AccelState *accel)
> >>> > if (!veventq) {
> >>> > return;
> >>> > }
> >>> >+ qemu_set_fd_handler(veventq->veventq_fd, NULL, NULL, NULL);
> >>> > close(veventq->veventq_fd);
> >>> > iommufd_backend_free_id(accel->viommu->iommufd, veventq-
> >>> >veventq_id);
> >>>
> >>> Did you ever see free_id failure here?
> >>
> >>So far I have not seen free_id() failure in my test setup.
> >>
> >> I'm working on PRQ using FAULTQ in
> >>> intel_iommu
> >>> and find a race condition, I suspect the same for vEVENTQ because both
> have
> >>> similar
> >>> implementation in kernel and qemu.
> >>>
> >>> qemu_set_fd_handler() is called to remove veventq_fd from epoll() but it's
> >>> async,
> >>> this can lead to releasing of veventq_fd delayed after free_id(veventq_id),
> >>> during veventq_fd releasing, it drops reference of veventq_id but it's too
> late
> >>> and free_id(veventq_id) already failed.
> >>>
> >>> It's not easy to reproduce, we had a test case in guest to trigger FAULTQ
> >>> allocation
> >>> and freeing sequence in loops.
> >>
> >>From a quick look:
> >>
> >>qemu_set_fd_handler(fd, NULL, NULL, NULL) -> aio_set_fd_handler()
> >>
> >>aio_set_fd_handler() takes ctx->list_lock. For the delete case it sets
> >>node->pfd.events = 0 and then calls:
> >>
> >>ctx->fdmon_ops->update(ctx, node, new_node); where new_node = NULL
> >>
> >>In the epoll backend (util/fdmon-epoll.c) this results in:
> >>
> >>epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, ...);
> >>
> >>As I understand it, epoll_ctl() is a synchronous syscall. When it returns,
> >>the fd has been removed from the epoll interest list in the kernel.
> >
> >OK, I think I understand wrongly.
> >
> >>
> >>Also, I think, aio_set_fd_handler() and aio_dispatch() run in the same
> thread for
> >>a given AioContext, and dispatch runs with ctx->list_lock held (via
> >>lockcnt). So looks like handler removal and dispatch are serialized at the
> QEMU
> >>level.
> >
> >I just found my scenario is a bit different from yours, aio_set_fd_handler()
> >is called in vcpu thread and aio_dispatch() is called in main thread in my
> scenario.
> >
> >>
> >>Do you see a specific window where the fd could still trigger after
> >>EPOLL_CTL_DEL returns?
> >
> >No, just speculation.
> >
> >>
> >>Could you confirm which fdmon backend you are using..poll, epoll, or
> >>io_uring?
> >
> >There is no CONFIG_LINUX_IO_URING defined in my env.
> >GDB printed fdmon_poll_ops for iohandler_ctx->fdmon_ops, I also set
> >breakpoint on fdmon_epoll_try_upgrade(), no catch, I think it was
> >using poll.
> >
> >>
> >>One more question: is the FAULTQ register/unregister model identical to
> >>vEVENTQ in terms of allocation and teardown ordering?
> >
> >register/unregister model are similar, see
> >https://github.com/yiliu1765/qemu/commit/28b72d6b372555494d0e6a1
> ccfc97c8
> >0bc34423a
> >the full branch:
> >https://github.com/yiliu1765/qemu/commits/zhenzhong/iommufd_prq_wi
> p/
> >
> >> For vEVENTQ we only
> >>unregister when the last device is removed from the vIOMMU, typically via
> >>device_del hot unplug.
> >
> >OK, this is different, register/unregister model is triggered in vcpu thread
> >when guest send PASID table invalidation request in my scenario.
> >
> >>
> >>Please let me know if you see a specific race case here that I may be
> >>missing.
> >
> >Maybe this is a race case only in my scenario, I need to dig further.
> >Thanks for your analysis.
>
> Sorry for the noise, find root cause, this is indeed a race case only in my
> scenario, because I need to delete fault_fd from polling fds in vcpu thread.
>
> While aio_set_fd_handler() indeed wake up the main thread from poll(),
> but there still could be a small window we call free_id(fault_id) before
> poll() exit and release fault_id file reference.
>
> I wrote below to make it work, in worst case I see free_id(fault_id) failed once
> and could succeed in second try.
>
> --- a/hw/i386/intel_iommu_accel.c
> +++ b/hw/i386/intel_iommu_accel.c
> @@ -206,13 +206,20 @@ static void
> vtd_destroy_fs_faultq(VTDHostIOMMUDevice *vtd_hiod,
> uint32_t fault_id, uint32_t fault_fd)
> {
> HostIOMMUDeviceIOMMUFD *idev =
> HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
> + int ret, try_cnt = 5;
>
> if (!fault_id) {
> return;
> }
>
> close(fault_fd);
> - iommufd_backend_free_id(idev->iommufd, fault_id);
> +
> + while(try_cnt--) {
> + ret = iommufd_backend_free_id(idev->iommufd, fault_id);
> + if (!ret || errno != EBUSY ) {
> + break;
> + }
> + }
> }
Is it possible for you to use aio_bh_schedule_oneshot() here to force the unregister
to run in the QEMU I/O handler AioContext instead? I think that is what this function
does (though I have not used it before).
Something like,
aio_bh_schedule_oneshot(iohandler_get_aio_context(),
faultq_teardown_bh,
data);
and
static void faultq_teardown_bh(void *opaque)
{
data *fq = opaque;
qemu_set_fd_handler(fq->fd, NULL, NULL, NULL);
close(fq->fd);
iommufd_backend_free_id(fq->iommufd, fq->id);
}
Thanks,
Shameer
>> >Maybe this is a race case only in my scenario, I need to dig further.
>> >Thanks for your analysis.
>>
>> Sorry for the noise, find root cause, this is indeed a race case only in my
>> scenario, because I need to delete fault_fd from polling fds in vcpu thread.
>>
>> While aio_set_fd_handler() indeed wake up the main thread from poll(),
>> but there still could be a small window we call free_id(fault_id) before
>> poll() exit and release fault_id file reference.
>>
>> I wrote below to make it work, in worst case I see free_id(fault_id) failed once
>> and could succeed in second try.
>>
>> --- a/hw/i386/intel_iommu_accel.c
>> +++ b/hw/i386/intel_iommu_accel.c
>> @@ -206,13 +206,20 @@ static void
>> vtd_destroy_fs_faultq(VTDHostIOMMUDevice *vtd_hiod,
>> uint32_t fault_id, uint32_t fault_fd)
>> {
>> HostIOMMUDeviceIOMMUFD *idev =
>> HOST_IOMMU_DEVICE_IOMMUFD(vtd_hiod->hiod);
>> + int ret, try_cnt = 5;
>>
>> if (!fault_id) {
>> return;
>> }
>>
>> close(fault_fd);
>> - iommufd_backend_free_id(idev->iommufd, fault_id);
>> +
>> + while(try_cnt--) {
>> + ret = iommufd_backend_free_id(idev->iommufd, fault_id);
>> + if (!ret || errno != EBUSY ) {
>> + break;
>> + }
>> + }
>> }
>
>Is it possible for you to use aio_bh_schedule_oneshot() here to force the
>unregister run in QEMU I/O handler AioContext instead. I think that is
>what this fn does( though I have not used this before).
>
>Something like,
>
>aio_bh_schedule_oneshot(iohandler_get_aio_context(),
> faultq_teardown_bh,
> data);
>
>and
>
>static void faultq_teardown_bh(void *opaque)
>{
> data *fq = opaque;
>
> qemu_set_fd_handler(fq->fd, NULL, NULL, NULL);
>
> close(fq->fd);
>
> iommufd_backend_free_id(fq->iommufd, fq->id);
>}
Good suggestion, I just confirmed this works fine for my scenario.
Thanks
Zhenzhong
© 2016 - 2026 Red Hat, Inc.