drivers/virt/acrn/irqfd.c | 47 ++++++++++++++++++++++++++++++---------
1 file changed, 37 insertions(+), 10 deletions(-)
ACRN irqfd registers a custom waitqueue callback on the eventfd. When the
eventfd is released, eventfd_release() wakes the waitqueue with EPOLLHUP,
and hsm_irqfd_wakeup() queues irqfd->shutdown on vm->irqfd_wq.
The irqfd object can also be removed by ACRN_IOCTL_IRQFD with
ACRN_IRQFD_FLAG_DEASSIGN. In that path, acrn_irqfd_deassign() removes the
waitqueue entry and frees the hsm_irqfd object.
These two paths can race. If EPOLLHUP queues the shutdown work before
deassign frees the object, the work item may run after kfree() and recover
the freed hsm_irqfd via container_of(). It then dereferences irqfd->vm while
taking irqfds_lock.
A possible race is:
CPU0                                CPU1
eventfd_release()
  wake_up_poll(EPOLLHUP)
    hsm_irqfd_wakeup()
      queue_work(&irqfd->shutdown)
                                    acrn_irqfd_deassign()
                                      hsm_irqfd_shutdown()
                                        list_del_init()
                                        eventfd_ctx_remove_wait_queue()
                                        kfree(irqfd) //free here!
hsm_irqfd_shutdown_work()
  irqfd = container_of(work, ...)
  vm = irqfd->vm //UAF!
Fix this by separating logical shutdown from object release. First remove
the irqfd from the VM list and eventfd waitqueue, then synchronously
cancel any pending/running shutdown work before freeing the object.
Also tear down irqfds before destroying the irqfd workqueue, so
eventfd wakeups cannot queue work after the workqueue has been destroyed.
Signed-off-by: Sicong Huang <congei42@163.com>
---
drivers/virt/acrn/irqfd.c | 47 ++++++++++++++++++++++++++++++---------
1 file changed, 37 insertions(+), 10 deletions(-)
diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
index acf8cd5f8f8c..659fd40d9aa5 100644
--- a/drivers/virt/acrn/irqfd.c
+++ b/drivers/virt/acrn/irqfd.c
@@ -44,30 +44,37 @@ static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
irqfd->msi.msi_data);
}
-static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
+static bool hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
{
u64 cnt;
lockdep_assert_held(&irqfd->vm->irqfds_lock);
+ if (list_empty(&irqfd->list))
+ return false;
+
/* remove from wait queue */
list_del_init(&irqfd->list);
eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
eventfd_ctx_put(irqfd->eventfd);
- kfree(irqfd);
+
+ return true;
}
static void hsm_irqfd_shutdown_work(struct work_struct *work)
{
struct hsm_irqfd *irqfd;
struct acrn_vm *vm;
+ bool free;
irqfd = container_of(work, struct hsm_irqfd, shutdown);
vm = irqfd->vm;
mutex_lock(&vm->irqfds_lock);
- if (!list_empty(&irqfd->list))
- hsm_irqfd_shutdown(irqfd);
+ free = hsm_irqfd_shutdown(irqfd);
mutex_unlock(&vm->irqfds_lock);
+
+ if (free)
+ kfree(irqfd);
}
/* Called with wqh->lock held and interrupts disabled */
@@ -170,7 +177,7 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
static int acrn_irqfd_deassign(struct acrn_vm *vm,
struct acrn_irqfd *args)
{
- struct hsm_irqfd *irqfd, *tmp;
+ struct hsm_irqfd *irqfd, *tmp, *to_free = NULL;
struct eventfd_ctx *eventfd;
eventfd = eventfd_ctx_fdget(args->fd);
@@ -180,13 +187,19 @@ static int acrn_irqfd_deassign(struct acrn_vm *vm,
mutex_lock(&vm->irqfds_lock);
list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) {
if (irqfd->eventfd == eventfd) {
- hsm_irqfd_shutdown(irqfd);
+ if (hsm_irqfd_shutdown(irqfd))
+ to_free = irqfd;
break;
}
}
mutex_unlock(&vm->irqfds_lock);
eventfd_ctx_put(eventfd);
+ if (to_free) {
+ cancel_work_sync(&to_free->shutdown);
+ kfree(to_free);
+ }
+
return 0;
}
@@ -219,9 +232,23 @@ void acrn_irqfd_deinit(struct acrn_vm *vm)
struct hsm_irqfd *irqfd, *next;
dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid);
+
+ for (;;) {
+ irqfd = NULL;
+
+ mutex_lock(&vm->irqfds_lock);
+ if (!list_empty(&vm->irqfds)) {
+ irqfd = list_first_entry(&vm->irqfds, struct hsm_irqfd, list);
+ hsm_irqfd_shutdown(irqfd);
+ }
+ mutex_unlock(&vm->irqfds_lock);
+
+ if (!irqfd)
+ break;
+
+ cancel_work_sync(&irqfd->shutdown);
+ kfree(irqfd);
+ }
+
destroy_workqueue(vm->irqfd_wq);
- mutex_lock(&vm->irqfds_lock);
- list_for_each_entry_safe(irqfd, next, &vm->irqfds, list)
- hsm_irqfd_shutdown(irqfd);
- mutex_unlock(&vm->irqfds_lock);
}
--
2.34.1
v1: https://lore.kernel.org/r/20260511135737.2285411-1-congei42@163.com

v1 fixed the irqfd UAF by serialising deassign with the shutdown work
through list_empty() + cancel_work_sync(). Fei Li pointed out that the
lifetime rules were still distributed across several places.

v2 restructures the lifetime along the same lines KVM irqfd uses:
deassign/deinit only logically deactivate the irqfd, and the shutdown
work is the sole owner of struct hsm_irqfd.

Changes since v1:

- Introduce HSM_IRQFD_FLAG_SHUTDOWN guarded by test_and_set_bit() so the
  cleanup work is queued at most once over the irqfd's lifetime, removing
  the cancel_work_sync()-based ownership negotiation.
- hsm_irqfd_shutdown_work() now performs list_del_init() (if needed),
  eventfd_ctx_remove_wait_queue(), eventfd_ctx_put() and kfree(); the
  deassign/deinit paths no longer call any of these directly.
- acrn_irqfd_deassign() and acrn_irqfd_deinit() only list_del_init() the
  irqfd under irqfds_lock and queue the cleanup work; both flush
  vm->irqfd_wq before returning / before destroy_workqueue().
- acrn_irqfd_deinit() now uses list_for_each_entry_safe(irqfd, next, ...)
  so both iterators are referenced, incidentally fixing the
  -Wunused-variable warning reported by the kernel test robot on v1
  (https://lore.kernel.org/oe-kbuild-all/202605151636.lR13SqEN-lkp@intel.com/).
- acrn_irqfd_assign() now holds vm->irqfds_lock across the duplicate
  check, vfs_poll() (which installs the eventfd waitqueue entry via
  hsm_irqfd_poll_func) and list_add_tail(), so the irqfd is never visible
  on vm->irqfds before its waitqueue entry has been installed.

Sicong Huang (1):
  virt: acrn: Fix irqfd use-after-free during eventfd shutdown

 drivers/virt/acrn/irqfd.c | 71 ++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 27 deletions(-)

base-commit: 7fd2df204f342fc17d1a0bfcd474b24232fb0f32
--
2.34.1
acrn_irqfd_deassign() and the eventfd EPOLLHUP wakeup can race and free
the same struct hsm_irqfd:
CPU0                                CPU1
----                                ----
eventfd_release()
  wake_up_poll(EPOLLHUP)
    hsm_irqfd_wakeup()
      queue_work(&irqfd->shutdown)
                                    acrn_irqfd_deassign()
                                      hsm_irqfd_shutdown()
                                        list_del_init()
                                        eventfd_ctx_remove_wait_queue()
                                        eventfd_ctx_put()
                                        kfree(irqfd)
hsm_irqfd_shutdown_work()
  container_of(work, ..., shutdown)
  irqfd->vm   <-- use-after-free
The deassign path freed the irqfd while a shutdown work item was
already queued by EPOLLHUP (or vice versa), so the work item could
resurrect a dangling pointer through container_of().
Switch to the lifetime model used by KVM irqfds:
- Deassign/deinit only deactivate the irqfd: remove it from vm->irqfds
under irqfds_lock and queue the cleanup work.
- hsm_irqfd_shutdown_work() becomes the sole owner that unhooks the
eventfd waitqueue entry, drops the eventfd reference and frees the
irqfd.
- A new HSM_IRQFD_FLAG_SHUTDOWN bit guarded by test_and_set_bit()
ensures the cleanup work is queued at most once, no matter how many
of {EPOLLHUP, deassign, deinit} fire concurrently. This is safe to
call from the waitqueue callback, which runs with wqh->lock held and
IRQs disabled and therefore cannot take irqfds_lock.
- acrn_irqfd_deassign() flushes vm->irqfd_wq before returning so the
eventfd is fully detached on return. acrn_irqfd_deinit() deactivates
every irqfd, flushes the workqueue and only then destroys it, so no
path can queue_work() onto a torn-down workqueue.
- acrn_irqfd_assign() now installs the eventfd waitqueue entry and
publishes the irqfd to vm->irqfds under irqfds_lock, so the irqfd is
never visible to deassign/deinit before its waitqueue entry is in
place, and any EPOLLHUP that fires in the assign window queues
cleanup work that blocks on irqfds_lock until publication is done.
Signed-off-by: Sicong Huang <congei42@163.com>
---
drivers/virt/acrn/irqfd.c | 71 ++++++++++++++++++++++++---------------
1 file changed, 44 insertions(+), 27 deletions(-)
diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
index acf8cd5f8f8c..feeba7eda494 100644
--- a/drivers/virt/acrn/irqfd.c
+++ b/drivers/virt/acrn/irqfd.c
@@ -16,6 +16,9 @@
#include "acrn_drv.h"
+/* Cleanup work has been queued; set via test_and_set_bit(). */
+#define HSM_IRQFD_FLAG_SHUTDOWN 0
+
/**
* struct hsm_irqfd - Properties of HSM irqfd
* @vm: Associated VM pointer
@@ -25,6 +28,7 @@
* @list: Entry within &acrn_vm.irqfds of irqfds of a VM
* @pt: Structure for select/poll on the associated eventfd
* @msi: MSI data
+ * @flags: Internal lifecycle flags (HSM_IRQFD_FLAG_*)
*/
struct hsm_irqfd {
struct acrn_vm *vm;
@@ -34,6 +38,7 @@ struct hsm_irqfd {
struct list_head list;
poll_table pt;
struct acrn_msi_entry msi;
+ unsigned long flags;
};
static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
@@ -44,30 +49,29 @@ static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
irqfd->msi.msi_data);
}
-static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
+/* Queue the cleanup work at most once. Safe from atomic context. */
+static void hsm_irqfd_queue_shutdown(struct hsm_irqfd *irqfd)
{
- u64 cnt;
-
- lockdep_assert_held(&irqfd->vm->irqfds_lock);
-
- /* remove from wait queue */
- list_del_init(&irqfd->list);
- eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
- eventfd_ctx_put(irqfd->eventfd);
- kfree(irqfd);
+ if (!test_and_set_bit(HSM_IRQFD_FLAG_SHUTDOWN, &irqfd->flags))
+ queue_work(irqfd->vm->irqfd_wq, &irqfd->shutdown);
}
+/* Sole owner of @irqfd: unhook waitqueue, drop eventfd ref, free. */
static void hsm_irqfd_shutdown_work(struct work_struct *work)
{
- struct hsm_irqfd *irqfd;
- struct acrn_vm *vm;
+ struct hsm_irqfd *irqfd = container_of(work, struct hsm_irqfd,
+ shutdown);
+ struct acrn_vm *vm = irqfd->vm;
+ u64 cnt;
- irqfd = container_of(work, struct hsm_irqfd, shutdown);
- vm = irqfd->vm;
mutex_lock(&vm->irqfds_lock);
if (!list_empty(&irqfd->list))
- hsm_irqfd_shutdown(irqfd);
+ list_del_init(&irqfd->list);
mutex_unlock(&vm->irqfds_lock);
+
+ eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
+ eventfd_ctx_put(irqfd->eventfd);
+ kfree(irqfd);
}
/* Called with wqh->lock held and interrupts disabled */
@@ -76,17 +80,16 @@ static int hsm_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
{
unsigned long poll_bits = (unsigned long)key;
struct hsm_irqfd *irqfd;
- struct acrn_vm *vm;
irqfd = container_of(wait, struct hsm_irqfd, wait);
- vm = irqfd->vm;
+
if (poll_bits & POLLIN)
/* An event has been signaled, inject an interrupt */
acrn_irqfd_inject(irqfd);
if (poll_bits & POLLHUP)
- /* Do shutdown work in thread to hold wqh->lock */
- queue_work(vm->irqfd_wq, &irqfd->shutdown);
+ /* Defer teardown to the cleanup work; can't sleep here. */
+ hsm_irqfd_queue_shutdown(irqfd);
return 0;
}
@@ -142,6 +145,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
init_waitqueue_func_entry(&irqfd->wait, hsm_irqfd_wakeup);
init_poll_funcptr(&irqfd->pt, hsm_irqfd_poll_func);
+ /*
+ * Hold irqfds_lock across waitqueue install and list_add so the
+ * irqfd is not visible to deassign/deinit before its waitqueue
+ * entry is in place, and any racing EPOLLHUP cleanup work blocks
+ * on irqfds_lock until publication completes.
+ */
mutex_lock(&vm->irqfds_lock);
list_for_each_entry(tmp, &vm->irqfds, list) {
if (irqfd->eventfd != tmp->eventfd)
@@ -150,14 +159,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
mutex_unlock(&vm->irqfds_lock);
goto fail;
}
- list_add_tail(&irqfd->list, &vm->irqfds);
- mutex_unlock(&vm->irqfds_lock);
- /* Check the pending event in this stage */
events = vfs_poll(fd_file(f), &irqfd->pt);
-
+ list_add_tail(&irqfd->list, &vm->irqfds);
if (events & EPOLLIN)
acrn_irqfd_inject(irqfd);
+ mutex_unlock(&vm->irqfds_lock);
return 0;
fail:
@@ -180,13 +187,17 @@ static int acrn_irqfd_deassign(struct acrn_vm *vm,
mutex_lock(&vm->irqfds_lock);
list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) {
if (irqfd->eventfd == eventfd) {
- hsm_irqfd_shutdown(irqfd);
+ list_del_init(&irqfd->list);
+ hsm_irqfd_queue_shutdown(irqfd);
break;
}
}
mutex_unlock(&vm->irqfds_lock);
eventfd_ctx_put(eventfd);
+ /* Wait for cleanup work to finish so the eventfd is fully detached. */
+ flush_workqueue(vm->irqfd_wq);
+
return 0;
}
@@ -219,9 +230,15 @@ void acrn_irqfd_deinit(struct acrn_vm *vm)
struct hsm_irqfd *irqfd, *next;
dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid);
- destroy_workqueue(vm->irqfd_wq);
+
mutex_lock(&vm->irqfds_lock);
- list_for_each_entry_safe(irqfd, next, &vm->irqfds, list)
- hsm_irqfd_shutdown(irqfd);
+ list_for_each_entry_safe(irqfd, next, &vm->irqfds, list) {
+ list_del_init(&irqfd->list);
+ hsm_irqfd_queue_shutdown(irqfd);
+ }
mutex_unlock(&vm->irqfds_lock);
+
+ /* Drain all cleanup work before tearing the workqueue down. */
+ flush_workqueue(vm->irqfd_wq);
+ destroy_workqueue(vm->irqfd_wq);
}
--
2.34.1
On 2026-05-19 at 19:20:18 +0800, Sicong Huang wrote:
> acrn_irqfd_deassign() and the eventfd EPOLLHUP wakeup can race and free
> the same struct hsm_irqfd:
>
> CPU0                                CPU1
> ----                                ----
> eventfd_release()
>   wake_up_poll(EPOLLHUP)
>     hsm_irqfd_wakeup()
>       queue_work(&irqfd->shutdown)
>                                     acrn_irqfd_deassign()
>                                       hsm_irqfd_shutdown()
>                                         list_del_init()
>                                         eventfd_ctx_remove_wait_queue()
>                                         eventfd_ctx_put()
>                                         kfree(irqfd)
> hsm_irqfd_shutdown_work()
>   container_of(work, ..., shutdown)
>   irqfd->vm   <-- use-after-free
>
> The deassign path freed the irqfd while a shutdown work item was
> already queued by EPOLLHUP (or vice versa), so the work item could
> resurrect a dangling pointer through container_of().
>
> Switch to the lifetime model used by KVM irqfds:
>
> - Deassign/deinit only deactivate the irqfd: remove it from vm->irqfds
> under irqfds_lock and queue the cleanup work.
> - hsm_irqfd_shutdown_work() becomes the sole owner that unhooks the
> eventfd waitqueue entry, drops the eventfd reference and frees the
> irqfd.
> - A new HSM_IRQFD_FLAG_SHUTDOWN bit guarded by test_and_set_bit()
> ensures the cleanup work is queued at most once, no matter how many
> of {EPOLLHUP, deassign, deinit} fire concurrently. This is safe to
> call from the waitqueue callback, which runs with wqh->lock held and
> IRQs disabled and therefore cannot take irqfds_lock.
> - acrn_irqfd_deassign() flushes vm->irqfd_wq before returning so the
> eventfd is fully detached on return. acrn_irqfd_deinit() deactivates
> every irqfd, flushes the workqueue and only then destroys it, so no
> path can queue_work() onto a torn-down workqueue.
> - acrn_irqfd_assign() now installs the eventfd waitqueue entry and
> publishes the irqfd to vm->irqfds under irqfds_lock, so the irqfd is
> never visible to deassign/deinit before its waitqueue entry is in
> place, and any EPOLLHUP that fires in the assign window queues
> cleanup work that blocks on irqfds_lock until publication is done.
>
> Signed-off-by: Sicong Huang <congei42@163.com>
Hi Sicong,
Thanks, v2 looks much better to me.
Reviewed-by: Fei Li <fei1.li@intel.com>
> ---
> drivers/virt/acrn/irqfd.c | 71 ++++++++++++++++++++++++---------------
> 1 file changed, 44 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
> index acf8cd5f8f8c..feeba7eda494 100644
> --- a/drivers/virt/acrn/irqfd.c
> +++ b/drivers/virt/acrn/irqfd.c
> @@ -16,6 +16,9 @@
>
> #include "acrn_drv.h"
>
> +/* Cleanup work has been queued; set via test_and_set_bit(). */
> +#define HSM_IRQFD_FLAG_SHUTDOWN 0
> +
> /**
> * struct hsm_irqfd - Properties of HSM irqfd
> * @vm: Associated VM pointer
> @@ -25,6 +28,7 @@
> * @list: Entry within &acrn_vm.irqfds of irqfds of a VM
> * @pt: Structure for select/poll on the associated eventfd
> * @msi: MSI data
> + * @flags: Internal lifecycle flags (HSM_IRQFD_FLAG_*)
> */
> struct hsm_irqfd {
> struct acrn_vm *vm;
> @@ -34,6 +38,7 @@ struct hsm_irqfd {
> struct list_head list;
> poll_table pt;
> struct acrn_msi_entry msi;
> + unsigned long flags;
> };
>
> static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
> @@ -44,30 +49,29 @@ static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
> irqfd->msi.msi_data);
> }
>
> -static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
> +/* Queue the cleanup work at most once. Safe from atomic context. */
> +static void hsm_irqfd_queue_shutdown(struct hsm_irqfd *irqfd)
> {
> - u64 cnt;
> -
> - lockdep_assert_held(&irqfd->vm->irqfds_lock);
> -
> - /* remove from wait queue */
> - list_del_init(&irqfd->list);
> - eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
> - eventfd_ctx_put(irqfd->eventfd);
> - kfree(irqfd);
> + if (!test_and_set_bit(HSM_IRQFD_FLAG_SHUTDOWN, &irqfd->flags))
> + queue_work(irqfd->vm->irqfd_wq, &irqfd->shutdown);
> }
>
> +/* Sole owner of @irqfd: unhook waitqueue, drop eventfd ref, free. */
> static void hsm_irqfd_shutdown_work(struct work_struct *work)
> {
> - struct hsm_irqfd *irqfd;
> - struct acrn_vm *vm;
> + struct hsm_irqfd *irqfd = container_of(work, struct hsm_irqfd,
> + shutdown);
> + struct acrn_vm *vm = irqfd->vm;
> + u64 cnt;
>
> - irqfd = container_of(work, struct hsm_irqfd, shutdown);
> - vm = irqfd->vm;
> mutex_lock(&vm->irqfds_lock);
> if (!list_empty(&irqfd->list))
> - hsm_irqfd_shutdown(irqfd);
> + list_del_init(&irqfd->list);
> mutex_unlock(&vm->irqfds_lock);
> +
> + eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
> + eventfd_ctx_put(irqfd->eventfd);
> + kfree(irqfd);
> }
>
> /* Called with wqh->lock held and interrupts disabled */
> @@ -76,17 +80,16 @@ static int hsm_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
> {
> unsigned long poll_bits = (unsigned long)key;
> struct hsm_irqfd *irqfd;
> - struct acrn_vm *vm;
>
> irqfd = container_of(wait, struct hsm_irqfd, wait);
> - vm = irqfd->vm;
> +
> if (poll_bits & POLLIN)
> /* An event has been signaled, inject an interrupt */
> acrn_irqfd_inject(irqfd);
>
> if (poll_bits & POLLHUP)
> - /* Do shutdown work in thread to hold wqh->lock */
> - queue_work(vm->irqfd_wq, &irqfd->shutdown);
> + /* Defer teardown to the cleanup work; can't sleep here. */
> + hsm_irqfd_queue_shutdown(irqfd);
>
> return 0;
> }
> @@ -142,6 +145,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
> init_waitqueue_func_entry(&irqfd->wait, hsm_irqfd_wakeup);
> init_poll_funcptr(&irqfd->pt, hsm_irqfd_poll_func);
>
> + /*
> + * Hold irqfds_lock across waitqueue install and list_add so the
> + * irqfd is not visible to deassign/deinit before its waitqueue
> + * entry is in place, and any racing EPOLLHUP cleanup work blocks
> + * on irqfds_lock until publication completes.
> + */
> mutex_lock(&vm->irqfds_lock);
> list_for_each_entry(tmp, &vm->irqfds, list) {
> if (irqfd->eventfd != tmp->eventfd)
> @@ -150,14 +159,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
> mutex_unlock(&vm->irqfds_lock);
> goto fail;
> }
> - list_add_tail(&irqfd->list, &vm->irqfds);
> - mutex_unlock(&vm->irqfds_lock);
>
> - /* Check the pending event in this stage */
> events = vfs_poll(fd_file(f), &irqfd->pt);
> -
> + list_add_tail(&irqfd->list, &vm->irqfds);
> if (events & EPOLLIN)
> acrn_irqfd_inject(irqfd);
> + mutex_unlock(&vm->irqfds_lock);
>
> return 0;
> fail:
> @@ -180,13 +187,17 @@ static int acrn_irqfd_deassign(struct acrn_vm *vm,
> mutex_lock(&vm->irqfds_lock);
> list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) {
> if (irqfd->eventfd == eventfd) {
> - hsm_irqfd_shutdown(irqfd);
> + list_del_init(&irqfd->list);
> + hsm_irqfd_queue_shutdown(irqfd);
> break;
> }
> }
> mutex_unlock(&vm->irqfds_lock);
> eventfd_ctx_put(eventfd);
>
> + /* Wait for cleanup work to finish so the eventfd is fully detached. */
> + flush_workqueue(vm->irqfd_wq);
> +
> return 0;
> }
>
> @@ -219,9 +230,15 @@ void acrn_irqfd_deinit(struct acrn_vm *vm)
> struct hsm_irqfd *irqfd, *next;
>
> dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid);
> - destroy_workqueue(vm->irqfd_wq);
> +
> mutex_lock(&vm->irqfds_lock);
> - list_for_each_entry_safe(irqfd, next, &vm->irqfds, list)
> - hsm_irqfd_shutdown(irqfd);
> + list_for_each_entry_safe(irqfd, next, &vm->irqfds, list) {
> + list_del_init(&irqfd->list);
> + hsm_irqfd_queue_shutdown(irqfd);
> + }
> mutex_unlock(&vm->irqfds_lock);
> +
> + /* Drain all cleanup work before tearing the workqueue down. */
> + flush_workqueue(vm->irqfd_wq);
> + destroy_workqueue(vm->irqfd_wq);
> }
> --
> 2.34.1
>
Hi Sicong,
kernel test robot noticed the following build warnings:
[auto build test WARNING on linus/master]
[also build test WARNING on v7.1-rc3 next-20260508]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Sicong-Huang/virt-acrn-Fix-irqfd-use-after-free-during-async-shutdown/20260515-063914
base: linus/master
patch link: https://lore.kernel.org/r/20260511135737.2285411-1-congei42%40163.com
patch subject: [PATCH v1] virt: acrn: Fix irqfd use-after-free during async shutdown
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20260515/202605151636.lR13SqEN-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260515/202605151636.lR13SqEN-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605151636.lR13SqEN-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> drivers/virt/acrn/irqfd.c:232:28: warning: unused variable 'next' [-Wunused-variable]
232 | struct hsm_irqfd *irqfd, *next;
| ^~~~
1 warning generated.
vim +/next +232 drivers/virt/acrn/irqfd.c
aa3b483ff1d71c5 Shuo Liu 2021-02-07 229
aa3b483ff1d71c5 Shuo Liu 2021-02-07 230 void acrn_irqfd_deinit(struct acrn_vm *vm)
aa3b483ff1d71c5 Shuo Liu 2021-02-07 231 {
aa3b483ff1d71c5 Shuo Liu 2021-02-07 @232 struct hsm_irqfd *irqfd, *next;
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 2026-05-11 at 21:57:37 +0800, Sicong Huang wrote:
> ACRN irqfd registers a custom waitqueue callback on the eventfd. When the
> eventfd is released, eventfd_release() wakes the waitqueue with EPOLLHUP,
> and hsm_irqfd_wakeup() queues irqfd->shutdown on vm->irqfd_wq.
>
> The irqfd object can also be removed by ACRN_IOCTL_IRQFD with
> ACRN_IRQFD_FLAG_DEASSIGN. In that path, acrn_irqfd_deassign() removes the
> waitqueue entry and frees the hsm_irqfd object.
>
> These two paths can race. If EPOLLHUP queues the shutdown work before
> deassign frees the object, the work item may run after kfree() and recover
> the freed hsm_irqfd via container_of(). It then dereferences irqfd->vm while
> taking irqfds_lock.
>
> A possible race is:
> CPU0                                CPU1
> eventfd_release()
>   wake_up_poll(EPOLLHUP)
>     hsm_irqfd_wakeup()
>       queue_work(&irqfd->shutdown)
>                                     acrn_irqfd_deassign()
>                                       hsm_irqfd_shutdown()
>                                         list_del_init()
>                                         eventfd_ctx_remove_wait_queue()
>                                         kfree(irqfd) //free here!
> hsm_irqfd_shutdown_work()
>   irqfd = container_of(work, ...)
>   vm = irqfd->vm //UAF!
>
> Fix this by separating logical shutdown from object release. First remove
> the irqfd from the VM list and eventfd waitqueue, then synchronously
> cancel any pending/running shutdown work before freeing the object.
> Also tear down irqfds before destroying the irqfd workqueue, so
> eventfd wakeups cannot queue work after the workqueue has been destroyed.
>
> Signed-off-by: Sicong Huang <congei42@163.com>
Hi Sicong,
Thanks for fixing this. The race looks real to me: `EPOLLHUP` can queue
`irqfd->shutdown`, while the deassign path can remove the waitqueue entry and
free the same irqfd before the work item runs.
The direction is good, but I think the lifetime rules can be made simpler. In
this version the final `kfree()` can still come from several places, with
`list_empty()` and `cancel_work_sync()` deciding who wins. That may work, but
it is harder to audit than necessary.
Could this follow the KVM irqfd pattern more closely? Deassign/deinit would
only deactivate the irqfd, i.e. remove it from the active list and queue cleanup
once. The cleanup work would then be the only place that removes the eventfd
waitqueue entry, drops the eventfd ref, and frees `struct hsm_irqfd`. The
synchronous paths can still flush the workqueue before returning or before
destroying `vm->irqfd_wq`.
Two smaller comments:
- `next` is left unused in `acrn_irqfd_deinit()` after this change.
- It may be worth tightening the assign path too, so an irqfd is not visible on
`vm->irqfds` before its eventfd waitqueue entry has been installed.
Thanks,
Fei
> ---
> drivers/virt/acrn/irqfd.c | 47 ++++++++++++++++++++++++++++++---------
> 1 file changed, 37 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
> index acf8cd5f8f8c..659fd40d9aa5 100644
> --- a/drivers/virt/acrn/irqfd.c
> +++ b/drivers/virt/acrn/irqfd.c
> @@ -44,30 +44,37 @@ static void acrn_irqfd_inject(struct hsm_irqfd *irqfd)
> irqfd->msi.msi_data);
> }
>
> -static void hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
> +static bool hsm_irqfd_shutdown(struct hsm_irqfd *irqfd)
> {
> u64 cnt;
>
> lockdep_assert_held(&irqfd->vm->irqfds_lock);
>
> + if (list_empty(&irqfd->list))
> + return false;
> +
> /* remove from wait queue */
> list_del_init(&irqfd->list);
> eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
> eventfd_ctx_put(irqfd->eventfd);
> - kfree(irqfd);
> +
> + return true;
> }
>
> static void hsm_irqfd_shutdown_work(struct work_struct *work)
> {
> struct hsm_irqfd *irqfd;
> struct acrn_vm *vm;
> + bool free;
>
> irqfd = container_of(work, struct hsm_irqfd, shutdown);
> vm = irqfd->vm;
> mutex_lock(&vm->irqfds_lock);
> - if (!list_empty(&irqfd->list))
> - hsm_irqfd_shutdown(irqfd);
> + free = hsm_irqfd_shutdown(irqfd);
> mutex_unlock(&vm->irqfds_lock);
> +
> + if (free)
> + kfree(irqfd);
> }
>
> /* Called with wqh->lock held and interrupts disabled */
> @@ -170,7 +177,7 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
> static int acrn_irqfd_deassign(struct acrn_vm *vm,
> struct acrn_irqfd *args)
> {
> - struct hsm_irqfd *irqfd, *tmp;
> + struct hsm_irqfd *irqfd, *tmp, *to_free = NULL;
> struct eventfd_ctx *eventfd;
>
> eventfd = eventfd_ctx_fdget(args->fd);
> @@ -180,13 +187,19 @@ static int acrn_irqfd_deassign(struct acrn_vm *vm,
> mutex_lock(&vm->irqfds_lock);
> list_for_each_entry_safe(irqfd, tmp, &vm->irqfds, list) {
> if (irqfd->eventfd == eventfd) {
> - hsm_irqfd_shutdown(irqfd);
> + if (hsm_irqfd_shutdown(irqfd))
> + to_free = irqfd;
> break;
> }
> }
> mutex_unlock(&vm->irqfds_lock);
> eventfd_ctx_put(eventfd);
>
> + if (to_free) {
> + cancel_work_sync(&to_free->shutdown);
> + kfree(to_free);
> + }
> +
> return 0;
> }
>
> @@ -219,9 +232,23 @@ void acrn_irqfd_deinit(struct acrn_vm *vm)
> struct hsm_irqfd *irqfd, *next;
>
> dev_dbg(acrn_dev.this_device, "VM %u irqfd deinit.\n", vm->vmid);
> +
> + for (;;) {
> + irqfd = NULL;
> +
> + mutex_lock(&vm->irqfds_lock);
> + if (!list_empty(&vm->irqfds)) {
> + irqfd = list_first_entry(&vm->irqfds, struct hsm_irqfd, list);
> + hsm_irqfd_shutdown(irqfd);
> + }
> + mutex_unlock(&vm->irqfds_lock);
> +
> + if (!irqfd)
> + break;
> +
> + cancel_work_sync(&irqfd->shutdown);
> + kfree(irqfd);
> + }
> +
> destroy_workqueue(vm->irqfd_wq);
> - mutex_lock(&vm->irqfds_lock);
> - list_for_each_entry_safe(irqfd, next, &vm->irqfds, list)
> - hsm_irqfd_shutdown(irqfd);
> - mutex_unlock(&vm->irqfds_lock);
> }
> --
> 2.34.1
>
© 2016 - 2026 Red Hat, Inc.