arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/xen.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+)
On PREEMPT_RT, kvm_xen_set_evtchn_fast() acquires a sleeping lock
(gpc->lock) from hard IRQ context (xen_timer_callback), triggering:
BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 0, name: swapper/5
preempt_count: 10100, expected: 0
RCU nest depth: 0, expected: 0
4 locks held by swapper/5/0:
INFO: lockdep is turned off.
irq event stamp: 1766
hardirqs last enabled at (1765): [<ffffffff81678fd4>] tick_nohz_idle_got_tick+0x84/0x90
hardirqs last disabled at (1766): [<ffffffff8b665051>] sysvec_apic_timer_interrupt+0x11/0xd0
softirqs last enabled at (0): [<ffffffff81289e76>] copy_process+0x1586/0x58b0
softirqs last disabled at (0): [<0000000000000000>] 0x0
Preempt disabled at:
[<ffffffff8b6650bc>] sysvec_apic_timer_interrupt+0x7c/0xd0
CPU: 5 UID: 0 PID: 0 Comm: swapper/5 Not tainted 6.13.0-rc1-syzkaller-00026-g2d5404caa8c7 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:94 [inline]
dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120
__might_resched+0x30d/0x8f0 kernel/sched/core.c:10318
rt_spin_lock+0x70/0x130 kernel/locking/spinlock_rt.c:48
kvm_xen_set_evtchn_fast+0x20b/0xa40 arch/x86/kvm/xen.c:1820
xen_timer_callback+0x91/0x1a0 arch/x86/kvm/xen.c:142
__run_hrtimer kernel/time/hrtimer.c:1739 [inline]
__hrtimer_run_queues+0x20b/0xa00 kernel/time/hrtimer.c:1803
The Xen timer uses HRTIMER_MODE_ABS_HARD for latency-sensitive event
delivery (see commit 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen
timer delivery")). On PREEMPT_RT, hard IRQ hrtimers execute in hard IRQ
context where sleeping locks cannot be acquired.
Use irq_work to defer event injection to a context where sleeping locks
are permitted on PREEMPT_RT. This preserves the hard IRQ timer precision
on non-RT kernels while avoiding the lock context violation on RT.
The approach follows the existing pvclock_irq_work pattern in
arch/x86/kvm/x86.c.
Tested on PREEMPT_RT kernel (CONFIG_PREEMPT_RT=y) with the syzbot C
reproducer - no crash observed after 30+ minutes of continuous execution.
Also tested on non-RT kernel (CONFIG_PREEMPT_RT=n) to verify no
regression in the fast path.
Reported-by: syzbot+919877893c9d28162dc2@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=919877893c9d28162dc2
Fixes: 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen timer delivery")
Signed-off-by: shaikh.kamal <shaikhkamal2012@gmail.com>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/xen.c | 31 +++++++++++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5a3bfa293e8b..533b45289d53 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -746,6 +746,7 @@ struct kvm_vcpu_xen {
u64 timer_expires; /* In guest epoch */
atomic_t timer_pending;
struct hrtimer timer;
+ struct irq_work timer_inject_irqwork;
int poll_evtchn;
struct timer_list poll_timer;
struct kvm_hypervisor_cpuid cpuid;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index d6b2a665b499..01fa7b165355 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -122,6 +122,24 @@ void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
}
}
+static void xen_timer_inject_irqwork(struct irq_work *work)
+{
+ struct kvm_vcpu_xen *xen = container_of(work, struct kvm_vcpu_xen,
+ timer_inject_irqwork);
+ struct kvm_vcpu *vcpu = container_of(xen, struct kvm_vcpu, arch.xen);
+ struct kvm_xen_evtchn e;
+ int rc;
+
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
+ if (rc != -EWOULDBLOCK)
+ vcpu->arch.xen.timer_expires = 0;
+}
+
static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
{
struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
@@ -132,6 +150,17 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
if (atomic_read(&vcpu->arch.xen.timer_pending))
return HRTIMER_NORESTART;
+ /*
+ * On PREEMPT_RT, this callback runs in hard IRQ context where
+ * kvm_xen_set_evtchn_fast() cannot acquire sleeping locks
+ * (specifically gpc->lock). Defer to irq_work which runs in
+ * thread context on RT.
+ */
+ if (in_hardirq()) {
+ irq_work_queue(&vcpu->arch.xen.timer_inject_irqwork);
+ return HRTIMER_NORESTART;
+ }
+
e.vcpu_id = vcpu->vcpu_id;
e.vcpu_idx = vcpu->vcpu_idx;
e.port = vcpu->arch.xen.timer_virq;
@@ -2303,6 +2332,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_HARD);
+ init_irq_work(&vcpu->arch.xen.timer_inject_irqwork,
+ xen_timer_inject_irqwork);
kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
--
2.43.0
On Sun, 29 Mar 2026 18:45:43 +0530
"shaikh.kamal" <shaikhkamal2012@gmail.com> wrote:
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -122,6 +122,24 @@ void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
> }
> }
>
> +static void xen_timer_inject_irqwork(struct irq_work *work)
> +{
> + struct kvm_vcpu_xen *xen = container_of(work, struct kvm_vcpu_xen,
> + timer_inject_irqwork);
> + struct kvm_vcpu *vcpu = container_of(xen, struct kvm_vcpu, arch.xen);
> + struct kvm_xen_evtchn e;
> + int rc;
> +
> + e.vcpu_id = vcpu->vcpu_id;
> + e.vcpu_idx = vcpu->vcpu_idx;
> + e.port = vcpu->arch.xen.timer_virq;
> + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
> +
> + rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
> + if (rc != -EWOULDBLOCK)
> + vcpu->arch.xen.timer_expires = 0;
> +}
Why duplicate this code and not simply make a static inline helper
function that is used in both places?
-- Steve
> +
> static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
> {
> struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
> @@ -132,6 +150,17 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
> if (atomic_read(&vcpu->arch.xen.timer_pending))
> return HRTIMER_NORESTART;
>
> + /*
> + * On PREEMPT_RT, this callback runs in hard IRQ context where
> + * kvm_xen_set_evtchn_fast() cannot acquire sleeping locks
> + * (specifically gpc->lock). Defer to irq_work which runs in
> + * thread context on RT.
> + */
> + if (in_hardirq()) {
> + irq_work_queue(&vcpu->arch.xen.timer_inject_irqwork);
> + return HRTIMER_NORESTART;
> + }
> +
> e.vcpu_id = vcpu->vcpu_id;
> e.vcpu_idx = vcpu->vcpu_idx;
> e.port = vcpu->arch.xen.timer_virq;
On Mon, 2026-03-30 at 10:18 -0400, Steven Rostedt wrote:
>
> > +static void xen_timer_inject_irqwork(struct irq_work *work)
> > +{
> > + struct kvm_vcpu_xen *xen = container_of(work, struct kvm_vcpu_xen,
> > + timer_inject_irqwork);
> > + struct kvm_vcpu *vcpu = container_of(xen, struct kvm_vcpu, arch.xen);
> > + struct kvm_xen_evtchn e;
> > + int rc;
> > +
> > + e.vcpu_id = vcpu->vcpu_id;
> > + e.vcpu_idx = vcpu->vcpu_idx;
> > + e.port = vcpu->arch.xen.timer_virq;
> > + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
> > +
> > + rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
> > + if (rc != -EWOULDBLOCK)
> > + vcpu->arch.xen.timer_expires = 0;
> > +}
>
> Why duplicate this code and not simply make a static inline helper
> function that is used in both places?
It's already duplicating the functionality; the original
xen_timer_callback() will already fall back to injecting the IRQ in
process context when it needs to (by setting vcpu->arch.xen.timer_pending
and then setting KVM_REQ_UNBLOCK).
All you had to do was make kvm_xen_set_evtchn_fast() return
-EWOULDBLOCK in the in_hardirq() case in order to use the existing
fallback, surely?
Better still, can't kvm_xen_set_evtchn_fast() just use read_trylock()
instead?
Amazon Development Centre (London) Ltd. Registered in England and Wales with registration number 04543232 with its registered office at 1 Principal Place, Worship Street, London EC2A 2FA, United Kingdom.
On Mon, Mar 30, 2026, David Woodhouse wrote:
> On Mon, 2026-03-30 at 10:18 -0400, Steven Rostedt wrote:
> >
> > > +static void xen_timer_inject_irqwork(struct irq_work *work)
> > > +{
> > > + struct kvm_vcpu_xen *xen = container_of(work, struct kvm_vcpu_xen,
> > > + timer_inject_irqwork);
> > > + struct kvm_vcpu *vcpu = container_of(xen, struct kvm_vcpu, arch.xen);
> > > + struct kvm_xen_evtchn e;
> > > + int rc;
> > > +
> > > + e.vcpu_id = vcpu->vcpu_id;
> > > + e.vcpu_idx = vcpu->vcpu_idx;
> > > + e.port = vcpu->arch.xen.timer_virq;
> > > + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
> > > +
> > > + rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
> > > + if (rc != -EWOULDBLOCK)
> > > + vcpu->arch.xen.timer_expires = 0;
> > > +}
> >
> > Why duplicate this code and not simply make a static inline helper
> > function that is used in both places?
>
> It's already duplicating the functionality; the original
> xen_timer_callback() will already fall back to injecting the IRQ in
> process context when it needs to (by setting vcpu-
> >arch.xen.timer_pending and then setting KVM_REQ_UNBLOCK).
>
> All you had to do was make kvm_xen_set_evtchn_fast() return
> -EWOULDBLOCK in the in_hardirq() case in order to use the existing
> fallback, surely?
>
> Better still, can't kvm_xen_set_evtchn_fast() just use read_trylock()
> instead?
Re-reading through the thread where you proposed using trylock, and through
commit bbe17c625d68 ("KVM: x86/xen: Fix potential deadlock in kvm_xen_update_runstate_guest()"),
I think I agree with using trylock for "fast" paths.
Though I would prefer to make it unconditional for the "fast" helper instead
of conditional based on in_interrupt(). And before we start doing surgery to
"fix" a setup no one uses, and also before we use gpcs more broadly, I think we
should try to up-level the gpc APIs to reduce the amount of duplicate, boilerplate
code. kvm_xen_update_runstate_guest() and maybe kvm_xen_set_evtchn() will likely
need to open code some amount of logic, but
Side topic, looks like kvm_xen_shared_info_init() is buggy in that it fails to
mark the slot as dirty.
E.g. sans the API implementations, I think we can and should end up with code
like this:
---
arch/x86/kvm/x86.c | 14 ++---
arch/x86/kvm/xen.c | 127 ++++++++++++---------------------------------
2 files changed, 37 insertions(+), 104 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b5d48e75b65..65bad25fd9d4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3274,15 +3274,8 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ if (kvm_gpc_acquire(gpc))
+ return;
guest_hv_clock = (void *)(gpc->khva + offset);
@@ -3305,8 +3298,7 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
guest_hv_clock->version = ++hv_clock.version;
- kvm_gpc_mark_dirty_in_slot(gpc);
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_dirty(gpc);
trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 91fd3673c09a..a97fd88ee99c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -42,19 +42,12 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
u32 *wc_sec_hi;
u32 wc_version;
u64 wall_nsec;
- int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
+ int ret;
- read_lock_irq(&gpc->lock);
- while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
- read_unlock_irq(&gpc->lock);
-
- ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
- if (ret)
- goto out;
-
- read_lock_irq(&gpc->lock);
- }
+ ret = kvm_gpc_acquire(gpc);
+ if (ret)
+ goto out;
/*
* This code mirrors kvm_write_wall_clock() except that it writes
@@ -96,7 +89,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
smp_wmb();
wc->version = wc_version + 1;
- read_unlock_irq(&gpc->lock);
+ kvm_gpc_release_dirty(gpc);
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
@@ -155,22 +148,14 @@ static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
struct gfn_to_pfn_cache *gpc,
unsigned int offset)
{
- unsigned long flags;
int r;
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
- if (r)
- return r;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ r = kvm_gpc_acquire(gpc);
+ if (r)
+ return r;
memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_clean(gpc);
/*
* Sanity check TSC shift+multiplier to verify the guest's view of time
@@ -420,27 +405,8 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* Attempt to obtain the GPC lock on *both* (if there are two)
* gfn_to_pfn caches that cover the region.
*/
- if (atomic) {
- local_irq_save(flags);
- if (!read_trylock(&gpc1->lock)) {
- local_irq_restore(flags);
- return;
- }
- } else {
- read_lock_irqsave(&gpc1->lock, flags);
- }
- while (!kvm_gpc_check(gpc1, user_len1)) {
- read_unlock_irqrestore(&gpc1->lock, flags);
-
- /* When invoked from kvm_sched_out() we cannot sleep */
- if (atomic)
- return;
-
- if (kvm_gpc_refresh(gpc1, user_len1))
- return;
-
- read_lock_irqsave(&gpc1->lock, flags);
- }
+ if (__kvm_gpc_acquire(gpc, atomic))
+ return;
if (likely(!user_len2)) {
/*
@@ -465,6 +431,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
* gpc1 lock to make lockdep shut up about it.
*/
lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+
if (atomic) {
if (!read_trylock(&gpc2->lock)) {
read_unlock_irqrestore(&gpc1->lock, flags);
@@ -575,13 +542,10 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
smp_wmb();
}
- if (user_len2) {
- kvm_gpc_mark_dirty_in_slot(gpc2);
- read_unlock(&gpc2->lock);
- }
+ if (user_len2)
+ kvm_gpc_release_dirty(gpc2);
- kvm_gpc_mark_dirty_in_slot(gpc1);
- read_unlock_irqrestore(&gpc1->lock, flags);
+ kvm_gpc_release_dirty(gpc1);
}
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -645,20 +609,8 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
if (!evtchn_pending_sel)
return;
- /*
- * Yes, this is an open-coded loop. But that's just what put_user()
- * does anyway. Page it in and retry the instruction. We're just a
- * little more honest about it.
- */
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ if (kvm_gpc_acquire(gpc))
+ return;
/* Now gpc->khva is a valid kernel address for the vcpu_info */
if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
@@ -686,8 +638,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}
- kvm_gpc_mark_dirty_in_slot(gpc);
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_dirty(gpc);
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
@@ -697,8 +648,8 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
- unsigned long flags;
u8 rc = 0;
+ int r;
/*
* If the global upcall vector (HVMIRQ_callback_vector) is set and
@@ -713,33 +664,23 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
BUILD_BUG_ON(sizeof(rc) !=
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- /*
- * This function gets called from kvm_vcpu_block() after setting the
- * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
- * from a HLT. So we really mustn't sleep. If the page ended up absent
- * at that point, just return 1 in order to trigger an immediate wake,
- * and we'll end up getting called again from a context where we *can*
- * fault in the page and wait for it.
- */
- if (in_atomic() || !task_is_running(current))
- return 1;
-
- if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
- /*
- * If this failed, userspace has screwed up the
- * vcpu_info mapping. No interrupts for you.
- */
- return 0;
- }
- read_lock_irqsave(&gpc->lock, flags);
- }
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ *
+ * If acquiring the cache fails completely, then userspace has screwed
+ * up the vcpu_info mapping. No interrupts for you.
+ */
+ r = __kvm_gpc_acquire(gpc, in_atomic() || !task_is_running(current));
+ if (r)
+ return r == -EWOULDBLOCK ? 1 : 0;
rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
- read_unlock_irqrestore(&gpc->lock, flags);
+ kvm_gpc_release_clean(gpc);
return rc;
}
base-commit: 3d6cdcc8883b5726513d245eef0e91cabfc397f7
--
[*] https://lore.kernel.org/all/76c61e1cb86e04df892d74c10976597700fe4cb5.camel@infradead.org
Hi Sean,

Thanks for the detailed feedback. I've implemented v2 using unconditional
read_trylock() as you suggested.

Changes in v2:
- Use read_trylock() unconditionally (no in_hardirq() check)
- Apply trylock to both gpc lock acquisitions
- Return -EWOULDBLOCK on contention to use existing slow path

Changes from v1:
- v1 used irq_work deferral (rejected for code duplication)
- v2 uses trylock, leveraging existing slow path (cleaner)

Testing:
- PREEMPT_RT kernel: No crash with syzbot reproducer after 30+ min
- non-RT kernel: No regression

This minimal fix preserves compatibility with your planned gpc API cleanup.

Thanks,
Kamal

shaikh.kamal (1):
  KVM: x86/xen: Use trylock for fast path event channel delivery

 arch/x86/kvm/xen.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

--
2.43.0
On 2026-04-01 08:40:07 [-0700], Sean Christopherson wrote:
> Re-reading through the thread where you proposed using trylock, and through
> commit bbe17c625d68 ("KVM: x86/xen: Fix potential deadlock in kvm_xen_update_runstate_guest()"),
> I think I agree with using trylock for "fast" paths.
Has bbe17c625d68 been applied for PREEMPT_RT reasons?
Sebastian
kvm_xen_set_evtchn_fast() acquires gpc->lock with read_lock_irqsave(),
which becomes a sleeping lock on PREEMPT_RT, triggering:
BUG: sleeping function called from invalid context
in_hardirq(): 1, in_serving_softirq(): 0
Call Trace:
<IRQ>
rt_spin_lock+0x70/0x130
kvm_xen_set_evtchn_fast+0x20b/0xa40
xen_timer_callback+0x91/0x1a0
__run_hrtimer
hrtimer_interrupt
when called from hard IRQ context (e.g., hrtimer callback).
The function uses read_lock_irqsave() to access two gpc structures:
shinfo_cache and vcpu_info_cache. On PREEMPT_RT, these rwlocks are
rt_mutex-based and cannot be acquired from hard IRQ context.
Use read_trylock() instead for both gpc lock acquisitions. If either
lock is contended, return -EWOULDBLOCK to trigger the existing slow
path: xen_timer_callback() sets vcpu->arch.xen.timer_pending, kicks
the vCPU with KVM_REQ_UNBLOCK, and the event gets injected from
process context via kvm_xen_inject_timer_irqs().
This approach works on all kernels (RT and non-RT) and preserves the
"fast path" semantics: acquire the lock only if immediately available,
otherwise bail out rather than blocking.
Reported-by: syzbot+919877893c9d28162dc2@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=919877893c9d28162dc2
Fixes: 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen timer delivery")
Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: shaikh.kamal <shaikhkamal2012@gmail.com>
---
arch/x86/kvm/xen.c | 33 +++++++++++++++++++++++++++++----
1 file changed, 29 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index d6b2a665b499..479e8f23a9c4 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1817,7 +1817,17 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
idx = srcu_read_lock(&kvm->srcu);
- read_lock_irqsave(&gpc->lock, flags);
+ /*
+ * Use trylock for the "fast" path. If the lock is contended,
+ * return -EWOULDBLOCK to use the slow path which injects the
+ * event from process context via timer_pending + KVM_REQ_UNBLOCK.
+ */
+ local_irq_save(flags);
+ if (!read_trylock(&gpc->lock)) {
+ local_irq_restore(flags);
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EWOULDBLOCK;
+ }
if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
@@ -1848,10 +1858,22 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
} else {
rc = 1; /* Delivered to the bitmap in shared_info. */
/* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock(&gpc->lock);
+ local_irq_restore(flags);
gpc = &vcpu->arch.xen.vcpu_info_cache;
- read_lock_irqsave(&gpc->lock, flags);
+ local_irq_save(flags);
+ if (!read_trylock(&gpc->lock)) {
+ /*
+ * Lock contended. Set the in-kernel pending flag
+ * and kick the vCPU to inject via the slow path.
+ */
+ local_irq_restore(flags);
+ if (!test_and_set_bit(port_word_bit,
+ &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+ goto out_kick;
+ }
if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
@@ -1885,7 +1907,10 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
}
out_rcu:
- read_unlock_irqrestore(&gpc->lock, flags);
+ read_unlock(&gpc->lock);
+ local_irq_restore(flags);
+
+ out_kick:
srcu_read_unlock(&kvm->srcu, idx);
if (kick_vcpu) {
--
2.43.0
On 2026-04-02 07:01:02 [+0530], shaikh.kamal wrote:
…
> The function uses read_lock_irqsave() to access two gpc structures:
> shinfo_cache and vcpu_info_cache. On PREEMPT_RT, these rwlocks are
> rt_mutex-based and cannot be acquired from hard IRQ context.
>
> Use read_trylock() instead for both gpc lock acquisitions. If either
> lock is contended, return -EWOULDBLOCK to trigger the existing slow
> path: xen_timer_callback() sets vcpu->arch.xen.timer_pending, kicks
> the vCPU with KVM_REQ_UNBLOCK, and the event gets injected from
> process context via kvm_xen_inject_timer_irqs().
>
> This approach works on all kernels (RT and non-RT) and preserves the
> "fast path" semantics: acquire the lock only if immediately available,
> otherwise bail out rather than blocking.
No. This split into local_irq_save() + trylock is something you must not
do. The fact that it does not lead to any warnings does not mean it is
good.
One problem is that your trylock will record the current task on the CPU
as the owner of this lock which can lead to odd lock chains if observed
by other tasks while trying to PI.
So no.
If this is just to shut up syskaller I would suggest to let xen depend
on !PREEMPT_RT until someone figures out what to do.
> Reported-by: syzbot+919877893c9d28162dc2@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=919877893c9d28162dc2
> Fixes: 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen timer delivery")
> Suggested-by: Sean Christopherson <seanjc@google.com>
>
> Signed-off-by: shaikh.kamal <shaikhkamal2012@gmail.com>
Sebastian
© 2016 - 2026 Red Hat, Inc.