In the save/restore path, if KVM_SET_NESTED_STATE is performed before
restoring REGS and/or SREGS, the values of CS and RIP used to
initialize the vmcb02's NextRIP and soft interrupt tracking RIPs are
incorrect.
Recalculate them after CS is set, or REGS are restored. This is only
needed when a nested run is pending during restore. After L2 runs for
the first time, any soft interrupts injected by L1 are already delivered
or tracked by KVM separately for re-injection, so the CS and RIP values
are no longer relevant.
If KVM_SET_NESTED_STATE is performed after both REGS and SREGS are
restored, it will just overwrite the fields.
Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE")
CC: stable@vger.kernel.org
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
---
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/svm/nested.c | 4 +++-
arch/x86/kvm/svm/svm.c | 21 +++++++++++++++++++++
arch/x86/kvm/x86.c | 2 ++
5 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index de709fb5bd76..7221517ea3e6 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -54,6 +54,7 @@ KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
KVM_X86_OP(set_rflags)
KVM_X86_OP(get_if_flag)
+KVM_X86_OP_OPTIONAL(post_user_set_regs)
KVM_X86_OP(flush_tlb_all)
KVM_X86_OP(flush_tlb_current)
#if IS_ENABLED(CONFIG_HYPERV)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ff07c45e3c73..feadd9579159 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1789,6 +1789,7 @@ struct kvm_x86_ops {
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
bool (*get_if_flag)(struct kvm_vcpu *vcpu);
+ void (*post_user_set_regs)(struct kvm_vcpu *vcpu);
void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index af7a0113f269..22680aa31c28 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -766,7 +766,9 @@ void nested_vmcb02_prepare_rips(struct kvm_vcpu *vcpu, unsigned long csbase,
else if (boot_cpu_has(X86_FEATURE_NRIPS))
svm->vmcb->control.next_rip = rip;
- if (!is_evtinj_soft(svm->nested.ctl.event_inj))
+ /* L1's injected events should be cleared after the first run of L2 */
+ if (!is_evtinj_soft(svm->nested.ctl.event_inj) ||
+ WARN_ON_ONCE(!svm->nested.nested_run_pending))
return;
svm->soft_int_injected = true;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 8f8bc863e214..5729da2b300d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1477,6 +1477,24 @@ static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
: kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
}
+static void svm_fixup_nested_rips(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * In the save/restore path, if nested state is restored before
+ * RIP or CS, then fixing up the vmcb02 (and soft IRQ tracking) is
+ * needed. This is only the case if a nested run is pending (i.e. L2
+ * is yet to run after L1's VMRUN). Otherwise, any soft IRQ injected by
+ * L1 should have been delivered to L2 or is being tracked separately by
+ * KVM for re-injection. Similarly, NextRIP would have already been
+ * updated by the CPU and/or KVM.
+ */
+ if (svm->nested.nested_run_pending)
+ nested_vmcb02_prepare_rips(vcpu, svm->vmcb->save.cs.base,
+ kvm_rip_read(vcpu));
+}
+
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
kvm_register_mark_available(vcpu, reg);
@@ -1826,6 +1844,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
if (seg == VCPU_SREG_SS)
/* This is symmetric with svm_get_segment() */
svm->vmcb->save.cpl = (var->dpl & 3);
+ else if (seg == VCPU_SREG_CS)
+ svm_fixup_nested_rips(vcpu);
vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}
@@ -5172,6 +5192,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
+ .post_user_set_regs = svm_fixup_nested_rips,
.flush_tlb_all = svm_flush_tlb_all,
.flush_tlb_current = svm_flush_tlb_current,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index db3f393192d9..35fe1d337273 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12112,6 +12112,8 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
kvm_rip_write(vcpu, regs->rip);
kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
+ kvm_x86_call(post_user_set_regs)(vcpu);
+
vcpu->arch.exception.pending = false;
vcpu->arch.exception_vmexit.pending = false;
--
2.53.0.273.g2a3d683680-goog
On Thu, Feb 12, 2026, Yosry Ahmed wrote:
> In the save/restore path, if KVM_SET_NESTED_STATE is performed before
> restoring REGS and/or SREGS , the values of CS and RIP used to
> initialize the vmcb02's NextRIP and soft interrupt tracking RIPs are
> incorrect.
>
> Recalculate them up after CS is set, or REGS are restored. This is only
> needed when a nested run is pending during restore. After L2 runs for
> the first time, any soft interrupts injected by L1 are already delivered
> or tracked by KVM separately for re-injection, so the CS and RIP values
> are no longer relevant.
>
> If KVM_SET_NESTED_STATE is performed after both REGS and SREGS are
> restored, it will just overwrite the fields.
Apparently I suggested this general idea, but ugh. :-)
> static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
> {
> kvm_register_mark_available(vcpu, reg);
> @@ -1826,6 +1844,8 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
> if (seg == VCPU_SREG_SS)
> /* This is symmetric with svm_get_segment() */
> svm->vmcb->save.cpl = (var->dpl & 3);
> + else if (seg == VCPU_SREG_CS)
> + svm_fixup_nested_rips(vcpu);
>
> vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
> }
> @@ -5172,6 +5192,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
> .get_rflags = svm_get_rflags,
> .set_rflags = svm_set_rflags,
> .get_if_flag = svm_get_if_flag,
> + .post_user_set_regs = svm_fixup_nested_rips,
>
> .flush_tlb_all = svm_flush_tlb_all,
> .flush_tlb_current = svm_flush_tlb_current,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index db3f393192d9..35fe1d337273 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12112,6 +12112,8 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
> kvm_rip_write(vcpu, regs->rip);
> kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
>
> + kvm_x86_call(post_user_set_regs)(vcpu);
I especially don't love this callback. Aside from adding a new kvm_x86_ops hook,
I don't like that _any_ CS change triggers a fixup, whereas only userspace writes
to RIP trigger a fixup. That _should_ be a moot point, because neither CS nor RIP
should change while nested_run_pending is true, but I dislike the asymmetry.
I was going to suggest we instead react to RIP being dirty, but what if we take
it a step further? Somewhat of a crazy idea, but what happens if we simply wait
until just before VMRUN to set soft_int_csbase, soft_int_old_rip, and
soft_int_next_rip (when the guest doesn't have NRIPS)?
E.g. after patch 2, completely untested...
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index aec17c80ed73..6fc1b2e212d2 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -863,12 +863,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
if (is_evtinj_soft(vmcb02->control.event_inj)) {
svm->soft_int_injected = true;
- svm->soft_int_csbase = vmcb12_csbase;
- svm->soft_int_old_rip = vmcb12_rip;
+
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
svm->soft_int_next_rip = svm->nested.ctl.next_rip;
- else
- svm->soft_int_next_rip = vmcb12_rip;
}
/* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 8f8bc863e214..358ec940ffc9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4322,6 +4322,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
return EXIT_FASTPATH_EXIT_USERSPACE;
}
+ if (is_guest_mode(vcpu) && svm->nested.nested_run_pending &&
+ svm->soft_int_injected) {
+ svm->soft_int_csbase = svm->vmcb->save.cs.base;
+ svm->soft_int_old_rip = kvm_rip_read(vcpu);
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
+ svm->soft_int_next_rip = kvm_rip_read(vcpu);
+ }
+
sync_lapic_to_cr8(vcpu);
if (unlikely(svm->asid != svm->vmcb->control.asid)) {
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index db3f393192d9..35fe1d337273 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -12112,6 +12112,8 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
> > kvm_rip_write(vcpu, regs->rip);
> > kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
> >
> > + kvm_x86_call(post_user_set_regs)(vcpu);
>
> I especially don't love this callback. Aside from adding a new kvm_x86_ops hook,
> I don't like that _any_ CS change triggers a fixup, whereas only userspace writes
> to RIP trigger a fixup. That _should_ be a moot point, because neither CS nor RIP
> should change while nested_run_pending is true, but I dislike the asymmetry.
>
> I was going to suggest we instead react to RIP being dirty, but what if we take
> it a step further? Somewhat of a crazy idea, but what happens if we simply wait
> until just before VMRUN to set soft_int_csbase, soft_int_old_rip, and
> soft_int_next_rip (when the guest doesn't have NRIPS)?
I generally like this idea. I thought about it for a moment but was
worried about how much of a behavioral change this introduces, but that
was probably before I convinced myself the problem only exists with
nested_run_pending.
That being said..
>
> E.g. after patch 2, completely untested...
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index aec17c80ed73..6fc1b2e212d2 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -863,12 +863,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
Above the context lines we have:
/*
* next_rip is consumed on VMRUN as the return address pushed on the
* stack for injected soft exceptions/interrupts. If nrips is exposed
* to L1, take it verbatim from vmcb12. If nrips is supported in
* hardware but not exposed to L1, stuff the actual L2 RIP to emulate
* what a nrips=0 CPU would do (L1 is responsible for advancing RIP
* prior to injecting the event).
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
vmcb02->control.next_rip = svm->nested.ctl.next_rip;
else if (boot_cpu_has(X86_FEATURE_NRIPS))
vmcb02->control.next_rip = vmcb12_rip;
The same bug affects vmcb02->control.next_rip when the guest doesn't
have NRIPS. I think we don't want to move part of the vmcb02
initialization before VMRUN too. We can keep the initialization here and
overwrite it before VMRUN if needed, but that's just also ugh..
> svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
> if (is_evtinj_soft(vmcb02->control.event_inj)) {
> svm->soft_int_injected = true;
> - svm->soft_int_csbase = vmcb12_csbase;
> - svm->soft_int_old_rip = vmcb12_rip;
> +
> if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> svm->soft_int_next_rip = svm->nested.ctl.next_rip;
Why not move this too?
> - else
> - svm->soft_int_next_rip = vmcb12_rip;
> }
>
> /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 8f8bc863e214..358ec940ffc9 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4322,6 +4322,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
> return EXIT_FASTPATH_EXIT_USERSPACE;
> }
>
> + if (is_guest_mode(vcpu) && svm->nested.nested_run_pending &&
> + svm->soft_int_injected) {
> + svm->soft_int_csbase = svm->vmcb->save.cs.base;
> + svm->soft_int_old_rip = kvm_rip_read(vcpu);
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> + svm->soft_int_next_rip = kvm_rip_read(vcpu);
> + }
> +
I generally dislike adding more is_guest_mode() stuff in svm_vcpu_run(),
maybe we can refactor them later to pre-run and post-run nested
callbacks? Anyway, not a big deal, definitely an improvement over the
current patch assuming we can figure out how to fix next_rip.
> sync_lapic_to_cr8(vcpu);
>
> if (unlikely(svm->asid != svm->vmcb->control.asid)) {
On Thu, Feb 19, 2026, Yosry Ahmed wrote:
> > E.g. after patch 2, completely untested...
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index aec17c80ed73..6fc1b2e212d2 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -863,12 +863,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
>
> Above the context lines we have:
>
> /*
> * next_rip is consumed on VMRUN as the return address pushed on the
> * stack for injected soft exceptions/interrupts. If nrips is exposed
> * to L1, take it verbatim from vmcb12. If nrips is supported in
> * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
> * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
> * prior to injecting the event).
> */
> if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> vmcb02->control.next_rip = svm->nested.ctl.next_rip;
> else if (boot_cpu_has(X86_FEATURE_NRIPS))
> vmcb02->control.next_rip = vmcb12_rip;
>
> The same bug affects vmcb02->control.next_rip when the guest doesn't
> have NRIPS. I think we don't want to move part of the vmcb02
> initialization before VMRUN too. We can keep the initialization here and
> overwrite it before VMRUN if needed, but that's just also ugh..
Aha! I knew I was missing something, but I couldn't quite get my brain to figure
out what.
I don't have a super strong preference as to copying the code or moving it
wholesale. Though I would say if the pre-VMRUN logic is _identical_ (and I think
it is?), then we move it, and simply update the comment in
nested_vmcb02_prepare_control() to call that out.
> > svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
> > if (is_evtinj_soft(vmcb02->control.event_inj)) {
> > svm->soft_int_injected = true;
> > - svm->soft_int_csbase = vmcb12_csbase;
> > - svm->soft_int_old_rip = vmcb12_rip;
> > +
> > if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> > svm->soft_int_next_rip = svm->nested.ctl.next_rip;
>
> Why not move this too?
For the same reason I think we should keep
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
vmcb02->control.next_rip = svm->nested.ctl.next_rip;
where it is. When NRIPS is exposed to the guest, the incoming nested state is
the one and only source of truth. By keeping the code different, we'd effectively
be documenting that the host.NRIPS+!guest.NRIPS case is the anomaly.
> > - else
> > - svm->soft_int_next_rip = vmcb12_rip;
> > }
> >
> > /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 8f8bc863e214..358ec940ffc9 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -4322,6 +4322,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
> > return EXIT_FASTPATH_EXIT_USERSPACE;
> > }
> >
> > + if (is_guest_mode(vcpu) && svm->nested.nested_run_pending &&
> > + svm->soft_int_injected) {
> > + svm->soft_int_csbase = svm->vmcb->save.cs.base;
> > + svm->soft_int_old_rip = kvm_rip_read(vcpu);
> > + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> > + svm->soft_int_next_rip = kvm_rip_read(vcpu);
> > + }
> > +
>
> I generally dislike adding more is_guest_mode() stuff in svm_vcpu_run(),
> maybe we can refactor them later to pre-run and post-run nested
> callbacks? Anyway, not a big deal, definitely an improvement over the
> current patch assuming we can figure out how to fix next_rip.
I don't love it either, but I want to (a) avoid unnecessarily overwriting the
fields, e.g. if KVM never actually does VMRUN and (b) minimize the probability
of consuming a bad RIP.
In practice, I would expect the nested_run_pending check to be correctly predicted
the overwhelming majority of the time, i.e. I don't anticipate performance issues
due to putting the code in the hot path.
If we want to try and move the update out of svm_vcpu_run(), then we shouldn't
need generic pre/post callbacks for nested, svm_prepare_switch_to_guest() is the
right fit.
> > > svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
> > > if (is_evtinj_soft(vmcb02->control.event_inj)) {
> > > svm->soft_int_injected = true;
> > > - svm->soft_int_csbase = vmcb12_csbase;
> > > - svm->soft_int_old_rip = vmcb12_rip;
> > > +
> > > if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> > > svm->soft_int_next_rip = svm->nested.ctl.next_rip;
> >
> > Why not move this too?
>
> For the same reason I think we should keep
>
> if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> vmcb02->control.next_rip = svm->nested.ctl.next_rip;
>
> where it is. When NRIPS is exposed to the guest, the incoming nested state is
> the one and only source of truth. By keeping the code different, we'd effectively
> be documenting that the host.NRIPS+!guest.NRIPS case is the anomaly.
I see, makes sense. I like the fact that we should be able to completely
drop vmcb12_rip and vmcb12_csbase with this (unless we want to start
using it for the bus_lock_rip check), which will also remove the need
for patch 2.
I guess the only hiccup will be patch 1. We'll end up with this code in
nested_vmcb02_prepare_control():
if (boot_cpu_has(X86_FEATURE_NRIPS)) {
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
!svm->nested.nested_run_pending)
vmcb02->control.next_rip = svm->nested.ctl.next_rip;
}
, and this code pre-VMRUN:
if (boot_cpu_has(X86_FEATURE_NRIPS)) {
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) &&
svm->nested.nested_run_pending)
vmcb02->control.next_rip = kvm_rip_read(vcpu);
}
It seems a bit fragile that the 'if' is somewhere and the 'else' (or the
opposite condition) is somewhere else. They could get out of sync. Maybe
a helper will make this better:
/* Huge comment */
bool nested_svm_use_vmcb12_next_rip(struct kvm_vcpu *vcpu)
{
return guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
!svm->nested.nested_run_pending;
}
or maybe the name makes more sense as the negative:
nested_svm_use_rip_as_next_rip()?
I don't like both names..
Aha! Maybe it's actually better to have the helper set the NextRip
directly?
/* Huge comment */
void nested_vmcb02_update_next_rip(struct kvm_vcpu *vcpu)
{
u64 next_rip;
if (WARN_ON_ONCE(svm->vmcb != svm->nested.vmcb02.ptr))
return;
if (!boot_cpu_has(X86_FEATURE_NRIPS))
return;
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
!svm->nested.nested_run_pending)
next_rip = svm->nested.ctl.next_rip;
else
next_rip = kvm_rip_read(vcpu);
svm->vmcb->control.next_rip = next_rip;
}
Then, we just call this from nested_vmcb02_prepare_control() and
svm_vcpu_run() (with the soft IRQ stuff). In some cases we'll put the
wrong thing in vmcb02 and then fix it up later, but I think that's fine
(that's what the patch is doing anyway).
However, we lose the fact that the whole thing is guarded by
nested_run_pending, so performance in svm_vcpu_run() could suffer. We
could leave it all guarded by nested_run_pending, as
nested_vmcb02_update_next_rip() should do nothing in svm_vcpu_run()
otherwise, but then the caller is depending on implementation details of
the helper.
Maybe just put it in svm_prepare_switch_to_guest() to begin with and not
guard it with nested_run_pending?
>
> > > - else
> > > - svm->soft_int_next_rip = vmcb12_rip;
> > > }
> > >
> > > /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
> > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > > index 8f8bc863e214..358ec940ffc9 100644
> > > --- a/arch/x86/kvm/svm/svm.c
> > > +++ b/arch/x86/kvm/svm/svm.c
> > > @@ -4322,6 +4322,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
> > > return EXIT_FASTPATH_EXIT_USERSPACE;
> > > }
> > >
> > > + if (is_guest_mode(vcpu) && svm->nested.nested_run_pending &&
> > > + svm->soft_int_injected) {
> > > + svm->soft_int_csbase = svm->vmcb->save.cs.base;
> > > + svm->soft_int_old_rip = kvm_rip_read(vcpu);
> > > + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> > > + svm->soft_int_next_rip = kvm_rip_read(vcpu);
> > > + }
> > > +
> >
> > I generally dislike adding more is_guest_mode() stuff in svm_vcpu_run(),
> > maybe we can refactor them later to pre-run and post-run nested
> > callbacks? Anyway, not a big deal, definitely an improvement over the
> > current patch assuming we can figure out how to fix next_rip.
>
> I don't love it either, but I want to (a) avoid unnecessarily overwriting the
> fields, e.g. if KVM never actually does VMRUN and (b) minimize the probability
> of consuming a bad RIP.
>
> In practice, I would expect the nested_run_pending check to be correctly predicted
> the overwhelming majority of the time, i.e. I don't anticipate performance issues
> due to putting the code in the hot path.
>
> If we want to try and move the update out of svm_vcpu_run(), then we shouldn't
> need generic pre/post callbacks for nested, svm_prepare_switch_to_guest() is the
> right fit.
I prefer putting it in svm_prepare_switch_to_guest() to begin with,
because it gives us more flexibility (see above).
On Fri, Feb 20, 2026, Yosry Ahmed wrote:
> > > > svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
> > > > if (is_evtinj_soft(vmcb02->control.event_inj)) {
> > > > svm->soft_int_injected = true;
> > > > - svm->soft_int_csbase = vmcb12_csbase;
> > > > - svm->soft_int_old_rip = vmcb12_rip;
> > > > +
> > > > if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> > > > svm->soft_int_next_rip = svm->nested.ctl.next_rip;
> > >
> > > Why not move this too?
> >
> > For the same reason I think we should keep
> >
> > if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
> > vmcb02->control.next_rip = svm->nested.ctl.next_rip;
> >
> > where it is. When NRIPS is exposed to the guest, the incoming nested state is
> > the one and only source of truth. By keeping the code different, we'd effectively
> > be documenting that the host.NRIPS+!guest.NRIPS case is the anomaly.
>
> I see, makes sense. I like the fact that we should be able to completely
> drop vmcb12_rip and vmcb12_csbase with this (unless we want to start
> using it for the bus_lock_rip check), which will also remove the need
> for patch 2.
...
> It seems a bit fragile that the 'if' is somewhere and the 'else' (or the
> opposite condition) is somewhere else. They could get out of sync.
> Maybe
> a helper will make this better:
>
> /* Huge comment */
> bool nested_svm_use_vmcb12_next_rip(struct kvm_vcpu *vcpu)
> {
> return guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
> !svm->nested.nested_run_pending;
> }
>
> or maybe the name makes more sense as the negative:
> nested_svm_use_rip_as_next_rip()?
>
> I don't like both names..
>
> Aha! Maybe it's actually better to have the helper set the NextRip
> directly?
>
> /* Huge comment */
> u64 nested_vmcb02_update_next_rip(struct kvm_vcpu *vcpu)
> {
> u64 next_rip;
>
> if (WARN_ON_ONCE(svm->vmcb != svm->nested.vmcb02.ptr))
> return;
>
> if (!boot_cpu_has(X86_FEATURE_NRIPS))
> return;
>
> if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
> !svm->nested.nested_run_pending)
> next_rip = svm->nested.ctl.next_rip;
> else
> next_rip = kvm_rip_read(vcpu);
>
> svm->vmcb->control.next_rip = next_rip;
> }
>
> Then, we just call this from nested_vmcb02_prepare_control() and
> svm_vcpu_run() (with the soft IRQ stuff). In some cases we'll put the
> wrong thing in vmcb02 and then fix it up later, but I think that's fine
> (that's what the patch is doing anyway).
>
> However, we lose the fact that the whole thing is guarded by
> nested_run_pending, so performance in svm_vcpu_run() could suffer. We
> could leave it all guarded by nested_run_pending, as
> nested_vmcb02_update_next_rip() should do nothing in svm_vcpu_run()
> otherwise, but then the caller is depending on implementation details of
> the helper.
>
> Maybe just put it in svm_prepare_switch_to_guest() to begin with and not
> guard it with nested_run_pending?
Of all the options, I still like open coding the two, even though it means the
"else" will be separated from the "if", followed by the
nested_svm_use_vmcb12_next_rip() helper option. I straight up dislike
nested_vmcb02_update_next_rip() because it buries simple concepts behind a
complex function (copy vmcb12->next_rip to vmcb02->next_rip is at its core a
*very* simple idea). Oh, and it unnecessarily rewrites vmcb02->next_rip in the
common case where the guest has NRIPs.
I'm a-ok with the separate "if" and "else", because it's not a pure "else", and
that's the entire reason we have a mess in the first place: we wrote an if-else,
but what is actually necessitated by KVM's uAPI is two separate (but tightly
coupled) if-statements.
© 2016 - 2026 Red Hat, Inc.