[PATCH 1/4] KVM: nSVM: Sync next_rip to cached vmcb12 after VMRUN of L2

Yosry Ahmed posted 4 patches 6 hours ago
[PATCH 1/4] KVM: nSVM: Sync next_rip to cached vmcb12 after VMRUN of L2
Posted by Yosry Ahmed 6 hours ago
After VMRUN in guest mode, nested_sync_control_from_vmcb02() syncs
fields written by the CPU from vmcb02 to the cached vmcb12. This is
because the cached vmcb12 is used as the authoritative copy of some of
the controls, and is the payload when saving/restoring nested state.

next_rip is also written by the CPU (in some cases) after VMRUN, but is
not sync'd to cached vmcb12. As a result, it is corrupted after
save/restore (replaced by the original value written by L1 on nested
VMRUN). This could cause problems for both KVM (e.g. when injecting a
soft IRQ) and L1 (e.g. when using next_rip to advance RIP after emulating
an instruction).

Fix this by sync'ing next_rip in nested_sync_control_from_vmcb02(). Move
the call to nested_sync_control_from_vmcb02() (and the entire
is_guest_mode() block) after svm_complete_interrupts(), as it may update
next_rip in vmcb02.

Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE")
CC: stable@vger.kernel.org
Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
---
 arch/x86/kvm/svm/nested.c |  6 ++++--
 arch/x86/kvm/svm/svm.c    | 26 +++++++++++++++-----------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index de90b104a0dd..70086ba6497f 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -519,8 +519,10 @@ void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
 void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
 {
 	u32 mask;
-	svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
-	svm->nested.ctl.event_inj_err  = svm->vmcb->control.event_inj_err;
+
+	svm->nested.ctl.event_inj	= svm->vmcb->control.event_inj;
+	svm->nested.ctl.event_inj_err	= svm->vmcb->control.event_inj_err;
+	svm->nested.ctl.next_rip	= svm->vmcb->control.next_rip;
 
 	/* Only a few fields of int_ctl are written by the processor.  */
 	mask = V_IRQ_MASK | V_TPR_MASK;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5f0136dbdde6..6d8d4d19455e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4399,17 +4399,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
-	if (is_guest_mode(vcpu)) {
-		nested_sync_control_from_vmcb02(svm);
-
-		/* Track VMRUNs that have made past consistency checking */
-		if (svm->nested.nested_run_pending &&
-		    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
-                        ++vcpu->stat.nested_run;
-
-		svm->nested.nested_run_pending = 0;
-	}
-
 	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
 
 	/*
@@ -4435,6 +4424,21 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
 	svm_complete_interrupts(vcpu);
 
+	/*
+	 * svm_complete_interrupts() may update svm->vmcb->control.next_rip,
+	 * which is sync'd by nested_sync_control_from_vmcb02() below.
+	 */
+	if (is_guest_mode(vcpu)) {
+		nested_sync_control_from_vmcb02(svm);
+
+		/* Track VMRUNs that have made past consistency checking */
+		if (svm->nested.nested_run_pending &&
+		    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
+			++vcpu->stat.nested_run;
+
+		svm->nested.nested_run_pending = 0;
+	}
+
 	return svm_exit_handlers_fastpath(vcpu);
 }
 

base-commit: e944fe2c09f405a2e2d147145c9b470084bc4c9a
-- 
2.53.0.rc2.204.g2597b5adb4-goog
Re: [PATCH 1/4] KVM: nSVM: Sync next_rip to cached vmcb12 after VMRUN of L2
Posted by Sean Christopherson 5 hours ago
On Tue, Feb 10, 2026, Yosry Ahmed wrote:
> After VMRUN in guest mode, nested_sync_control_from_vmcb02() syncs
> fields written by the CPU from vmcb02 to the cached vmcb12. This is
> because the cached vmcb12 is used as the authoritative copy of some of
> the controls, and is the payload when saving/restoring nested state.
> 
> next_rip is also written by the CPU (in some cases) after VMRUN, but is
> not sync'd to cached vmcb12. As a result, it is corrupted after
> save/restore (replaced by the original value written by L1 on nested
> VMRUN). This could cause problems for both KVM (e.g. when injecting a
> soft IRQ) or L1 (e.g. when using next_rip to advance RIP after emulating
> an instruction).
> 
> Fix this by sync'ing next_rip in nested_sync_control_from_vmcb02(). Move
> the call to nested_sync_control_from_vmcb02() (and the entire
> is_guest_mode() block) after svm_complete_interrupts(), as it may update
> next_rip in vmcb02.

I'll give you one guess as to what I would say about bundling changes.  AFAICT,
there is _zero_ reason to move the call to nested_sync_control_from_vmcb02() in a
patch tagged for stable@.

> Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE")
> CC: stable@vger.kernel.org
> Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> ---
>  arch/x86/kvm/svm/nested.c |  6 ++++--
>  arch/x86/kvm/svm/svm.c    | 26 +++++++++++++++-----------
>  2 files changed, 19 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index de90b104a0dd..70086ba6497f 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -519,8 +519,10 @@ void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
>  void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
>  {
>  	u32 mask;
> -	svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
> -	svm->nested.ctl.event_inj_err  = svm->vmcb->control.event_inj_err;
> +
> +	svm->nested.ctl.event_inj	= svm->vmcb->control.event_inj;
> +	svm->nested.ctl.event_inj_err	= svm->vmcb->control.event_inj_err;
> +	svm->nested.ctl.next_rip	= svm->vmcb->control.next_rip;

This is all a mess (the existing code).  nested_svm_vmexit() does this:

	vmcb12->control.int_state         = vmcb02->control.int_state;
	vmcb12->control.exit_code         = vmcb02->control.exit_code;
	vmcb12->control.exit_info_1       = vmcb02->control.exit_info_1;
	vmcb12->control.exit_info_2       = vmcb02->control.exit_info_2;

	if (!svm_is_vmrun_failure(vmcb12->control.exit_code))
		nested_save_pending_event_to_vmcb12(svm, vmcb12);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
		vmcb12->control.next_rip  = vmcb02->control.next_rip;

	vmcb12->control.int_ctl           = svm->nested.ctl.int_ctl;
	vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
	vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;

but then svm_get_nested_state(), by way of nested_copy_vmcb_cache_to_control(),
pulls everything from the cached fields.  Which probably only works because the
only fields that are pulled from vmcb02 in nested_svm_vmexit() are never modified
by the CPU.

Actually, I take that back, I have no idea how this code works.  How does e.g.
exit_info_1 not get clobbered on save/restore?

In other words, AFAICT, nested.ctl.int_ctl is special in that KVM needs it to be
up-to-date at all times, *and* it needs to be copied back to vmcb12 (or userspace).

Part of me wants to remove these two fields entirely:

	/* cache for control fields of the guest */
	struct vmcb_ctrl_area_cached ctl;

	/*
	 * Note: this struct is not kept up-to-date while L2 runs; it is only
	 * valid within nested_svm_vmrun.
	 */
	struct vmcb_save_area_cached save;

and instead use "full" caches only for the duration of nested_svm_vmrun().  Or
hell, just copy the entire vmcb12 and throw the cached structures in the garbage.
But that'll probably end in a game of whack-a-mole as things get moved back in.

So rather than do something totally drastic, I think we should kill
nested_copy_vmcb_cache_to_control() and replace it with a "save control" flow.
And then have it share as much code as possible with nested_svm_vmexit(),
and fixup nested_svm_vmexit() to not pull from svm->nested.ctl unnecessarily.
Which, again AFAICT, is pretty much limited to int_ctl: either vmcb02 is
authoritative, or KVM shouldn't be updating vmcb12, and so only the "save control"
for KVM_GET_NESTED_STATE needs to copy from the cache to the migrated vmcb12.

That'll probably end up a bit fat for a stable@ patch, so we could do a gross
one-off fix for this issue, and then do cleanups on top.

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5f0136dbdde6..cd5664c65a00 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4435,6 +4435,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
 
        svm_complete_interrupts(vcpu);
 
+       /*
+        * Update the cache after completing interrupts to get an accurate
+        * NextRIP, e.g. when re-injecting a soft interrupt.
+        *
+        * FIXME: Rework svm_get_nested_state() to not pull data from the
+        *        cache (except for maybe int_ctl).
+        */
+       if (is_guest_mode(vcpu))
+               svm->nested.ctl.next_rip = svm->vmcb->control.next_rip;
+
        return svm_exit_handlers_fastpath(vcpu);
 }
 
>  	/* Only a few fields of int_ctl are written by the processor.  */
>  	mask = V_IRQ_MASK | V_TPR_MASK;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 5f0136dbdde6..6d8d4d19455e 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4399,17 +4399,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
>  	sync_cr8_to_lapic(vcpu);
>  
>  	svm->next_rip = 0;
> -	if (is_guest_mode(vcpu)) {
> -		nested_sync_control_from_vmcb02(svm);
> -
> -		/* Track VMRUNs that have made past consistency checking */
> -		if (svm->nested.nested_run_pending &&
> -		    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
> -                        ++vcpu->stat.nested_run;
> -
> -		svm->nested.nested_run_pending = 0;
> -	}
> -
>  	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
>  
>  	/*
> @@ -4435,6 +4424,21 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
>  
>  	svm_complete_interrupts(vcpu);
>  
> +	/*
> +	 * svm_complete_interrupts() may update svm->vmcb->control.next_rip,
> +	 * which is sync'd by nested_sync_control_from_vmcb02() below.

Please try to avoid referencing functions and fields in comments.  History has
shown that they almost always become stale.

> +	 */
> +	if (is_guest_mode(vcpu)) {
> +		nested_sync_control_from_vmcb02(svm);
> +
> +		/* Track VMRUNs that have made past consistency checking */
> +		if (svm->nested.nested_run_pending &&
> +		    !svm_is_vmrun_failure(svm->vmcb->control.exit_code))
> +			++vcpu->stat.nested_run;
> +
> +		svm->nested.nested_run_pending = 0;
> +	}
> +
>  	return svm_exit_handlers_fastpath(vcpu);
>  }
>  
> 
> base-commit: e944fe2c09f405a2e2d147145c9b470084bc4c9a
> -- 
> 2.53.0.rc2.204.g2597b5adb4-goog
>