Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
the VMRUN path, instead of making the call in nested_svm_vmrun(). This
simplifies the flow of nested_svm_vmrun() and removes all jumps to
cleanup labels.

Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
---
 arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index a48668c36a191..89830380cebc5 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
 
 	nested_svm_hv_update_vm_vp_ids(vcpu);
 
+	if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
+		return -1;
+
 	return 0;
 }
 
@@ -1105,23 +1108,18 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 
 	svm->nested.nested_run_pending = 1;
 
-	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true))
-		goto out_exit_err;
-
-	if (nested_svm_merge_msrpm(vcpu))
-		return ret;
-
-out_exit_err:
-	svm->nested.nested_run_pending = 0;
-	svm->nmi_l1_to_l2 = false;
-	svm->soft_int_injected = false;
+	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
+		svm->nested.nested_run_pending = 0;
+		svm->nmi_l1_to_l2 = false;
+		svm->soft_int_injected = false;
 
-	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
-	svm->vmcb->control.exit_code_hi = 0;
-	svm->vmcb->control.exit_info_1  = 0;
-	svm->vmcb->control.exit_info_2  = 0;
+		svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
+		svm->vmcb->control.exit_code_hi = 0;
+		svm->vmcb->control.exit_info_1  = 0;
+		svm->vmcb->control.exit_info_2  = 0;
 
-	nested_svm_vmexit(svm);
+		nested_svm_vmexit(svm);
+	}
 
 	return ret;
 }

--
2.51.2.1041.gc1ab5b90ca-goog
On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> simplifies the flow of nested_svm_vmrun() and removes all jumps to
> cleanup labels.
>
> Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> ---
> arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> 1 file changed, 13 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index a48668c36a191..89830380cebc5 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
>
> nested_svm_hv_update_vm_vp_ids(vcpu);
>
> + if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
This is silly, just do:

	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
	    nested_svm_merge_msrpm(vcpu)) {
		svm->nested.nested_run_pending = 0;
		svm->nmi_l1_to_l2 = false;
		svm->soft_int_injected = false;

		svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
		svm->vmcb->control.exit_code_hi = -1u;
		svm->vmcb->control.exit_info_1  = 0;
		svm->vmcb->control.exit_info_2  = 0;

		nested_svm_vmexit(svm);
	}
> + return -1;
Please stop returning -1, use a proper -errno.
> +
> return 0;
> }
>
> @@ -1105,23 +1108,18 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
>
> svm->nested.nested_run_pending = 1;
>
> - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true))
> - goto out_exit_err;
> -
> - if (nested_svm_merge_msrpm(vcpu))
> - return ret;
> -
> -out_exit_err:
> - svm->nested.nested_run_pending = 0;
> - svm->nmi_l1_to_l2 = false;
> - svm->soft_int_injected = false;
> + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
> + svm->nested.nested_run_pending = 0;
> + svm->nmi_l1_to_l2 = false;
> + svm->soft_int_injected = false;
>
> - svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> - svm->vmcb->control.exit_code_hi = 0;
> - svm->vmcb->control.exit_info_1 = 0;
> - svm->vmcb->control.exit_info_2 = 0;
> + svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> + svm->vmcb->control.exit_code_hi = 0;
> + svm->vmcb->control.exit_info_1 = 0;
> + svm->vmcb->control.exit_info_2 = 0;
>
> - nested_svm_vmexit(svm);
> + nested_svm_vmexit(svm);
Note, there's a pre-existing bug in nested_svm_vmexit(). Lovely, and it's a
user-triggerable WARN_ON() (and not even a WARN_ON_ONCE() at that).

If nested_svm_vmexit() fails to map vmcb12, it (unbelievably stupidly) injects a
#GP and hopes for the best. Oh FFS, it also has the asinine -EINVAL "logic".
Anyways, it injects #GP (maybe), and bails early, which leaves
KVM_REQ_GET_NESTED_STATE_PAGES set. KVM will then process that on the next
vcpu_enter_guest() and trip the WARN_ON() in svm_get_nested_state_pages().

Something like this to clean up the mess:

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index d4c872843a9d..96f8009a0d45 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1018,9 +1018,6 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
 
 	nested_svm_hv_update_vm_vp_ids(vcpu);
 
-	if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
-		return -1;
-
 	return 0;
 }
 
@@ -1094,7 +1091,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
 
 	svm->nested.nested_run_pending = 1;
 
-	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
+	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
+	    nested_svm_merge_msrpm(vcpu)) {
 		svm->nested.nested_run_pending = 0;
 		svm->nmi_l1_to_l2 = false;
 		svm->soft_int_injected = false;
@@ -1158,24 +1156,16 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 int nested_svm_vmexit(struct vcpu_svm *svm)
 {
 	struct kvm_vcpu *vcpu = &svm->vcpu;
+	gpa_t vmcb12_gpa = svm->nested.vmcb12_gpa;
 	struct vmcb *vmcb01 = svm->vmcb01.ptr;
 	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
 	struct vmcb *vmcb12;
 	struct kvm_host_map map;
-	int rc;
-
-	rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
-	if (rc) {
-		if (rc == -EINVAL)
-			kvm_inject_gp(vcpu, 0);
-		return 1;
-	}
 
 	vmcb12 = map.hva;
 
 	/* Exit Guest-Mode */
 	leave_guest_mode(vcpu);
-	svm->nested.vmcb12_gpa = 0;
 	WARN_ON_ONCE(svm->nested.nested_run_pending);
 
 	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
@@ -1183,6 +1173,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	/* in case we halted in L2 */
 	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 
+	svm->nested.vmcb12_gpa = 0;
+
+	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) {
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		return 1;
+	}
+
 	/* Give the current vmcb to the guest */
 
 	vmcb12->save.es = vmcb02->save.es;
@@ -1973,7 +1970,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 
 static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
-	if (WARN_ON(!is_guest_mode(vcpu)))
+	if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
 		return true;
 
 	if (!vcpu->arch.pdptrs_from_userspace &&
On Tue, Dec 09, 2025 at 08:11:41AM -0800, Sean Christopherson wrote:
> On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> > Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> > the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> > simplifies the flow of nested_svm_vmrun() and removes all jumps to
> > cleanup labels.
> >
> > Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > ---
> > arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> > 1 file changed, 13 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index a48668c36a191..89830380cebc5 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> >
> > nested_svm_hv_update_vm_vp_ids(vcpu);
> >
> > + if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
>
> This is silly, just do:
>
> if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
> nested_svm_merge_msrpm(vcpu)) {
> svm->nested.nested_run_pending = 0;
> svm->nmi_l1_to_l2 = false;
> svm->soft_int_injected = false;
>
> svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> svm->vmcb->control.exit_code_hi = -1u;
> svm->vmcb->control.exit_info_1 = 0;
> svm->vmcb->control.exit_info_2 = 0;
>
> nested_svm_vmexit(svm);
> }
>
> > + return -1;
>
> Please stop returning -1, use a proper -errno.
>
> > +
> > return 0;
> > }
> >
> > @@ -1105,23 +1108,18 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
> >
> > svm->nested.nested_run_pending = 1;
> >
> > - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true))
> > - goto out_exit_err;
> > -
> > - if (nested_svm_merge_msrpm(vcpu))
> > - return ret;
> > -
> > -out_exit_err:
> > - svm->nested.nested_run_pending = 0;
> > - svm->nmi_l1_to_l2 = false;
> > - svm->soft_int_injected = false;
> > + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
> > + svm->nested.nested_run_pending = 0;
> > + svm->nmi_l1_to_l2 = false;
> > + svm->soft_int_injected = false;
> >
> > - svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > - svm->vmcb->control.exit_code_hi = 0;
> > - svm->vmcb->control.exit_info_1 = 0;
> > - svm->vmcb->control.exit_info_2 = 0;
> > + svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > + svm->vmcb->control.exit_code_hi = 0;
> > + svm->vmcb->control.exit_info_1 = 0;
> > + svm->vmcb->control.exit_info_2 = 0;
> >
> > - nested_svm_vmexit(svm);
> > + nested_svm_vmexit(svm);
>
> Note, there's a pre-existing bug in nested_svm_vmexit(). Lovely, and it's a
> user-triggerable WARN_ON() (and not even a WARN_ON_ONCE() at that).
>
> If nested_svm_vmexit() fails to map vmcb12, it (unbelievably stupidly) injects a
> #GP and hopes for the best. Oh FFS, it also has the asinine -EINVAL "logic".
> Anyways, it injects #GP (maybe), and bails early, which leaves
> KVM_REQ_GET_NESTED_STATE_PAGES set. KVM will then process that on the next
> vcpu_enter_guest() and trip the WARN_ON() in svm_get_nested_state_pages().
FWIW, I don't think there will be a warning: when nested_svm_vmexit()
fails to map vmcb12, it bails out before reaching leave_guest_mode(), so
the vCPU is still in guest mode and the WARN_ON() in
svm_get_nested_state_pages() should not fire.

I still agree this is a bug and will include a fix/cleanup in the next
version, which I will send out shortly.
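
For reference, the pre-existing flow in question looks roughly like this (a
condensed sketch of nested_svm_vmexit() reconstructed from the lines Sean's
diff removes, not a verbatim excerpt):

	int nested_svm_vmexit(struct vcpu_svm *svm)
	{
		struct kvm_vcpu *vcpu = &svm->vcpu;
		struct kvm_host_map map;
		int rc;

		/* The vmcb12 map is attempted before any teardown... */
		rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
		if (rc) {
			if (rc == -EINVAL)
				kvm_inject_gp(vcpu, 0);
			/*
			 * ...so bailing here skips leave_guest_mode() and
			 * leaves KVM_REQ_GET_NESTED_STATE_PAGES pending.
			 * Since is_guest_mode(vcpu) is still true, the
			 * WARN_ON() in svm_get_nested_state_pages() should
			 * not fire when the request is processed on the
			 * next vcpu_enter_guest().
			 */
			return 1;
		}

		/* Exit Guest-Mode */
		leave_guest_mode(vcpu);
		kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

		/* ...the rest of the VM-Exit emulation... */
		return 0;
	}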
On Tue, Dec 09, 2025 at 08:11:41AM -0800, Sean Christopherson wrote:
> On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> > Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> > the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> > simplifies the flow of nested_svm_vmrun() and removes all jumps to
> > cleanup labels.
> >
> > Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > ---
> > arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> > 1 file changed, 13 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index a48668c36a191..89830380cebc5 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> >
> > nested_svm_hv_update_vm_vp_ids(vcpu);
> >
> > + if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
>
> This is silly, just do:
>
> if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
> nested_svm_merge_msrpm(vcpu)) {
> svm->nested.nested_run_pending = 0;
> svm->nmi_l1_to_l2 = false;
> svm->soft_int_injected = false;
>
> svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> svm->vmcb->control.exit_code_hi = -1u;
> svm->vmcb->control.exit_info_1 = 0;
> svm->vmcb->control.exit_info_2 = 0;
>
> nested_svm_vmexit(svm);
> }
Actually, if we go with the approach of making all VMRUN failures
happen before preparing the VMCB02 (as discussed in the other thread),
then we will want to call nested_svm_merge_msrpm() from within
enter_svm_guest_mode().

Otherwise, we either have a separate failure path for
nested_svm_merge_msrpm(), or we make all VMRUN failures happen after
preparing the VMCB02, to be handled by nested_svm_vmexit().

I like having a separate exit path for VMRUN failures, and it makes more
sense to do the consistency checks on VMCB12 before preparing VMCB02.
But I understand if you prefer to keep things simple and move all
failures after VMCB02.

I already have it implemented with the separate VMRUN failure path, but
I don't wanna spam you with another series if you prefer it the other
way.
>
> > + return -1;
>
> Please stop returning -1, use a proper -errno.
>
> > +
> > return 0;
> > }
> >
> > @@ -1105,23 +1108,18 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
> >
> > svm->nested.nested_run_pending = 1;
> >
> > - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true))
> > - goto out_exit_err;
> > -
> > - if (nested_svm_merge_msrpm(vcpu))
> > - return ret;
> > -
> > -out_exit_err:
> > - svm->nested.nested_run_pending = 0;
> > - svm->nmi_l1_to_l2 = false;
> > - svm->soft_int_injected = false;
> > + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
> > + svm->nested.nested_run_pending = 0;
> > + svm->nmi_l1_to_l2 = false;
> > + svm->soft_int_injected = false;
> >
> > - svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > - svm->vmcb->control.exit_code_hi = 0;
> > - svm->vmcb->control.exit_info_1 = 0;
> > - svm->vmcb->control.exit_info_2 = 0;
> > + svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > + svm->vmcb->control.exit_code_hi = 0;
> > + svm->vmcb->control.exit_info_1 = 0;
> > + svm->vmcb->control.exit_info_2 = 0;
> >
> > - nested_svm_vmexit(svm);
> > + nested_svm_vmexit(svm);
>
> Note, there's a pre-existing bug in nested_svm_vmexit(). Lovely, and it's a
> user-triggerable WARN_ON() (and not even a WARN_ON_ONCE() at that).
>
> If nested_svm_vmexit() fails to map vmcb12, it (unbelievably stupidly) injects a
> #GP and hopes for the best. Oh FFS, it also has the asinine -EINVAL "logic".
> Anyways, it injects #GP (maybe), and bails early, which leaves
> KVM_REQ_GET_NESTED_STATE_PAGES set. KVM will then process that on the next
> vcpu_enter_guest() and trip the WARN_ON() in svm_get_nested_state_pages().
>
> Something like this to clean up the mess:
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index d4c872843a9d..96f8009a0d45 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -1018,9 +1018,6 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
>
> nested_svm_hv_update_vm_vp_ids(vcpu);
>
> - if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
> - return -1;
> -
> return 0;
> }
>
> @@ -1094,7 +1091,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
>
> svm->nested.nested_run_pending = 1;
>
> - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
> + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
> + nested_svm_merge_msrpm(vcpu)) {
> svm->nested.nested_run_pending = 0;
> svm->nmi_l1_to_l2 = false;
> svm->soft_int_injected = false;
> @@ -1158,24 +1156,16 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
> int nested_svm_vmexit(struct vcpu_svm *svm)
> {
> struct kvm_vcpu *vcpu = &svm->vcpu;
> + gpa_t vmcb12_gpa = svm->nested.vmcb12_gpa;
> struct vmcb *vmcb01 = svm->vmcb01.ptr;
> struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
> struct vmcb *vmcb12;
> struct kvm_host_map map;
> - int rc;
> -
> - rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
> - if (rc) {
> - if (rc == -EINVAL)
> - kvm_inject_gp(vcpu, 0);
> - return 1;
> - }
>
> vmcb12 = map.hva;
>
> /* Exit Guest-Mode */
> leave_guest_mode(vcpu);
> - svm->nested.vmcb12_gpa = 0;
> WARN_ON_ONCE(svm->nested.nested_run_pending);
>
> kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> @@ -1183,6 +1173,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> /* in case we halted in L2 */
> kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
>
> + svm->nested.vmcb12_gpa = 0;
> +
> + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) {
> + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> + return 1;
> + }
> +
> /* Give the current vmcb to the guest */
>
> vmcb12->save.es = vmcb02->save.es;
> @@ -1973,7 +1970,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
>
> static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
> {
> - if (WARN_ON(!is_guest_mode(vcpu)))
> + if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
> return true;
>
> if (!vcpu->arch.pdptrs_from_userspace &&
>
On Thu, Dec 11, 2025 at 07:25:21PM +0000, Yosry Ahmed wrote:
> On Tue, Dec 09, 2025 at 08:11:41AM -0800, Sean Christopherson wrote:
> > On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> > > Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> > > the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> > > simplifies the flow of nested_svm_vmrun() and removes all jumps to
> > > cleanup labels.
> > >
> > > Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > > ---
> > > arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> > > 1 file changed, 13 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > > index a48668c36a191..89830380cebc5 100644
> > > --- a/arch/x86/kvm/svm/nested.c
> > > +++ b/arch/x86/kvm/svm/nested.c
> > > @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> > >
> > > nested_svm_hv_update_vm_vp_ids(vcpu);
> > >
> > > + if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
> >
> > This is silly, just do:
> >
> > if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
> > nested_svm_merge_msrpm(vcpu)) {
> > svm->nested.nested_run_pending = 0;
> > svm->nmi_l1_to_l2 = false;
> > svm->soft_int_injected = false;
> >
> > svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > svm->vmcb->control.exit_code_hi = -1u;
> > svm->vmcb->control.exit_info_1 = 0;
> > svm->vmcb->control.exit_info_2 = 0;
> >
> > nested_svm_vmexit(svm);
> > }
>
> Actually, if we go with the approach of making all VMRUN failures
> happen before preparing the VMCB02 (as discussed in the other thread),
> then we will want to call nested_svm_merge_msrpm() from within
> enter_svm_guest_mode().
We can also just call nested_svm_merge_msrpm() before
enter_svm_guest_mode(), which seems to work. Part of me still prefers to
keep all the potential failures bundled together in
enter_svm_guest_mode() though.
>
> Otherwise, we either have a separate failure path for
> nested_svm_merge_msrpm(), or we make all VMRUN failures happen after
> preparing the VMCB02, to be handled by nested_svm_vmexit().
>
> I like having a separate exit path for VMRUN failures, and it makes more
> sense to do the consistency checks on VMCB12 before preparing VMCB02.
> But I understand if you prefer to keep things simple and move all
> failures after VMCB02.
>
> I already have it implemented with the separate VMRUN failure path, but
> I don't wanna spam you with another series if you prefer it the other
> way.
>
> >
> > > + return -1;
> >
> > Please stop returning -1, use a proper -errno.
> >
> > > +
> > > return 0;
> > > }
> > >
> > > @@ -1105,23 +1108,18 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
> > >
> > > svm->nested.nested_run_pending = 1;
> > >
> > > - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true))
> > > - goto out_exit_err;
> > > -
> > > - if (nested_svm_merge_msrpm(vcpu))
> > > - return ret;
> > > -
> > > -out_exit_err:
> > > - svm->nested.nested_run_pending = 0;
> > > - svm->nmi_l1_to_l2 = false;
> > > - svm->soft_int_injected = false;
> > > + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
> > > + svm->nested.nested_run_pending = 0;
> > > + svm->nmi_l1_to_l2 = false;
> > > + svm->soft_int_injected = false;
> > >
> > > - svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > > - svm->vmcb->control.exit_code_hi = 0;
> > > - svm->vmcb->control.exit_info_1 = 0;
> > > - svm->vmcb->control.exit_info_2 = 0;
> > > + svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > > + svm->vmcb->control.exit_code_hi = 0;
> > > + svm->vmcb->control.exit_info_1 = 0;
> > > + svm->vmcb->control.exit_info_2 = 0;
> > >
> > > - nested_svm_vmexit(svm);
> > > + nested_svm_vmexit(svm);
> >
> > Note, there's a pre-existing bug in nested_svm_vmexit(). Lovely, and it's a
> > user-triggerable WARN_ON() (and not even a WARN_ON_ONCE() at that).
> >
> > If nested_svm_vmexit() fails to map vmcb12, it (unbelievably stupidly) injects a
> > #GP and hopes for the best. Oh FFS, it also has the asinine -EINVAL "logic".
> > Anyways, it injects #GP (maybe), and bails early, which leaves
> > KVM_REQ_GET_NESTED_STATE_PAGES set. KVM will then process that on the next
> > vcpu_enter_guest() and trip the WARN_ON() in svm_get_nested_state_pages().
> >
> > Something like this to clean up the mess:
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index d4c872843a9d..96f8009a0d45 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -1018,9 +1018,6 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> >
> > nested_svm_hv_update_vm_vp_ids(vcpu);
> >
> > - if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
> > - return -1;
> > -
> > return 0;
> > }
> >
> > @@ -1094,7 +1091,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
> >
> > svm->nested.nested_run_pending = 1;
> >
> > - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
> > + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
> > + nested_svm_merge_msrpm(vcpu)) {
> > svm->nested.nested_run_pending = 0;
> > svm->nmi_l1_to_l2 = false;
> > svm->soft_int_injected = false;
> > @@ -1158,24 +1156,16 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
> > int nested_svm_vmexit(struct vcpu_svm *svm)
> > {
> > struct kvm_vcpu *vcpu = &svm->vcpu;
> > + gpa_t vmcb12_gpa = svm->nested.vmcb12_gpa;
> > struct vmcb *vmcb01 = svm->vmcb01.ptr;
> > struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
> > struct vmcb *vmcb12;
> > struct kvm_host_map map;
> > - int rc;
> > -
> > - rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
> > - if (rc) {
> > - if (rc == -EINVAL)
> > - kvm_inject_gp(vcpu, 0);
> > - return 1;
> > - }
> >
> > vmcb12 = map.hva;
> >
> > /* Exit Guest-Mode */
> > leave_guest_mode(vcpu);
> > - svm->nested.vmcb12_gpa = 0;
> > WARN_ON_ONCE(svm->nested.nested_run_pending);
> >
> > kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> > @@ -1183,6 +1173,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> > /* in case we halted in L2 */
> > kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
> >
> > + svm->nested.vmcb12_gpa = 0;
> > +
> > + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) {
> > + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> > + return 1;
> > + }
> > +
> > /* Give the current vmcb to the guest */
> >
> > vmcb12->save.es = vmcb02->save.es;
> > @@ -1973,7 +1970,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
> >
> > static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
> > {
> > - if (WARN_ON(!is_guest_mode(vcpu)))
> > + if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
> > return true;
> >
> > if (!vcpu->arch.pdptrs_from_userspace &&
> >
On Thu, Dec 11, 2025, Yosry Ahmed wrote:
> On Thu, Dec 11, 2025 at 07:25:21PM +0000, Yosry Ahmed wrote:
> > On Tue, Dec 09, 2025 at 08:11:41AM -0800, Sean Christopherson wrote:
> > > On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> > > > Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> > > > the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> > > > simplifies the flow of nested_svm_vmrun() and removes all jumps to
> > > > cleanup labels.
> > > >
> > > > Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > > > ---
> > > > arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> > > > 1 file changed, 13 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > > > index a48668c36a191..89830380cebc5 100644
> > > > --- a/arch/x86/kvm/svm/nested.c
> > > > +++ b/arch/x86/kvm/svm/nested.c
> > > > @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> > > >
> > > > nested_svm_hv_update_vm_vp_ids(vcpu);
> > > >
> > > > + if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
> > >
> > > This is silly, just do:
> > >
> > > if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
> > > nested_svm_merge_msrpm(vcpu)) {
> > > svm->nested.nested_run_pending = 0;
> > > svm->nmi_l1_to_l2 = false;
> > > svm->soft_int_injected = false;
> > >
> > > svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > > svm->vmcb->control.exit_code_hi = -1u;
> > > svm->vmcb->control.exit_info_1 = 0;
> > > svm->vmcb->control.exit_info_2 = 0;
> > >
> > > nested_svm_vmexit(svm);
> > > }
> >
> > Actually, if we go with the approach of making all VMRUN failures
> > happen before preparing the VMCB02 (as discussed in the other thread),
> > then we will want to call nested_svm_merge_msrpm() from within
> > enter_svm_guest_mode().
>
> We can also just call nested_svm_merge_msrpm() before
> enter_svm_guest_mode(), which seems to work.
That's likely unsafe, nested_vmcb_check_controls() checks fields that are consumed
by nested_svm_merge_msrpm().
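
The ordering hazard, condensed for illustration (the names follow the
upstream code, but this is a sketch rather than an exact excerpt):

	/* Runs as part of VMRUN processing, via enter_svm_guest_mode(): */
	static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
						 struct vmcb_ctrl_area_cached *control)
	{
		/* Rejects a malformed or out-of-range MSRPM GPA. */
		if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
						   MSRPM_SIZE)))
			return false;
		/* ...other control-field checks... */
		return true;
	}

	/*
	 * nested_svm_merge_msrpm() then reads L1's bitmap through that same
	 * GPA, so doing the merge before the checks would consume an
	 * unvalidated address.
	 */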
> > Otherwise, we either have a separate failure path for
> > nested_svm_merge_msrpm(), or we make all VMRUN failures happen after
> > preparing the VMCB02, to be handled by nested_svm_vmexit().
> >
> > I like having a separate exit path for VMRUN failures, and it makes more
> > sense to do the consistency checks on VMCB12 before preparing VMCB02.
> > But I understand if you prefer to keep things simple and move all
> > failures after VMCB02.
> >
> > I already have it implemented with the separate VMRUN failure path, but
> > I don't wanna spam you with another series if you prefer it the other
> > way.
Spam away.
On Tue, Dec 09, 2025 at 08:11:41AM -0800, Sean Christopherson wrote:
> On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> > Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> > the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> > simplifies the flow of nested_svm_vmrun() and removes all jumps to
> > cleanup labels.
> >
> > Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > ---
> > arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> > 1 file changed, 13 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > index a48668c36a191..89830380cebc5 100644
> > --- a/arch/x86/kvm/svm/nested.c
> > +++ b/arch/x86/kvm/svm/nested.c
> > @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> >
> > nested_svm_hv_update_vm_vp_ids(vcpu);
> >
> > + if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
>
> This is silly, just do:
Ack. Any objections to just dropping from_vmrun and moving
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES) to svm_leave_smm()? I
like the consistency of completely relying on from_vmrun or not at all
:P
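
Roughly what that would look like (a hedged sketch, not a real patch; the
RSM/SMRAM restoration is elided and the exact shape may differ):

	/*
	 * With from_vmrun gone, enter_svm_guest_mode() loses the flag and
	 * the special case, and the one non-VMRUN caller requests the
	 * deferred page loads itself.
	 */
	static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
	{
		u64 vmcb12_gpa = smram->smram64.svm_guest_vmcb_gpa;
		int ret;

		/* ...existing RSM/SMRAM state restoration elided... */

		ret = enter_svm_guest_mode(vcpu, vmcb12_gpa);
		if (ret)
			return ret;

		/* Moved here from enter_svm_guest_mode(): */
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
		return 0;
	}

(As the follow-ups below note, this turns out to be more entangled than it
looks, because from_vmrun is also plumbed into nested_svm_load_cr3().)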
>
> if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
> nested_svm_merge_msrpm(vcpu)) {
> svm->nested.nested_run_pending = 0;
> svm->nmi_l1_to_l2 = false;
> svm->soft_int_injected = false;
>
> svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> svm->vmcb->control.exit_code_hi = -1u;
> svm->vmcb->control.exit_info_1 = 0;
> svm->vmcb->control.exit_info_2 = 0;
>
> nested_svm_vmexit(svm);
> }
>
> > + return -1;
>
> Please stop returning -1, use a proper -errno.
Ack.
>
> > +
> > return 0;
> > }
> >
> > @@ -1105,23 +1108,18 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
> >
> > svm->nested.nested_run_pending = 1;
> >
> > - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true))
> > - goto out_exit_err;
> > -
> > - if (nested_svm_merge_msrpm(vcpu))
> > - return ret;
> > -
> > -out_exit_err:
> > - svm->nested.nested_run_pending = 0;
> > - svm->nmi_l1_to_l2 = false;
> > - svm->soft_int_injected = false;
> > + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
> > + svm->nested.nested_run_pending = 0;
> > + svm->nmi_l1_to_l2 = false;
> > + svm->soft_int_injected = false;
> >
> > - svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > - svm->vmcb->control.exit_code_hi = 0;
> > - svm->vmcb->control.exit_info_1 = 0;
> > - svm->vmcb->control.exit_info_2 = 0;
> > + svm->vmcb->control.exit_code = SVM_EXIT_ERR;
> > + svm->vmcb->control.exit_code_hi = 0;
> > + svm->vmcb->control.exit_info_1 = 0;
> > + svm->vmcb->control.exit_info_2 = 0;
> >
> > - nested_svm_vmexit(svm);
> > + nested_svm_vmexit(svm);
>
> Note, there's a pre-existing bug in nested_svm_vmexit(). Lovely, and it's a
> user-triggerable WARN_ON() (and not even a WARN_ON_ONCE() at that).
>
> If nested_svm_vmexit() fails to map vmcb12, it (unbelievably stupidly) injects a
> #GP and hopes for the best. Oh FFS, it also has the asinine -EINVAL "logic".
> Anyways, it injects #GP (maybe), and bails early, which leaves
> KVM_REQ_GET_NESTED_STATE_PAGES set. KVM will then process that on the next
> vcpu_enter_guest() and trip the WARN_ON() in svm_get_nested_state_pages().
>
> Something like this to clean up the mess:
Will add a patch, thanks for catching this!
>
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index d4c872843a9d..96f8009a0d45 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -1018,9 +1018,6 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
>
> nested_svm_hv_update_vm_vp_ids(vcpu);
>
> - if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
> - return -1;
> -
> return 0;
> }
>
> @@ -1094,7 +1091,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
>
> svm->nested.nested_run_pending = 1;
>
> - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true)) {
> + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
> + nested_svm_merge_msrpm(vcpu)) {
> svm->nested.nested_run_pending = 0;
> svm->nmi_l1_to_l2 = false;
> svm->soft_int_injected = false;
> @@ -1158,24 +1156,16 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
> int nested_svm_vmexit(struct vcpu_svm *svm)
> {
> struct kvm_vcpu *vcpu = &svm->vcpu;
> + gpa_t vmcb12_gpa = svm->nested.vmcb12_gpa;
> struct vmcb *vmcb01 = svm->vmcb01.ptr;
> struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
> struct vmcb *vmcb12;
> struct kvm_host_map map;
> - int rc;
> -
> - rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
> - if (rc) {
> - if (rc == -EINVAL)
> - kvm_inject_gp(vcpu, 0);
> - return 1;
> - }
>
> vmcb12 = map.hva;
>
> /* Exit Guest-Mode */
> leave_guest_mode(vcpu);
> - svm->nested.vmcb12_gpa = 0;
> WARN_ON_ONCE(svm->nested.nested_run_pending);
>
> kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> @@ -1183,6 +1173,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
> /* in case we halted in L2 */
> kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
>
> + svm->nested.vmcb12_gpa = 0;
> +
> + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) {
> + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> + return 1;
> + }
> +
> /* Give the current vmcb to the guest */
>
> vmcb12->save.es = vmcb02->save.es;
> @@ -1973,7 +1970,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
>
> static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
> {
> - if (WARN_ON(!is_guest_mode(vcpu)))
> + if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
> return true;
>
> if (!vcpu->arch.pdptrs_from_userspace &&
>
On Tue, Dec 09, 2025, Yosry Ahmed wrote:
> On Tue, Dec 09, 2025 at 08:11:41AM -0800, Sean Christopherson wrote:
> > On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> > > Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> > > the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> > > simplifies the flow of nested_svm_vmrun() and removes all jumps to
> > > cleanup labels.
> > >
> > > Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > > ---
> > >  arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> > >  1 file changed, 13 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > > index a48668c36a191..89830380cebc5 100644
> > > --- a/arch/x86/kvm/svm/nested.c
> > > +++ b/arch/x86/kvm/svm/nested.c
> > > @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> > >
> > >  	nested_svm_hv_update_vm_vp_ids(vcpu);
> > >
> > > +	if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
> >
> > This is silly, just do:
>
> Ack. Any objections to just dropping from_vmrun and moving
> kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES) to svm_leave_smm()? I
> like the consistency of completely relying on from_vmrun or not at all

Zero objections. When I was initially going through this, I actually thought you
were _adding_ the flag and was going to yell at you :-)

On Tue, Dec 09, 2025 at 11:09:26AM -0800, Sean Christopherson wrote:
> On Tue, Dec 09, 2025, Yosry Ahmed wrote:
> > On Tue, Dec 09, 2025 at 08:11:41AM -0800, Sean Christopherson wrote:
> > > On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> > > > Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> > > > the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> > > > simplifies the flow of nested_svm_vmrun() and removes all jumps to
> > > > cleanup labels.
> > > >
> > > > Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > > > ---
> > > >  arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> > > >  1 file changed, 13 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > > > index a48668c36a191..89830380cebc5 100644
> > > > --- a/arch/x86/kvm/svm/nested.c
> > > > +++ b/arch/x86/kvm/svm/nested.c
> > > > @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> > > >
> > > >  	nested_svm_hv_update_vm_vp_ids(vcpu);
> > > >
> > > > +	if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
> > >
> > > This is silly, just do:
> >
> > Ack. Any objections to just dropping from_vmrun and moving
> > kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES) to svm_leave_smm()? I
> > like the consistency of completely relying on from_vmrun or not at all
>
> Zero objections. When I was initially going through this, I actually thought you
> were _adding_ the flag and was going to yell at you :-)

Ugh from_vmrun is also plumbed into nested_svm_load_cr3() as
reload_pdptrs. Apparently we shouldn't do that in the call path from
svm_leave_smm()? Anyway, seems like it'll be non-trivial to detangle (at
least for me, I have 0 understanding of SMM), so I will leave it as-is.

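For context, the plumbing being referred to, condensed (a sketch based on the
upstream signatures, not an exact excerpt):

	/*
	 * from_vmrun is forwarded to nested_svm_load_cr3() as reload_pdptrs:
	 * a real VMRUN must re-read the PDPTEs from guest memory, while the
	 * RSM and nested-state-restore paths must not unconditionally do so.
	 */
	static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
				       bool nested_npt, bool reload_pdptrs);

	/* Called from enter_svm_guest_mode(): */
	ret = nested_svm_load_cr3(vcpu, svm->nested.save.cr3,
				  nested_npt_enabled(svm),
				  from_vmrun /* reload_pdptrs */);
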
On Wed, Dec 10, 2025, Yosry Ahmed wrote:
> On Tue, Dec 09, 2025 at 11:09:26AM -0800, Sean Christopherson wrote:
> > On Tue, Dec 09, 2025, Yosry Ahmed wrote:
> > > On Tue, Dec 09, 2025 at 08:11:41AM -0800, Sean Christopherson wrote:
> > > > On Mon, Nov 10, 2025, Yosry Ahmed wrote:
> > > > > Call nested_svm_merge_msrpm() from enter_svm_guest_mode() if called from
> > > > > the VMRUN path, instead of making the call in nested_svm_vmrun(). This
> > > > > simplifies the flow of nested_svm_vmrun() and removes all jumps to
> > > > > cleanup labels.
> > > > >
> > > > > Signed-off-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > > > > ---
> > > > >  arch/x86/kvm/svm/nested.c | 28 +++++++++++++---------------
> > > > >  1 file changed, 13 insertions(+), 15 deletions(-)
> > > > >
> > > > > diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> > > > > index a48668c36a191..89830380cebc5 100644
> > > > > --- a/arch/x86/kvm/svm/nested.c
> > > > > +++ b/arch/x86/kvm/svm/nested.c
> > > > > @@ -1020,6 +1020,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
> > > > >
> > > > >  	nested_svm_hv_update_vm_vp_ids(vcpu);
> > > > >
> > > > > +	if (from_vmrun && !nested_svm_merge_msrpm(vcpu))
> > > >
> > > > This is silly, just do:
> > >
> > > Ack. Any objections to just dropping from_vmrun and moving
> > > kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES) to svm_leave_smm()? I
> > > like the consistency of completely relying on from_vmrun or not at all
> >
> > Zero objections. When I was initially going through this, I actually thought you
> > were _adding_ the flag and was going to yell at you :-)
>
> Ugh from_vmrun is also plumbed into nested_svm_load_cr3() as
> reload_pdptrs. Apparently we shouldn't do that in the call path from
> svm_leave_smm()? Anyway, seems like it'll be non-trivial to detangle (at
> least for me, I have 0 understanding of SMM), so I will leave it as-is.

Agreed, there's enough refactoring going on as it is, no need to turn the
snowball into an avalanche.