While we do this for unknown user mode exits, crashing for supervisor mode
exits is unhelpful. Intel in particular expect the unknown case to be #UD
because they do introduce new instructions with new VMEXIT_* codes without
other enablement controls. e.g. MSRLIST, USER_MSR, MSR_IMM, but AMD have
RDPRU and SKINIT as examples too.
A guest which gets its CPUID checks wrong can trigger the VMExit, and the
VMexit handler would need to emulate the CPUID check and #UD anyway. This
would be a guest bug, and giving it an exception is the right course of
action.
Drop the "#UD or crash" semantics and make it exclusively #UD. Rename the
lables to match the changed semantics. Fold the cases which were plain #UD's
too.
One case that was wrong beforehand was EPT_MISCONFIG. Failing to resolve is
always a Xen bug, and not even necessarily due to the instruction under %rip.
Convert it to be an unconditional domain_crash() with applicable diagnostic
information.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
---
xen/arch/x86/hvm/svm/svm.c | 29 +++++++++--------------------
xen/arch/x86/hvm/vmx/vmx.c | 26 +++++++++++---------------
2 files changed, 20 insertions(+), 35 deletions(-)
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index 2d7c598ffe99..637b47084c25 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -66,15 +66,6 @@ static bool amd_erratum383_found __read_mostly;
static uint64_t osvw_length, osvw_status;
static DEFINE_SPINLOCK(osvw_lock);
-/* Only crash the guest if the problem originates in kernel mode. */
-static void svm_crash_or_fault(struct vcpu *v)
-{
- if ( vmcb_get_cpl(v->arch.hvm.svm.vmcb) )
- hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
- else
- domain_crash(v->domain);
-}
-
void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
{
struct vcpu *curr = current;
@@ -85,7 +76,7 @@ void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
if ( unlikely(inst_len > MAX_INST_LEN) )
{
gdprintk(XENLOG_ERR, "Bad instruction length %u\n", inst_len);
- svm_crash_or_fault(curr);
+ hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
return;
}
@@ -2872,7 +2863,7 @@ void asmlinkage svm_vmexit_handler(void)
* task switch. Decode under %rip to find its length.
*/
if ( insn_len < 0 && (insn_len = svm_get_task_switch_insn_len()) == 0 )
- goto crash_or_fault;
+ goto inject_ud;
hvm_task_switch(vmcb->ei.task_switch.sel,
vmcb->ei.task_switch.iret ? TSW_iret :
@@ -2984,13 +2975,6 @@ void asmlinkage svm_vmexit_handler(void)
svm_vmexit_do_rdtsc(regs, exit_reason == VMEXIT_RDTSCP);
break;
- case VMEXIT_MONITOR:
- case VMEXIT_MWAIT:
- case VMEXIT_SKINIT:
- case VMEXIT_RDPRU:
- hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
- break;
-
case VMEXIT_VMRUN:
svm_vmexit_do_vmrun(regs, v, regs->rax);
break;
@@ -3083,8 +3067,13 @@ void asmlinkage svm_vmexit_handler(void)
gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
"exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
- crash_or_fault:
- svm_crash_or_fault(v);
+ fallthrough;
+ case VMEXIT_MONITOR:
+ case VMEXIT_MWAIT:
+ case VMEXIT_SKINIT:
+ case VMEXIT_RDPRU:
+ inject_ud:
+ hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
break;
}
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 6b407226c44c..1af5861ef921 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4337,7 +4337,7 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
__vmread(EXIT_QUALIFICATION, &exit_qualification);
rc = hvm_monitor_vmexit(exit_reason, exit_qualification);
if ( rc < 0 )
- goto exit_and_crash;
+ goto unexpected_vmexit;
if ( rc )
return;
}
@@ -4490,7 +4490,7 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
trap_type, insn_len, 0);
if ( rc < 0 )
- goto exit_and_crash;
+ goto unexpected_vmexit;
if ( !rc )
vmx_propagate_intr(intr_info);
}
@@ -4511,7 +4511,7 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
insn_len, 0);
if ( rc < 0 )
- goto exit_and_crash;
+ goto unexpected_vmexit;
if ( !rc )
vmx_propagate_intr(intr_info);
}
@@ -4557,7 +4557,7 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
case X86_EXC_NMI:
if ( MASK_EXTR(intr_info, INTR_INFO_INTR_TYPE_MASK) !=
X86_ET_NMI )
- goto exit_and_crash;
+ goto unexpected_vmexit;
TRACE(TRC_HVM_NMI);
/* Already handled above. */
break;
@@ -4571,7 +4571,7 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
break;
default:
TRACE(TRC_HVM_TRAP, vector);
- goto exit_and_crash;
+ goto unexpected_vmexit;
}
break;
}
@@ -4627,7 +4627,7 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
* rc > 0 paused waiting for response, work here is done
*/
if ( rc < 0 )
- goto exit_and_crash;
+ goto unexpected_vmexit;
if ( !rc )
update_guest_eip(); /* Safe: CPUID */
break;
@@ -4773,7 +4773,7 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
rc = hvm_monitor_io(io_qual.port, bytes, io_qual.in, io_qual.str);
if ( rc < 0 )
- goto exit_and_crash;
+ goto unexpected_vmexit;
if ( rc )
break;
@@ -4820,7 +4820,8 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
__vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
if ( !ept_handle_misconfig(gpa) )
- goto exit_and_crash;
+ domain_crash(v->domain,
+ "Unhandled EPT_MISCONFIG, gpa %#"PRIpaddr"\n", gpa);
break;
}
@@ -4902,14 +4903,9 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
case EXIT_REASON_INVPCID:
/* fall through */
default:
- exit_and_crash:
+ unexpected_vmexit:
gprintk(XENLOG_ERR, "Unexpected vmexit: reason %lu\n", exit_reason);
-
- if ( vmx_get_cpl() )
- hvm_inject_hw_exception(X86_EXC_UD,
- X86_EVENT_NO_EC);
- else
- domain_crash(v->domain);
+ hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
break;
}
base-commit: 117a46287427db2a6f5fe219eb2031d7ca39b603
--
2.39.5
On Fri Nov 28, 2025 at 6:47 PM CET, Andrew Cooper wrote:
> While we do this for unknown user mode exits, crashing for supervisor mode
> exits is unhelpful. Intel in particular expect the unknown case to be #UD
> because they do introduce new instructions with new VMEXIT_* codes without
> other enablement controls. e.g. MSRLIST, USER_MSR, MSR_IMM, but AMD have
> RDPRU and SKINIT as examples too.
I don't know how often Intel adds intercepts (or whatever the VMX equivalent is)
without default-off knobs, but there's a potentially dangerous assumption here
about all intercepts being synchronous with the executed instruction. Some might
depend on other events (i.e: NMIs, IRQs, IPIs, etc) and injecting #UD in those
cases would be very insecure for the guest. It might encourage the kernel to
interpret the current instruction that the kernel can't know wasn't meant to
ever trigger #UD. This would be an integrity-compromising mistake to make.
IOW, I think this is a dangerous default to have and Xen should just crash the
domain irrespective of CPL. At least on SVM. If a guest executes SKINIT and it
doesn't exist
>
> A guest which gets its CPUID checks wrong can trigger the VMExit, and the
> VMexit handler would need to emulate the CPUID check and #UD anyway. This
> would be a guest bug, and giving it an exception is the right course of
> action.
You'd need a guest bug compounded with an outdated hypervisor to reproduce the
crash, and even then it's perfectly benign. Beats the alternative described
above, IMO.
>
> Drop the "#UD or crash" semantics and make it exclusively #UD. Rename the
> lables to match the changed semantics. Fold the cases which were plain #UD's
> too.
nit: typo s/lables/labels
Also, why does emacs double-space after each sentence on reflow? You have them
in a number of places.
>
> One case that was wrong beforehand was EPT_MISCONFIG. Failing to resolve is
> always a Xen bug, and not even necesserily due to the instruction under %rip.
> Convert it to be an unconditional domain_crash() with applicable diagnostic
> information.
One example of the above synchronous vs asynchronous events.
>
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> ---
> xen/arch/x86/hvm/svm/svm.c | 29 +++++++++--------------------
> xen/arch/x86/hvm/vmx/vmx.c | 26 +++++++++++---------------
> 2 files changed, 20 insertions(+), 35 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
> index 2d7c598ffe99..637b47084c25 100644
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -66,15 +66,6 @@ static bool amd_erratum383_found __read_mostly;
> static uint64_t osvw_length, osvw_status;
> static DEFINE_SPINLOCK(osvw_lock);
>
> -/* Only crash the guest if the problem originates in kernel mode. */
> -static void svm_crash_or_fault(struct vcpu *v)
> -{
> - if ( vmcb_get_cpl(v->arch.hvm.svm.vmcb) )
> - hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
> - else
> - domain_crash(v->domain);
> -}
> -
> void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
> {
> struct vcpu *curr = current;
> @@ -85,7 +76,7 @@ void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
> if ( unlikely(inst_len > MAX_INST_LEN) )
> {
> gdprintk(XENLOG_ERR, "Bad instruction length %u\n", inst_len);
> - svm_crash_or_fault(curr);
> + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
This seems more than fair enough.
> return;
> }
>
> @@ -2872,7 +2863,7 @@ void asmlinkage svm_vmexit_handler(void)
> * task switch. Decode under %rip to find its length.
> */
> if ( insn_len < 0 && (insn_len = svm_get_task_switch_insn_len()) == 0 )
> - goto crash_or_fault;
> + goto inject_ud;
And so does this (semantically).
>
> hvm_task_switch(vmcb->ei.task_switch.sel,
> vmcb->ei.task_switch.iret ? TSW_iret :
> @@ -2984,13 +2975,6 @@ void asmlinkage svm_vmexit_handler(void)
> svm_vmexit_do_rdtsc(regs, exit_reason == VMEXIT_RDTSCP);
> break;
>
> - case VMEXIT_MONITOR:
> - case VMEXIT_MWAIT:
> - case VMEXIT_SKINIT:
> - case VMEXIT_RDPRU:
> - hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
> - break;
> -
> case VMEXIT_VMRUN:
> svm_vmexit_do_vmrun(regs, v, regs->rax);
> break;
> @@ -3083,8 +3067,13 @@ void asmlinkage svm_vmexit_handler(void)
> gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
> "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
> exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
> - crash_or_fault:
> - svm_crash_or_fault(v);
> + fallthrough;
> + case VMEXIT_MONITOR:
> + case VMEXIT_MWAIT:
> + case VMEXIT_SKINIT:
> + case VMEXIT_RDPRU:
> + inject_ud:
> + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
I understand monitor, mwait, skinit and rdpru triggering #UD, but in the
default case (and we're also covering "default" out of context) injecting #UD
might be meaningless, plain wrong or highly misleading (e.g. imagine receiving
#UD on IRQ). VMEXITs happen due to optional or mandatory intercepts about
synchronous and asynchronous events. If Xen receives a VMEXIT it doesn't
understand, the ideal would be to BUG(), because they don't just come out of thin
air. You're meant to enable the intercepts in the VMCB. For defensive purposes
a domain_crash() would be fine too, as it partly is now. I don't understand
the CPL distinction. The kernel might trigger highly dangerous and
integrity-compromising paths if it chooses to interpret #UD and it happens to
be an externally triggered event rather than something related to the current
instruction.
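
For comparison, a minimal sketch (not part of the posted patch, and reusing
only calls and case labels already visible in the SVM hunk above) of the
CPL-independent handling argued for here: the known instruction intercepts
keep getting #UD, while anything genuinely unexpected crashes the domain:

    case VMEXIT_MONITOR:
    case VMEXIT_MWAIT:
    case VMEXIT_SKINIT:
    case VMEXIT_RDPRU:
        /* Reachable by guests on capable hardware; #UD is the architectural answer. */
        hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
        break;

    default:
        /* Unknown exit: treat as a Xen/hardware problem rather than guessing with #UD. */
        gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
                "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
                exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
        domain_crash(v->domain);
        break;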
(dropped VMX, because I don't know about it)
Cheers,
Alejandro
On Mon Dec 1, 2025 at 3:52 PM CET, Alejandro Vallejo wrote:
> On Fri Nov 28, 2025 at 6:47 PM CET, Andrew Cooper wrote:
>> While we do this for unknown user mode exits, crashing for supervisor mode
>> exits is unhelpful. Intel in particular expect the unknown case to be #UD
>> because they do introduce new instructions with new VMEXIT_* codes without
>> other enablement controls. e.g. MSRLIST, USER_MSR, MSR_IMM, but AMD have
>> RDPRU and SKINIT as examples too.
>
> I don't know how often Intel adds intercepts (or whatever the VMX equivalent is)
> without default-off knobs, but there's a potentially dangerous assumption here
> about all intercepts being synchronous with the executed instruction. Some might
> depend on other events (i.e: NMIs, IRQs, IPIs, etc) and injecting #UD in those
> cases would be very insecure for the guest. It might encourage the kernel to
> interpret the current instruction that the kernel can't know wasn't meant to
> ever trigger #UD. This would be an integrity-compromising mistake to make.
>
> IOW, I think this is a dangerous default to have and Xen should just crash the
> domain irrespective of CPL. At least on SVM. If a guest executes SKINIT and it
> doesn't exist

... and it doesn't exist, it's fine for a guest to crash. The domain crashing
is a Xen bug, but the bug triggering is a guest bug. And that's ok.

Sorry, those lines got lost.

Cheers,
Alejandro
On 28.11.2025 18:47, Andrew Cooper wrote:
> While we do this for unknown user mode exits, crashing for supervisor mode
> exits is unhelpful. Intel in particular expect the unknown case to be #UD
> because they do introduce new instructions with new VMEXIT_* codes without
> other enablement controls. e.g. MSRLIST, USER_MSR, MSR_IMM, but AMD have
> RDPRU and SKINIT as examples too.

USER-MSR has MSR_USER_MSR_CTL, so doesn't fully fit here? (It's still not us
to directly control exposure of the insns, but an OS would need to use the
MSR to do so, and hence we need to properly handle writes to that MSR for
the respective exits to become possible.)

MSRLIST has a dedicated exec control, so whether the exits can occur is
under our control.

RDPRU and SKINIT have dedicated intercepts, without use of which the
respective exit can't occur aiui.

IOW it looks to be really only MSR-IMM which is special.

> @@ -3083,8 +3067,13 @@ void asmlinkage svm_vmexit_handler(void)
> gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
> "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
> exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
> - crash_or_fault:
> - svm_crash_or_fault(v);
> + fallthrough;
> + case VMEXIT_MONITOR:
> + case VMEXIT_MWAIT:
> + case VMEXIT_SKINIT:
> + case VMEXIT_RDPRU:
> + inject_ud:
> + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
> break;
> }
>

Should this be brought more in line with respective VMX code (kept) below,
in never bypassing the gprintk() by any of the case labels? Basically
meaning that the case labels you move could simply be dropped for the time
being (or else, like the INVPCID one visible in context below, would want
re-inserting a few lines earlier).

Jan

> @@ -4902,14 +4903,9 @@ void asmlinkage vmx_vmexit_handler(struct cpu_user_regs *regs)
> case EXIT_REASON_INVPCID:
> /* fall through */
> default:
> - exit_and_crash:
> + unexpected_vmexit:
> gprintk(XENLOG_ERR, "Unexpected vmexit: reason %lu\n", exit_reason);
> -
> - if ( vmx_get_cpl() )
> - hvm_inject_hw_exception(X86_EXC_UD,
> - X86_EVENT_NO_EC);
> - else
> - domain_crash(v->domain);
> + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
> break;
> }
>
>
> base-commit: 117a46287427db2a6f5fe219eb2031d7ca39b603
On 01/12/2025 9:02 am, Jan Beulich wrote:
> On 28.11.2025 18:47, Andrew Cooper wrote:
>> While we do this for unknown user mode exits, crashing for supervisor mode
>> exits is unhelpful. Intel in particular expect the unknown case to be #UD
>> because they do introduce new instructions with new VMEXIT_* codes without
>> other enablement controls. e.g. MSRLIST, USER_MSR, MSR_IMM, but AMD have
>> RDPRU and SKINIT as examples too.
> USER-MSR has MSR_USER_MSR_CTL, so doesn't fully fit here? (It's still not us
> to directly control exposure of the insns, but an OS would need to use the
> MSR to do so, and hence we need to properly handle writes to that MSR for
> the respective exits to become possible.)

Oh yes, and the #GP from failing the MSR_USER_MSR_CTL check take
priority over the intercept.

> MSRLIST has a dedicated exec control, so whether the exits can occur is
> under our control.

Ah ok.

> RDPRU and SKINIT have dedicated intercepts, without use of which the
> respective exit can't occur aiui.

Correct, but note how we intercept them unconditionally?

MONITOR, MWAIT and SKINIT are for Xen's safety, because otherwise the
instructions would execute normally in guest context.

RDPRU is to block access to the perf counters, because a guest has no
legitimate use for them.

There are no enablement controls for these instructions. They're always
guest-available (on capable hardware) if not intercepted.

>
> IOW it looks to be really only MSR-IMM which is special.
>
>> @@ -3083,8 +3067,13 @@ void asmlinkage svm_vmexit_handler(void)
>> gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
>> "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
>> exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
>> - crash_or_fault:
>> - svm_crash_or_fault(v);
>> + fallthrough;
>> + case VMEXIT_MONITOR:
>> + case VMEXIT_MWAIT:
>> + case VMEXIT_SKINIT:
>> + case VMEXIT_RDPRU:
>> + inject_ud:
>> + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
>> break;
>> }
>>
> Should this be brought more in line with respective VMX code (kept) below,
> in never bypassing the gprintk() by any of the case labels? Basically
> meaning that the case labels you move could simply be dropped for the time
> being (or else, like the INVPCID one visible in context below, would want
> re-inserting a few lines earlier).

As said, they're all reachable by guests on capable hardware.

I could add a /* Not implemented for guests */ if that would make it
clearer?

But, we don't want the printk(). We know the labels are reachable, and
#UD is the right action.

~Andrew
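As a rough illustration of that suggestion, a sketch of the grouped case
labels with such a comment added (only the comment is new; the labels and the
call are taken from the patch above):

    /* Not implemented for guests. */
    case VMEXIT_MONITOR:
    case VMEXIT_MWAIT:
    case VMEXIT_SKINIT:
    case VMEXIT_RDPRU:
    inject_ud:
        hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
        break;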
On 01.12.2025 17:36, Andrew Cooper wrote:
> On 01/12/2025 9:02 am, Jan Beulich wrote:
>> On 28.11.2025 18:47, Andrew Cooper wrote:
>>> While we do this for unknown user mode exits, crashing for supervisor mode
>>> exits is unhelpful. Intel in particular expect the unknown case to be #UD
>>> because they do introduce new instructions with new VMEXIT_* codes without
>>> other enablement controls. e.g. MSRLIST, USER_MSR, MSR_IMM, but AMD have
>>> RDPRU and SKINIT as examples too.
>> USER-MSR has MSR_USER_MSR_CTL, so doesn't fully fit here? (It's still not us
>> to directly control exposure of the insns, but an OS would need to use the
>> MSR to do so, and hence we need to properly handle writes to that MSR for
>> the respective exits to become possible.)
>
> Oh yes, and the #GP from failing the MSR_USER_MSR_CTL check take
> priority over the intercept.
>
>> MSRLIST has a dedicated exec control, so whether the exits can occur is
>> under our control.
>
> Ah ok.
>
>
>> RDPRU and SKINIT have dedicated intercepts, without use of which the
>> respective exit can't occur aiui.
>
> Correct, but note how we intercept them unconditionally?
>
> MONITOR, MWAIT and SKINIT are for Xen's safety, because otherwise the
> instructions would execute normally in guest context.
>
> RDPRU is to block access to the perf counters, because a guest has no
> legitimate use for them.
>
> There are no enablement controls for these instructions. They're always
> guest-available (on capable hardware) if not intercepted.

For our purposes, the intercept is the enable (i.e. we disable their use
by injecting #UD if the intercept triggers). IOW I think those are
slightly different in any event, in not really being "unknown". I don't
mind their mentioning, but I think the distinction wants to at least be
expressed somehow.

>>> @@ -3083,8 +3067,13 @@ void asmlinkage svm_vmexit_handler(void)
>>> gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
>>> "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
>>> exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
>>> - crash_or_fault:
>>> - svm_crash_or_fault(v);
>>> + fallthrough;
>>> + case VMEXIT_MONITOR:
>>> + case VMEXIT_MWAIT:
>>> + case VMEXIT_SKINIT:
>>> + case VMEXIT_RDPRU:
>>> + inject_ud:
>>> + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
>>> break;
>>> }
>>>
>> Should this be brought more in line with respective VMX code (kept) below,
>> in never bypassing the gprintk() by any of the case labels? Basically
>> meaning that the case labels you move could simply be dropped for the time
>> being (or else, like the INVPCID one visible in context below, would want
>> re-inserting a few lines earlier).
>
> As said, they're all reachable by guests on capable hardware.
>
> I could add a /* Not implemented for guests */ if that would make it
> clearer?

Yes, ideally with "yet" also added - recall I've been sitting on an RDPRU
emulator patch, awaiting you to fulfill your promise of sorting the CPUID
side of things there.

> But, we don't want the printk(). We know the labels are reachable, and
> #UD is the right action.

Hmm, yes, with what you have said further up I think I agree. Yet then my
question goes the other way around: Do we want the log message for the
(at least) two known exits in VMX, which are grouped with the default:
label?

IOW I'm still puzzled by the asymmetry between SVM and VMX in this regard.

Jan
On 02/12/2025 8:33 am, Jan Beulich wrote:
> On 01.12.2025 17:36, Andrew Cooper wrote:
>> On 01/12/2025 9:02 am, Jan Beulich wrote:
>>> On 28.11.2025 18:47, Andrew Cooper wrote:
>>>> While we do this for unknown user mode exits, crashing for supervisor mode
>>>> exits is unhelpful. Intel in particular expect the unknown case to be #UD
>>>> because they do introduce new instructions with new VMEXIT_* codes without
>>>> other enablement controls. e.g. MSRLIST, USER_MSR, MSR_IMM, but AMD have
>>>> RDPRU and SKINIT as examples too.
>>> USER-MSR has MSR_USER_MSR_CTL, so doesn't fully fit here? (It's still not us
>>> to directly control exposure of the insns, but an OS would need to use the
>>> MSR to do so, and hence we need to properly handle writes to that MSR for
>>> the respective exits to become possible.)
>> Oh yes, and the #GP from failing the MSR_USER_MSR_CTL check take
>> priority over the intercept.
>>
>>> MSRLIST has a dedicated exec control, so whether the exits can occur is
>>> under our control.
>> Ah ok.
>>
>>
>>> RDPRU and SKINIT have dedicated intercepts, without use of which the
>>> respective exit can't occur aiui.
>> Correct, but note how we intercept them unconditionally?
>>
>> MONITOR, MWAIT and SKINIT are for Xen's safety, because otherwise the
>> instructions would execute normally in guest context.
>>
>> RDPRU is to block access to the perf counters, because a guest has no
>> legitimate use for them.
>>
>> There are no enablement controls for these instructions. They're always
>> guest-available (on capable hardware) if not intercepted.
> For our purposes, the intercept is the enable

No - very critically not.

If we were to not intercept SKINIT, the guest could INIT the logical
processor (into a weird protected mode, rather than real mode), and reset
the DRTM PCRs in the TPM.

This is an example of a capability which *should* have had an enable
(FamFh Rev F and G and Fam10h Rev B had SVM but not SKINIT) but didn't.
It is always available in guest mode on capable hardware, and Xen *must*
intercept SKINIT to keep control of the system.

The absence of an enable should also have had a CVE but I can't locate one.

It turns out to be the same for MONITOR, which appeared in Fam10h Rev B
and has no enable.

So actually, AMD is full of examples where we ought to have a CPU support
policy...

> (i.e. we disable their use
> by injecting #UD if the intercept triggers). IOW I think those are
> slightly different in any event, in not really being "unknown". I don't
> mind their mentioning, but I think the distinction wants to at least be
> expressed somehow.
>
>>>> @@ -3083,8 +3067,13 @@ void asmlinkage svm_vmexit_handler(void)
>>>> gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
>>>> "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
>>>> exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
>>>> - crash_or_fault:
>>>> - svm_crash_or_fault(v);
>>>> + fallthrough;
>>>> + case VMEXIT_MONITOR:
>>>> + case VMEXIT_MWAIT:
>>>> + case VMEXIT_SKINIT:
>>>> + case VMEXIT_RDPRU:
>>>> + inject_ud:
>>>> + hvm_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
>>>> break;
>>>> }
>>>>
>>> Should this be brought more in line with respective VMX code (kept) below,
>>> in never bypassing the gprintk() by any of the case labels? Basically
>>> meaning that the case labels you move could simply be dropped for the time
>>> being (or else, like the INVPCID one visible in context below, would want
>>> re-inserting a few lines earlier).
>> As said, they're all reachable by guests on capable hardware.
>>
>> I could add a /* Not implemented for guests */ if that would make it
>> clearer?
> Yes, ideally with "yet" also added - recall I've been sitting on an RDPRU
> emulator patch, awaiting you to fulfill your promise of sorting the CPUID
> side of things there.
>
>> But, we don't want the printk(). We know the labels are reachable, and
>> #UD is the right action.
> Hmm, yes, with what you have said further up I think I agree. Yet then my
> question goes the other way around: Do we want the log message for the
> (at least) two known exits in VMX, which are grouped with the default:
> label?

The preemption timer and INVPCID? Those have explicit enables and are
either not configured (preemption timer, and INVPCID with Shadow), or
enabled and not intercepted (INVPCID on capable hardware and with HAP).

Getting the VMExit is either a configuration error in Xen, or a
hardware/VMM violation of the VT-x spec. I suppose we probably want to
make those into explicit domain crashes.

> IOW I'm still puzzled by the asymmetry between SVM and VMX in this regard.

VT-x and SVM are different in almost every regard. I find it surprising
how well they manage to approximate a working system given the differences.

~Andrew
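A minimal sketch of that follow-up idea for the VMX side (not a posted patch;
EXIT_REASON_INVPCID and the calls below are taken from the diff above, and
the preemption-timer case is left out since its exact label isn't shown in
this thread):

    case EXIT_REASON_INVPCID:
        /*
         * Either not configured (shadow) or enabled without interception
         * (HAP), so this exit should be impossible.  Treat it as a
         * Xen/hardware error rather than handing the guest a #UD.
         */
        gprintk(XENLOG_ERR, "Unexpected vmexit: reason %lu\n", exit_reason);
        domain_crash(v->domain);
        break;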