Under FRED, entry_from_pv() handles everything, even system call instructions.
This means more of our logic is written in C now, rather than assembly.
In order to facilitate this, introduce pv_inject_callback(), which reuses
struct trap_bounce infrastructure to inject the syscall/sysenter callbacks.
This in turn requires some !PV compatibility for pv_inject_callback() and
pv_hypercall() which can both be ASSERT_UNREACHABLE().
For each of INT $N, SYSCALL and SYSENTER, FRED gives us interrupted context
which was previously lost. As the guest can't see FRED, Xen has to lose state
in the same way to maintain the prior behaviour.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
v3:
* Simplify DCE handling.
* Add ASSERT_UNREACHABLE() to pv_inject_callback().
* Adjust comment for X86_ET_SW_INT
v2:
* New
---
xen/arch/x86/include/asm/domain.h | 2 +
xen/arch/x86/include/asm/hypercall.h | 2 -
xen/arch/x86/pv/traps.c | 39 ++++++++
xen/arch/x86/traps.c | 131 +++++++++++++++++++++++++++
4 files changed, 172 insertions(+), 2 deletions(-)
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 94b0cf7f1d95..ad7f6adb2cb9 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -725,6 +725,8 @@ void arch_vcpu_regs_init(struct vcpu *v);
struct vcpu_hvm_context;
int arch_set_info_hvm_guest(struct vcpu *v, const struct vcpu_hvm_context *ctx);
+void pv_inject_callback(unsigned int type);
+
#ifdef CONFIG_PV
void pv_inject_event(const struct x86_event *event);
#else
diff --git a/xen/arch/x86/include/asm/hypercall.h b/xen/arch/x86/include/asm/hypercall.h
index bf2f0e169aef..d042a61d1702 100644
--- a/xen/arch/x86/include/asm/hypercall.h
+++ b/xen/arch/x86/include/asm/hypercall.h
@@ -18,9 +18,7 @@
#define __HYPERVISOR_paging_domctl_cont __HYPERVISOR_arch_1
-#ifdef CONFIG_PV
void pv_hypercall(struct cpu_user_regs *regs);
-#endif
void pv_ring1_init_hypercall_page(void *p);
void pv_ring3_init_hypercall_page(void *p);
diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
index b0395b99145a..c863ab9d372a 100644
--- a/xen/arch/x86/pv/traps.c
+++ b/xen/arch/x86/pv/traps.c
@@ -20,6 +20,8 @@
#include <asm/shared.h>
#include <asm/traps.h>
+#include <public/callback.h>
+
void pv_inject_event(const struct x86_event *event)
{
struct vcpu *curr = current;
@@ -96,6 +98,43 @@ void pv_inject_event(const struct x86_event *event)
}
}
+void pv_inject_callback(unsigned int type)
+{
+ struct vcpu *curr = current;
+ struct trap_bounce *tb = &curr->arch.pv.trap_bounce;
+ unsigned long rip;
+ bool irq;
+
+ ASSERT(is_pv_64bit_vcpu(curr));
+
+ switch ( type )
+ {
+ case CALLBACKTYPE_syscall:
+ rip = curr->arch.pv.syscall_callback_eip;
+ irq = curr->arch.pv.vgc_flags & VGCF_syscall_disables_events;
+ break;
+
+ case CALLBACKTYPE_syscall32:
+ rip = curr->arch.pv.syscall32_callback_eip;
+ irq = curr->arch.pv.syscall32_disables_events;
+ break;
+
+ case CALLBACKTYPE_sysenter:
+ rip = curr->arch.pv.sysenter_callback_eip;
+ irq = curr->arch.pv.sysenter_disables_events;
+ break;
+
+ default:
+ ASSERT_UNREACHABLE();
+ rip = 0;
+ irq = false;
+ break;
+ }
+
+ tb->flags = TBF_EXCEPTION | (irq ? TBF_INTERRUPT : 0);
+ tb->eip = rip;
+}
+
/*
* Called from asm to set up the MCE trapbounce info.
* Returns false no callback is set up, else true.
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 2f40f628cbff..e2c35a046e6b 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -18,6 +18,7 @@
#include <xen/delay.h>
#include <xen/domain_page.h>
#include <xen/guest_access.h>
+#include <xen/hypercall.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/paging.h>
@@ -51,6 +52,8 @@
#include <asm/traps.h>
#include <asm/uaccess.h>
+#include <public/callback.h>
+
/*
* opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
* fatal: Xen prints diagnostic message and then hangs.
@@ -2267,6 +2270,7 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
{
struct fred_info *fi = cpu_regs_fred_info(regs);
+ struct vcpu *curr = current;
uint8_t type = regs->fred_ss.type;
uint8_t vec = regs->fred_ss.vector;
@@ -2309,6 +2313,38 @@ void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
switch ( type )
{
+ case X86_ET_SW_INT:
+ /*
+ * For better or worse, Xen writes IDT vectors 3 and 4 with DPL3 (so
+ * INT3/INTO work), making INT $3/4 indistinguishable, and the guest
+ * choice of DPL for these vectors is ignored.
+ *
+ * Have them fall through into X86_ET_HW_EXC, as #BP in particular
+ * needs handling by do_int3() in case an external debugger is
+ * attached.
+ *
+ * As the event type is provided, INT $N instructions don't need #GP
+ * tricks to spot, and INT $0x80 doesn't need a fastpath. As the
+ * guest is necessarily PV64, INT $0x82 has no special meaning either.
+ *
+ * When converting to a fault, hardware finally gives us enough
+ * information to account for prefixes, so provide the more correct
+ * behaviour rather than assuming the instruction was two bytes long.
+ */
+ if ( vec != X86_EXC_BP && vec != X86_EXC_OF )
+ {
+ const struct trap_info *ti = &curr->arch.pv.trap_ctxt[vec];
+
+ if ( permit_softint(TI_GET_DPL(ti), curr, regs) )
+ pv_inject_sw_interrupt(vec);
+ else
+ {
+ regs->rip -= regs->fred_ss.insnlen;
+ pv_inject_hw_exception(X86_EXC_GP, (vec << 3) | X86_XEC_IDT);
+ }
+ break;
+ }
+ fallthrough;
case X86_ET_HW_EXC:
case X86_ET_PRIV_SW_EXC:
case X86_ET_SW_EXC:
@@ -2338,6 +2374,101 @@ void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
}
break;
+ case X86_ET_OTHER:
+ switch ( regs->fred_ss.vector )
+ {
+ case 1: /* SYSCALL */
+ {
+ /*
+ * FRED delivery preserves the interrupted %cs/%ss, but previously
+ * SYSCALL lost the interrupted selectors, and SYSRET forced the
+ * use of the ones in MSR_STAR.
+ *
+ * The guest isn't aware of FRED, so recreate the legacy
+ * behaviour.
+ *
+ * The non-FRED SYSCALL path sets TRAP_syscall in entry_vector to
+ * signal that SYSRET can be used, but this isn't relevant in FRED
+ * mode.
+ *
+ * When setting the selectors, clear all upper metadata again for
+ * backwards compatibility. In particular fred_ss.swint becomes
+ * pend_DB on ERETx, and nothing else in the pv_hypercall() path
+ * would clean up.
+ *
+ * When converting to a fault, hardware finally gives us enough
+ * information to account for prefixes, so provide the more
+ * correct behaviour rather than assuming the instruction was two
+ * bytes long.
+ */
+ bool l = regs->fred_ss.l;
+ unsigned int len = regs->fred_ss.insnlen;
+
+ regs->ssx = l ? FLAT_KERNEL_SS : FLAT_USER_SS32;
+ regs->csx = l ? FLAT_KERNEL_CS64 : FLAT_USER_CS32;
+
+ if ( guest_kernel_mode(curr, regs) )
+ pv_hypercall(regs);
+ else if ( (l ? curr->arch.pv.syscall_callback_eip
+ : curr->arch.pv.syscall32_callback_eip) == 0 )
+ {
+ regs->rip -= len;
+ pv_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
+ }
+ else
+ {
+ /*
+ * The PV ABI, given no virtual SYSCALL_MASK, hardcodes that
+ * DF is cleared. Other flags are handled in the same way as
+ * interrupts and exceptions in create_bounce_frame().
+ */
+ regs->eflags &= ~X86_EFLAGS_DF;
+ pv_inject_callback(l ? CALLBACKTYPE_syscall
+ : CALLBACKTYPE_syscall32);
+ }
+ break;
+ }
+
+ case 2: /* SYSENTER */
+ {
+ /*
+ * FRED delivery preserves the interrupted state, but previously
+ * SYSENTER discarded almost everything.
+ *
+ * The guest isn't aware of FRED, so recreate the legacy
+ * behaviour.
+ *
+ * When setting the selectors, clear all upper metadata. In
+ * particular fred_ss.swint becomes pend_DB on ERETx.
+ *
+ * When converting to a fault, hardware finally gives us enough
+ * information to account for prefixes, so provide the more
+ * correct behaviour rather than assuming the instruction was two
+ * bytes long.
+ */
+ unsigned int len = regs->fred_ss.insnlen;
+
+ regs->ssx = FLAT_USER_SS;
+ regs->rsp = 0;
+ regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ regs->csx = 3;
+ regs->rip = 0;
+
+ if ( !curr->arch.pv.sysenter_callback_eip )
+ {
+ regs->rip -= len;
+ pv_inject_hw_exception(X86_EXC_GP, 0);
+ }
+ else
+ pv_inject_callback(CALLBACKTYPE_sysenter);
+ break;
+ }
+
+ default:
+ goto fatal;
+ }
+ break;
+
default:
goto fatal;
}
--
2.39.5
© 2016 - 2026 Red Hat, Inc.