[PATCH v2 22/23] x86/pv: System call handling in FRED mode

Andrew Cooper posted 23 patches 2 months ago
There is a newer version of this series
[PATCH v2 22/23] x86/pv: System call handling in FRED mode
Posted by Andrew Cooper 2 months ago
Under FRED, entry_from_pv() handles everything, even system calls.  This means
more of our logic is written in C now, rather than assembly.

In order to facilitate this, introduce pv_inject_callback(), which reuses
struct trap_bounce infrastructure to inject the syscall/sysenter callbacks.
This in turn requires some !PV compatibility for pv_inject_callback() and
pv_hypercall(), which can both be ASSERT_UNREACHABLE().

For each of INT $N, SYSCALL and SYSENTER, FRED gives us interrupted context
which was previously lost.  As the guest can't see FRED, Xen has to lose state
in the same way to maintain the prior behaviour.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v2:
 * New
---
 xen/arch/x86/include/asm/domain.h    |   5 ++
 xen/arch/x86/include/asm/hypercall.h |   5 ++
 xen/arch/x86/pv/traps.c              |  33 +++++++++
 xen/arch/x86/traps.c                 | 107 +++++++++++++++++++++++++++
 4 files changed, 150 insertions(+)

diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 5df8c7825333..b374decccc9c 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -712,11 +712,16 @@ int arch_set_info_hvm_guest(struct vcpu *v, const struct vcpu_hvm_context *ctx);
 
 #ifdef CONFIG_PV
 void pv_inject_event(const struct x86_event *event);
+void pv_inject_callback(unsigned int type);
 #else
 static inline void pv_inject_event(const struct x86_event *event)
 {
     ASSERT_UNREACHABLE();
 }
+static inline void pv_inject_callback(unsigned int type)
+{
+    ASSERT_UNREACHABLE();
+}
 #endif
 
 static inline void pv_inject_hw_exception(unsigned int vector, int errcode)
diff --git a/xen/arch/x86/include/asm/hypercall.h b/xen/arch/x86/include/asm/hypercall.h
index f6e9e2313b3c..1010332a47e9 100644
--- a/xen/arch/x86/include/asm/hypercall.h
+++ b/xen/arch/x86/include/asm/hypercall.h
@@ -20,6 +20,11 @@
 
 #ifdef CONFIG_PV
 void pv_hypercall(struct cpu_user_regs *regs);
+#else
+static inline void pv_hypercall(struct cpu_user_regs *regs)
+{
+    ASSERT_UNREACHABLE();
+}
 #endif
 
 void pv_ring1_init_hypercall_page(void *ptr);
diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
index c3c0976c440f..e7314d8703d9 100644
--- a/xen/arch/x86/pv/traps.c
+++ b/xen/arch/x86/pv/traps.c
@@ -19,6 +19,8 @@
 #include <asm/shared.h>
 #include <asm/traps.h>
 
+#include <public/callback.h>
+
 void pv_inject_event(const struct x86_event *event)
 {
     struct vcpu *curr = current;
@@ -95,6 +97,37 @@ void pv_inject_event(const struct x86_event *event)
     }
 }
 
+void pv_inject_callback(unsigned int type)
+{
+    struct vcpu *curr = current;
+    struct trap_bounce *tb = &curr->arch.pv.trap_bounce;
+    unsigned long rip = 0;
+    bool irq = false;
+
+    ASSERT(is_pv_64bit_vcpu(curr));
+
+    switch ( type )
+    {
+    case CALLBACKTYPE_syscall:
+        rip = curr->arch.pv.syscall_callback_eip;
+        irq = curr->arch.pv.vgc_flags & VGCF_syscall_disables_events;
+        break;
+
+    case CALLBACKTYPE_syscall32:
+        rip = curr->arch.pv.syscall32_callback_eip;
+        irq = curr->arch.pv.syscall32_disables_events;
+        break;
+
+    case CALLBACKTYPE_sysenter:
+        rip = curr->arch.pv.sysenter_callback_eip;
+        irq = curr->arch.pv.sysenter_disables_events;
+        break;
+    }
+
+    tb->flags = TBF_EXCEPTION | (irq ? TBF_INTERRUPT : 0);
+    tb->eip = rip;
+}
+
 /*
  * Called from asm to set up the MCE trapbounce info.
  * Returns false no callback is set up, else true.
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index e10b4e771824..9211067cd688 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -18,6 +18,7 @@
 #include <xen/delay.h>
 #include <xen/domain_page.h>
 #include <xen/guest_access.h>
+#include <xen/hypercall.h>
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/paging.h>
@@ -52,6 +53,8 @@
 #include <asm/uaccess.h>
 #include <asm/xenoprof.h>
 
+#include <public/callback.h>
+
 /*
  * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
  *  fatal:  Xen prints diagnostic message and then hangs.
@@ -2266,6 +2269,7 @@ void asmlinkage check_ist_exit(const struct cpu_user_regs *regs, bool ist_exit)
 void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
 {
     struct fred_info *fi = cpu_regs_fred_info(regs);
+    struct vcpu *curr = current;
     uint8_t type = regs->fred_ss.type;
     uint8_t vec = regs->fred_ss.vector;
 
@@ -2305,6 +2309,27 @@ void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
 
     switch ( type )
     {
+    case X86_ET_SW_INT:
+        /*
+         * INT $3/4 are indistinguishable from INT3/INTO under IDT, and are
+         * permitted by Xen without the guest kernel having a choice.  Let
+         * them fall through into X86_ET_HW_EXC, as #BP in particular needs
+         * handling by do_int3() in case an external debugger is attached.
+         */
+        if ( vec != X86_EXC_BP && vec != X86_EXC_OF )
+        {
+            const struct trap_info *ti = &curr->arch.pv.trap_ctxt[vec];
+
+            if ( permit_softint(TI_GET_DPL(ti), curr, regs) )
+                pv_inject_sw_interrupt(vec);
+            else
+            {
+                regs->rip -= 2;
+                pv_inject_hw_exception(X86_EXC_GP, (vec << 3) | X86_XEC_IDT);
+            }
+            break;
+        }
+        fallthrough;
     case X86_ET_HW_EXC:
     case X86_ET_PRIV_SW_EXC:
     case X86_ET_SW_EXC:
@@ -2335,6 +2360,88 @@ void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
         }
         break;
 
+    case X86_ET_OTHER:
+        switch ( regs->fred_ss.vector )
+        {
+        case 1: /* SYSCALL */
+        {
+            /*
+             * FRED delivery preserves the interrupted %cs/%ss, but previously
+             * SYSCALL lost the interrupted selectors, and SYSRET forced the
+             * use of the ones in MSR_STAR.
+             *
+             * The guest isn't aware of FRED, so recreate the legacy
+             * behaviour, including the guess of instruction length for
+             * faults.
+             *
+             * The non-FRED SYSCALL path sets TRAP_syscall in entry_vector to
+             * signal that SYSRET can be used, but this isn't relevant in FRED
+             * mode.
+             *
+             * When setting the selectors, clear all upper metadata again for
+             * backwards compatibility.  In particular fred_ss.swint becomes
+             * pend_DB on ERETx, and nothing else in the pv_hypercall() would
+             * clean up.
+             */
+            bool l = regs->fred_ss.l;
+
+            regs->ssx = l ? FLAT_KERNEL_SS   : FLAT_USER_SS32;
+            regs->csx = l ? FLAT_KERNEL_CS64 : FLAT_USER_CS32;
+
+            if ( guest_kernel_mode(curr, regs) )
+                pv_hypercall(regs);
+            else if ( (l ? curr->arch.pv.syscall_callback_eip
+                         : curr->arch.pv.syscall32_callback_eip) == 0 )
+            {
+                regs->rip -= 2;
+                pv_inject_hw_exception(X86_EXC_UD, X86_EVENT_NO_EC);
+            }
+            else
+            {
+                /*
+                 * The PV ABI, given no virtual SYSCALL_MASK, hardcodes that
+                 * DF is cleared.  Other flags are handled in the same way as
+                 * interrupts and exceptions in create_bounce_frame().
+                 */
+                regs->eflags &= ~X86_EFLAGS_DF;
+                pv_inject_callback(l ? CALLBACKTYPE_syscall
+                                     : CALLBACKTYPE_syscall32);
+            }
+            break;
+        }
+
+        case 2: /* SYSENTER */
+            /*
+             * FRED delivery preserves the interrupted state, but previously
+             * SYSENTER discarded almost everything.
+             *
+             * The guest isn't aware of FRED, so recreate the legacy
+             * behaviour, including the guess of instruction length for
+             * faults.
+             *
+             * When setting the selectors, clear all upper metadata.  In
+             * particular fred_ss.swint becomes pend_DB on ERETx.
+             */
+            regs->ssx = FLAT_USER_SS;
+            regs->rsp = 0;
+            regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+            regs->csx = 3;
+            regs->rip = 0;
+
+            if ( !curr->arch.pv.sysenter_callback_eip )
+            {
+                regs->rip -= 2;
+                pv_inject_hw_exception(X86_EXC_GP, 0);
+            }
+            else
+                pv_inject_callback(CALLBACKTYPE_sysenter);
+            break;
+
+        default:
+            goto fatal;
+        }
+        break;
+
     default:
         goto fatal;
     }
-- 
2.39.5


Re: [PATCH v2 22/23] x86/pv: System call handling in FRED mode
Posted by Jan Beulich 1 month, 4 weeks ago
On 28.08.2025 17:04, Andrew Cooper wrote:
> Under FRED, entry_from_pv() handles everything, even system calls.  This means
> more of our logic is written in C now, rather than assembly.
> 
> In order to facilitate this, introduce pv_inject_callback(), which reuses
> struct trap_bounce infrastructure to inject the syscall/sysenter callbacks.
> This in turns requires some !PV compatibility for pv_inject_callback() and
> pv_hypercall() which can both be ASSERT_UNREACHABLE().
> 
> For each of INT $N, SYSCALL and SYSENTER, FRED gives us interrupted context
> which was previously lost.  As the guest can't see FRED, Xen has to lose state
> in the same way to maintain the prior behaviour.

In principle we could expose a new capability to the guest allowing it to
request that we preserve state. Question of course is whether that would
be of any practical use.

> --- a/xen/arch/x86/include/asm/domain.h
> +++ b/xen/arch/x86/include/asm/domain.h
> @@ -712,11 +712,16 @@ int arch_set_info_hvm_guest(struct vcpu *v, const struct vcpu_hvm_context *ctx);
>  
>  #ifdef CONFIG_PV
>  void pv_inject_event(const struct x86_event *event);
> +void pv_inject_callback(unsigned int type);
>  #else
>  static inline void pv_inject_event(const struct x86_event *event)
>  {
>      ASSERT_UNREACHABLE();
>  }
> +static inline void pv_inject_callback(unsigned int type)
> +{
> +    ASSERT_UNREACHABLE();
> +}
>  #endif

We don't really need this, nor ...

> --- a/xen/arch/x86/include/asm/hypercall.h
> +++ b/xen/arch/x86/include/asm/hypercall.h
> @@ -20,6 +20,11 @@
>  
>  #ifdef CONFIG_PV
>  void pv_hypercall(struct cpu_user_regs *regs);
> +#else
> +static inline void pv_hypercall(struct cpu_user_regs *regs)
> +{
> +    ASSERT_UNREACHABLE();
> +}
>  #endif

... this, do we? If you expose the decls outside of the #ifdef, I can't help
the impression that all call sites will simply be DCE-ed (thanks to the
!IS_ENABLED(CONFIG_PV) check at the top of entry_from_pv()).

> --- a/xen/arch/x86/pv/traps.c
> +++ b/xen/arch/x86/pv/traps.c
> @@ -19,6 +19,8 @@
>  #include <asm/shared.h>
>  #include <asm/traps.h>
>  
> +#include <public/callback.h>
> +
>  void pv_inject_event(const struct x86_event *event)
>  {
>      struct vcpu *curr = current;
> @@ -95,6 +97,37 @@ void pv_inject_event(const struct x86_event *event)
>      }
>  }
>  
> +void pv_inject_callback(unsigned int type)
> +{
> +    struct vcpu *curr = current;
> +    struct trap_bounce *tb = &curr->arch.pv.trap_bounce;
> +    unsigned long rip = 0;
> +    bool irq = false;

Move the latter two initializers into a default: case, after an
ASSERT_UNREACHABLE()?

> +    ASSERT(is_pv_64bit_vcpu(curr));

I was first wondering why you check this here, but yes, PV32 is disabled when
FRED is enabled. IOW if a new use for this function turned up, this could
validly be relaxed.

> @@ -2305,6 +2309,27 @@ void asmlinkage entry_from_pv(struct cpu_user_regs *regs)
>  
>      switch ( type )
>      {
> +    case X86_ET_SW_INT:
> +        /*
> +         * INT $3/4 are indistinguishable from INT3/INTO under IDT, and are

Didn't we discuss this the other day? They are distinguishable, as long as you
set their gates' DPL to 0. Just that we use DPL 3. Hence I think this wants
wording a little differently, to make clear it's our (possibly wrong) choice.

> +         * permitted by Xen without the guest kernel having a choice.  Let

Doesn't the guest have a choice by using TI_SET_DPL() suitably?

> +         * them fall through into X86_ET_HW_EXC, as #BP in particular needs
> +         * handling by do_int3() in case an external debugger is attached.
> +         */

I don't understand this, though. An external debugger would better not place
breakpoints using CD 03, so I think we'd better wire these up the normal INT nn
way. And for #OF I also don't think we need to make an exception.

Jan