Instead of using a function table use the generated switch statement
macros for calling the appropriate hypercall handlers.
This is beneficial to performance and avoids speculation issues.
With calling the handlers using the correct number of parameters now
it is possible to do the parameter register clobbering in the NDEBUG
case after returning from the handler. This in turn removes the only
users of hypercall_args_table[] which can be removed now.
Signed-off-by: Juergen Gross <jgross@suse.com>
---
xen/arch/x86/hvm/hypercall.c | 144 +++----------------------
xen/arch/x86/hypercall.c | 59 -----------
xen/arch/x86/pv/hypercall.c | 180 +++-----------------------------
xen/include/asm-x86/hypercall.h | 43 +++++---
4 files changed, 60 insertions(+), 366 deletions(-)
diff --git a/xen/arch/x86/hvm/hypercall.c b/xen/arch/x86/hvm/hypercall.c
index 85b7a33523..e766cf4c72 100644
--- a/xen/arch/x86/hvm/hypercall.c
+++ b/xen/arch/x86/hvm/hypercall.c
@@ -108,56 +108,10 @@ long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
return compat_physdev_op(cmd, arg);
}
-#define HYPERCALL(x) \
- [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x, \
- (hypercall_fn_t *) do_ ## x }
-
-#define HVM_CALL(x) \
- [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) hvm_ ## x, \
- (hypercall_fn_t *) hvm_ ## x }
-
-#define COMPAT_CALL(x) \
- [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x, \
- (hypercall_fn_t *) compat_ ## x }
-
-static const struct {
- hypercall_fn_t *native, *compat;
-} hvm_hypercall_table[] = {
- HVM_CALL(memory_op),
- COMPAT_CALL(multicall),
-#ifdef CONFIG_GRANT_TABLE
- HVM_CALL(grant_table_op),
-#endif
- HYPERCALL(vm_assist),
- COMPAT_CALL(vcpu_op),
- HVM_CALL(physdev_op),
- COMPAT_CALL(xen_version),
- HYPERCALL(console_io),
- HYPERCALL(event_channel_op),
- COMPAT_CALL(sched_op),
- COMPAT_CALL(set_timer_op),
- COMPAT_CALL(xsm_op),
- HYPERCALL(hvm_op),
- HYPERCALL(sysctl),
- HYPERCALL(domctl),
-#ifdef CONFIG_ARGO
- COMPAT_CALL(argo_op),
-#endif
- COMPAT_CALL(platform_op),
-#ifdef CONFIG_PV
- COMPAT_CALL(mmuext_op),
-#endif
- HYPERCALL(xenpmu_op),
- COMPAT_CALL(dm_op),
-#ifdef CONFIG_HYPFS
- HYPERCALL(hypfs_op),
+#ifndef NDEBUG
+static unsigned char hypercall_args_64[] = hypercall_args_hvm64;
+static unsigned char hypercall_args_32[] = hypercall_args_hvm32;
#endif
- HYPERCALL(paging_domctl_cont)
-};
-
-#undef HYPERCALL
-#undef HVM_CALL
-#undef COMPAT_CALL
int hvm_hypercall(struct cpu_user_regs *regs)
{
@@ -203,23 +157,6 @@ int hvm_hypercall(struct cpu_user_regs *regs)
return ret;
}
- BUILD_BUG_ON(ARRAY_SIZE(hvm_hypercall_table) >
- ARRAY_SIZE(hypercall_args_table));
-
- if ( eax >= ARRAY_SIZE(hvm_hypercall_table) )
- {
- regs->rax = -ENOSYS;
- return HVM_HCALL_completed;
- }
-
- eax = array_index_nospec(eax, ARRAY_SIZE(hvm_hypercall_table));
-
- if ( !hvm_hypercall_table[eax].native )
- {
- regs->rax = -ENOSYS;
- return HVM_HCALL_completed;
- }
-
/*
* Caching is intended for instruction emulation only. Disable it
* for any accesses by hypercall argument copy-in / copy-out.
@@ -239,33 +176,11 @@ int hvm_hypercall(struct cpu_user_regs *regs)
HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%lu(%lx, %lx, %lx, %lx, %lx)",
eax, rdi, rsi, rdx, r10, r8);
-#ifndef NDEBUG
- /* Deliberately corrupt parameter regs not used by this hypercall. */
- switch ( hypercall_args_table[eax].native )
- {
- case 0: rdi = 0xdeadbeefdeadf00dUL; fallthrough;
- case 1: rsi = 0xdeadbeefdeadf00dUL; fallthrough;
- case 2: rdx = 0xdeadbeefdeadf00dUL; fallthrough;
- case 3: r10 = 0xdeadbeefdeadf00dUL; fallthrough;
- case 4: r8 = 0xdeadbeefdeadf00dUL;
- }
-#endif
-
- regs->rax = hvm_hypercall_table[eax].native(rdi, rsi, rdx, r10, r8);
+ call_handlers_hvm64(eax, regs->rax, rdi, rsi, rdx, r10, r8);
#ifndef NDEBUG
- if ( !curr->hcall_preempted )
- {
- /* Deliberately corrupt parameter regs used by this hypercall. */
- switch ( hypercall_args_table[eax].native )
- {
- case 5: regs->r8 = 0xdeadbeefdeadf00dUL; fallthrough;
- case 4: regs->r10 = 0xdeadbeefdeadf00dUL; fallthrough;
- case 3: regs->rdx = 0xdeadbeefdeadf00dUL; fallthrough;
- case 2: regs->rsi = 0xdeadbeefdeadf00dUL; fallthrough;
- case 1: regs->rdi = 0xdeadbeefdeadf00dUL;
- }
- }
+ if ( !curr->hcall_preempted && regs->rax != -ENOSYS )
+ clobber_regs(regs, hypercall_args_64[eax]);
#endif
}
else
@@ -279,35 +194,13 @@ int hvm_hypercall(struct cpu_user_regs *regs)
HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%lu(%x, %x, %x, %x, %x)", eax,
ebx, ecx, edx, esi, edi);
-#ifndef NDEBUG
- /* Deliberately corrupt parameter regs not used by this hypercall. */
- switch ( hypercall_args_table[eax].compat )
- {
- case 0: ebx = 0xdeadf00d; fallthrough;
- case 1: ecx = 0xdeadf00d; fallthrough;
- case 2: edx = 0xdeadf00d; fallthrough;
- case 3: esi = 0xdeadf00d; fallthrough;
- case 4: edi = 0xdeadf00d;
- }
-#endif
-
curr->hcall_compat = true;
- regs->eax = hvm_hypercall_table[eax].compat(ebx, ecx, edx, esi, edi);
+ call_handlers_hvm32(eax, regs->eax, ebx, ecx, edx, esi, edi);
curr->hcall_compat = false;
#ifndef NDEBUG
- if ( !curr->hcall_preempted )
- {
- /* Deliberately corrupt parameter regs used by this hypercall. */
- switch ( hypercall_args_table[eax].compat )
- {
- case 5: regs->rdi = 0xdeadf00d; fallthrough;
- case 4: regs->rsi = 0xdeadf00d; fallthrough;
- case 3: regs->rdx = 0xdeadf00d; fallthrough;
- case 2: regs->rcx = 0xdeadf00d; fallthrough;
- case 1: regs->rbx = 0xdeadf00d;
- }
- }
+ if ( !curr->hcall_preempted && regs->eax != -ENOSYS )
+ clobber_regs32(regs, hypercall_args_32[eax]);
#endif
}
@@ -327,31 +220,20 @@ int hvm_hypercall(struct cpu_user_regs *regs)
enum mc_disposition hvm_do_multicall_call(struct mc_state *state)
{
struct vcpu *curr = current;
- hypercall_fn_t *func = NULL;
if ( hvm_guest_x86_mode(curr) == 8 )
{
struct multicall_entry *call = &state->call;
- if ( call->op < ARRAY_SIZE(hvm_hypercall_table) )
- func = array_access_nospec(hvm_hypercall_table, call->op).native;
- if ( func )
- call->result = func(call->args[0], call->args[1], call->args[2],
- call->args[3], call->args[4]);
- else
- call->result = -ENOSYS;
+ call_handlers_hvm64(call->op, call->result, call->args[0], call->args[1],
+ call->args[2], call->args[3], call->args[4]);
}
else
{
struct compat_multicall_entry *call = &state->compat_call;
- if ( call->op < ARRAY_SIZE(hvm_hypercall_table) )
- func = array_access_nospec(hvm_hypercall_table, call->op).compat;
- if ( func )
- call->result = func(call->args[0], call->args[1], call->args[2],
- call->args[3], call->args[4]);
- else
- call->result = -ENOSYS;
+ call_handlers_hvm32(call->op, call->result, call->args[0], call->args[1],
+ call->args[2], call->args[3], call->args[4]);
}
return !hvm_get_cpl(curr) ? mc_continue : mc_preempt;
diff --git a/xen/arch/x86/hypercall.c b/xen/arch/x86/hypercall.c
index 07e1a45ef5..6b73cff9b9 100644
--- a/xen/arch/x86/hypercall.c
+++ b/xen/arch/x86/hypercall.c
@@ -22,65 +22,6 @@
#include <xen/hypercall.h>
#include <asm/multicall.h>
-#ifdef CONFIG_COMPAT
-#define ARGS(x, n) \
- [ __HYPERVISOR_ ## x ] = { n, n }
-#define COMP(x, n, c) \
- [ __HYPERVISOR_ ## x ] = { n, c }
-#else
-#define ARGS(x, n) [ __HYPERVISOR_ ## x ] = { n }
-#define COMP(x, n, c) ARGS(x, n)
-#endif
-
-const hypercall_args_t hypercall_args_table[NR_hypercalls] =
-{
- ARGS(set_trap_table, 1),
- ARGS(mmu_update, 4),
- ARGS(set_gdt, 2),
- ARGS(stack_switch, 2),
- COMP(set_callbacks, 3, 4),
- ARGS(fpu_taskswitch, 1),
- ARGS(sched_op_compat, 2),
- ARGS(platform_op, 1),
- ARGS(set_debugreg, 2),
- ARGS(get_debugreg, 1),
- COMP(update_descriptor, 2, 4),
- ARGS(memory_op, 2),
- ARGS(multicall, 2),
- COMP(update_va_mapping, 3, 4),
- COMP(set_timer_op, 1, 2),
- ARGS(event_channel_op_compat, 1),
- ARGS(xen_version, 2),
- ARGS(console_io, 3),
- ARGS(physdev_op_compat, 1),
- ARGS(grant_table_op, 3),
- ARGS(vm_assist, 2),
- COMP(update_va_mapping_otherdomain, 4, 5),
- ARGS(vcpu_op, 3),
- COMP(set_segment_base, 2, 0),
- ARGS(mmuext_op, 4),
- ARGS(xsm_op, 1),
- ARGS(nmi_op, 2),
- ARGS(sched_op, 2),
- ARGS(callback_op, 2),
- ARGS(xenoprof_op, 2),
- ARGS(event_channel_op, 2),
- ARGS(physdev_op, 2),
- ARGS(sysctl, 1),
- ARGS(domctl, 1),
- ARGS(kexec_op, 2),
- ARGS(argo_op, 5),
- ARGS(xenpmu_op, 2),
- ARGS(hvm_op, 2),
- ARGS(dm_op, 3),
- ARGS(hypfs_op, 5),
- ARGS(mca, 1),
- ARGS(paging_domctl_cont, 1),
-};
-
-#undef COMP
-#undef ARGS
-
#define NEXT_ARG(fmt, args) \
({ \
unsigned long __arg; \
diff --git a/xen/arch/x86/pv/hypercall.c b/xen/arch/x86/pv/hypercall.c
index 6c4a32d2a6..9b575e5c0b 100644
--- a/xen/arch/x86/pv/hypercall.c
+++ b/xen/arch/x86/pv/hypercall.c
@@ -27,119 +27,22 @@
#include <asm/multicall.h>
#include <irq_vectors.h>
-typedef struct {
- hypercall_fn_t *native;
-#ifdef CONFIG_PV32
- hypercall_fn_t *compat;
-#endif
-} pv_hypercall_table_t;
-
+#ifndef NDEBUG
+static unsigned char hypercall_args_64[] = hypercall_args_pv64;
#ifdef CONFIG_PV32
-#define HYPERCALL(x) \
- [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x, \
- (hypercall_fn_t *) do_ ## x }
-#define COMPAT_CALL(x) \
- [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x, \
- (hypercall_fn_t *) compat_ ## x }
-#else
-#define HYPERCALL(x) \
- [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x }
-#define COMPAT_CALL(x) HYPERCALL(x)
-#endif
-
-static const pv_hypercall_table_t pv_hypercall_table[] = {
- COMPAT_CALL(set_trap_table),
- HYPERCALL(mmu_update),
- COMPAT_CALL(set_gdt),
- HYPERCALL(stack_switch),
- COMPAT_CALL(set_callbacks),
- HYPERCALL(fpu_taskswitch),
- HYPERCALL(sched_op_compat),
-#ifndef CONFIG_PV_SHIM_EXCLUSIVE
- COMPAT_CALL(platform_op),
-#endif
- HYPERCALL(set_debugreg),
- HYPERCALL(get_debugreg),
- COMPAT_CALL(update_descriptor),
- COMPAT_CALL(memory_op),
- COMPAT_CALL(multicall),
- COMPAT_CALL(update_va_mapping),
- COMPAT_CALL(set_timer_op),
- HYPERCALL(event_channel_op_compat),
- COMPAT_CALL(xen_version),
- HYPERCALL(console_io),
- COMPAT_CALL(physdev_op_compat),
-#ifdef CONFIG_GRANT_TABLE
- COMPAT_CALL(grant_table_op),
-#endif
- HYPERCALL(vm_assist),
- COMPAT_CALL(update_va_mapping_otherdomain),
- COMPAT_CALL(iret),
- COMPAT_CALL(vcpu_op),
- HYPERCALL(set_segment_base),
- COMPAT_CALL(mmuext_op),
- COMPAT_CALL(xsm_op),
- COMPAT_CALL(nmi_op),
- COMPAT_CALL(sched_op),
- COMPAT_CALL(callback_op),
-#ifdef CONFIG_XENOPROF
- COMPAT_CALL(xenoprof_op),
-#endif
- HYPERCALL(event_channel_op),
- COMPAT_CALL(physdev_op),
-#ifndef CONFIG_PV_SHIM_EXCLUSIVE
- HYPERCALL(sysctl),
- HYPERCALL(domctl),
+static unsigned char hypercall_args_32[] = hypercall_args_pv32;
#endif
-#ifdef CONFIG_KEXEC
- COMPAT_CALL(kexec_op),
#endif
-#ifdef CONFIG_ARGO
- COMPAT_CALL(argo_op),
-#endif
- HYPERCALL(xenpmu_op),
-#ifdef CONFIG_HVM
- HYPERCALL(hvm_op),
- COMPAT_CALL(dm_op),
-#endif
-#ifdef CONFIG_HYPFS
- HYPERCALL(hypfs_op),
-#endif
- HYPERCALL(mca),
-#ifndef CONFIG_PV_SHIM_EXCLUSIVE
- HYPERCALL(paging_domctl_cont),
-#endif
-};
-
-#undef COMPAT_CALL
-#undef HYPERCALL
/* Forced inline to cause 'compat' to be evaluated at compile time. */
static void always_inline
_pv_hypercall(struct cpu_user_regs *regs, bool compat)
{
struct vcpu *curr = current;
- unsigned long eax = compat ? regs->eax : regs->rax;
+ unsigned long eax;
ASSERT(guest_kernel_mode(curr, regs));
- BUILD_BUG_ON(ARRAY_SIZE(pv_hypercall_table) >
- ARRAY_SIZE(hypercall_args_table));
-
- if ( eax >= ARRAY_SIZE(pv_hypercall_table) )
- {
- regs->rax = -ENOSYS;
- return;
- }
-
- eax = array_index_nospec(eax, ARRAY_SIZE(pv_hypercall_table));
-
- if ( !pv_hypercall_table[eax].native )
- {
- regs->rax = -ENOSYS;
- return;
- }
-
curr->hcall_preempted = false;
if ( !compat )
@@ -150,17 +53,8 @@ _pv_hypercall(struct cpu_user_regs *regs, bool compat)
unsigned long r10 = regs->r10;
unsigned long r8 = regs->r8;
-#ifndef NDEBUG
- /* Deliberately corrupt parameter regs not used by this hypercall. */
- switch ( hypercall_args_table[eax].native )
- {
- case 0: rdi = 0xdeadbeefdeadf00dUL; fallthrough;
- case 1: rsi = 0xdeadbeefdeadf00dUL; fallthrough;
- case 2: rdx = 0xdeadbeefdeadf00dUL; fallthrough;
- case 3: r10 = 0xdeadbeefdeadf00dUL; fallthrough;
- case 4: r8 = 0xdeadbeefdeadf00dUL;
- }
-#endif
+ eax = regs->rax;
+
if ( unlikely(tb_init_done) )
{
unsigned long args[5] = { rdi, rsi, rdx, r10, r8 };
@@ -168,21 +62,11 @@ _pv_hypercall(struct cpu_user_regs *regs, bool compat)
__trace_hypercall(TRC_PV_HYPERCALL_V2, eax, args);
}
- regs->rax = pv_hypercall_table[eax].native(rdi, rsi, rdx, r10, r8);
+ call_handlers_pv64(eax, regs->rax, rdi, rsi, rdx, r10, r8);
#ifndef NDEBUG
- if ( !curr->hcall_preempted )
- {
- /* Deliberately corrupt parameter regs used by this hypercall. */
- switch ( hypercall_args_table[eax].native )
- {
- case 5: regs->r8 = 0xdeadbeefdeadf00dUL; fallthrough;
- case 4: regs->r10 = 0xdeadbeefdeadf00dUL; fallthrough;
- case 3: regs->rdx = 0xdeadbeefdeadf00dUL; fallthrough;
- case 2: regs->rsi = 0xdeadbeefdeadf00dUL; fallthrough;
- case 1: regs->rdi = 0xdeadbeefdeadf00dUL;
- }
- }
+ if ( !curr->hcall_preempted && regs->rax != -ENOSYS )
+ clobber_regs(regs, hypercall_args_64[eax]);
#endif
}
#ifdef CONFIG_PV32
@@ -194,17 +78,7 @@ _pv_hypercall(struct cpu_user_regs *regs, bool compat)
unsigned int esi = regs->esi;
unsigned int edi = regs->edi;
-#ifndef NDEBUG
- /* Deliberately corrupt parameter regs not used by this hypercall. */
- switch ( hypercall_args_table[eax].compat )
- {
- case 0: ebx = 0xdeadf00d; fallthrough;
- case 1: ecx = 0xdeadf00d; fallthrough;
- case 2: edx = 0xdeadf00d; fallthrough;
- case 3: esi = 0xdeadf00d; fallthrough;
- case 4: edi = 0xdeadf00d;
- }
-#endif
+ eax = regs->eax;
if ( unlikely(tb_init_done) )
{
@@ -214,22 +88,12 @@ _pv_hypercall(struct cpu_user_regs *regs, bool compat)
}
curr->hcall_compat = true;
- regs->eax = pv_hypercall_table[eax].compat(ebx, ecx, edx, esi, edi);
+ call_handlers_pv32(eax, regs->eax, ebx, ecx, edx, esi, edi);
curr->hcall_compat = false;
#ifndef NDEBUG
- if ( !curr->hcall_preempted )
- {
- /* Deliberately corrupt parameter regs used by this hypercall. */
- switch ( hypercall_args_table[eax].compat )
- {
- case 5: regs->edi = 0xdeadf00d; fallthrough;
- case 4: regs->esi = 0xdeadf00d; fallthrough;
- case 3: regs->edx = 0xdeadf00d; fallthrough;
- case 2: regs->ecx = 0xdeadf00d; fallthrough;
- case 1: regs->ebx = 0xdeadf00d;
- }
- }
+ if ( !curr->hcall_preempted && regs->eax != -ENOSYS )
+ clobber_regs32(regs, hypercall_args_32[eax]);
#endif
}
#endif /* CONFIG_PV32 */
@@ -256,13 +120,8 @@ enum mc_disposition pv_do_multicall_call(struct mc_state *state)
struct compat_multicall_entry *call = &state->compat_call;
op = call->op;
- if ( (op < ARRAY_SIZE(pv_hypercall_table)) &&
- pv_hypercall_table[op].compat )
- call->result = pv_hypercall_table[op].compat(
- call->args[0], call->args[1], call->args[2],
- call->args[3], call->args[4]);
- else
- call->result = -ENOSYS;
+ call_handlers_pv32(op, call->result, call->args[0], call->args[1],
+ call->args[2], call->args[3], call->args[4]);
}
else
#endif
@@ -270,13 +129,8 @@ enum mc_disposition pv_do_multicall_call(struct mc_state *state)
struct multicall_entry *call = &state->call;
op = call->op;
- if ( (op < ARRAY_SIZE(pv_hypercall_table)) &&
- pv_hypercall_table[op].native )
- call->result = pv_hypercall_table[op].native(
- call->args[0], call->args[1], call->args[2],
- call->args[3], call->args[4]);
- else
- call->result = -ENOSYS;
+ call_handlers_pv64(op, call->result, call->args[0], call->args[1],
+ call->args[2], call->args[3], call->args[4]);
}
return unlikely(op == __HYPERVISOR_iret)
diff --git a/xen/include/asm-x86/hypercall.h b/xen/include/asm-x86/hypercall.h
index eb2907b5b6..f2db3f3c21 100644
--- a/xen/include/asm-x86/hypercall.h
+++ b/xen/include/asm-x86/hypercall.h
@@ -17,19 +17,6 @@
#define __HYPERVISOR_paging_domctl_cont __HYPERVISOR_arch_1
-typedef unsigned long hypercall_fn_t(
- unsigned long, unsigned long, unsigned long,
- unsigned long, unsigned long);
-
-typedef struct {
- uint8_t native;
-#ifdef CONFIG_COMPAT
- uint8_t compat;
-#endif
-} hypercall_args_t;
-
-extern const hypercall_args_t hypercall_args_table[NR_hypercalls];
-
#ifdef CONFIG_PV
void pv_hypercall(struct cpu_user_regs *regs);
#endif
@@ -55,4 +42,34 @@ compat_common_vcpu_op(
#endif /* CONFIG_COMPAT */
+#ifndef NDEBUG
+static inline void clobber_regs(struct cpu_user_regs *regs,
+ unsigned int nargs)
+{
+ /* Deliberately corrupt used parameter regs. */
+ switch ( nargs )
+ {
+ case 5: regs->r8 = 0xdeadbeefdeadf00dUL; fallthrough;
+ case 4: regs->r10 = 0xdeadbeefdeadf00dUL; fallthrough;
+ case 3: regs->rdx = 0xdeadbeefdeadf00dUL; fallthrough;
+ case 2: regs->rsi = 0xdeadbeefdeadf00dUL; fallthrough;
+ case 1: regs->rdi = 0xdeadbeefdeadf00dUL;
+ }
+}
+
+static inline void clobber_regs32(struct cpu_user_regs *regs,
+ unsigned int nargs)
+{
+ /* Deliberately corrupt used parameter regs. */
+ switch ( nargs )
+ {
+ case 5: regs->edi = 0xdeadf00dUL; fallthrough;
+ case 4: regs->esi = 0xdeadf00dUL; fallthrough;
+ case 3: regs->edx = 0xdeadf00dUL; fallthrough;
+ case 2: regs->ecx = 0xdeadf00dUL; fallthrough;
+ case 1: regs->ebx = 0xdeadf00dUL;
+ }
+}
+#endif
+
#endif /* __ASM_X86_HYPERCALL_H__ */
--
2.26.2
On 15.10.2021 14:51, Juergen Gross wrote:
> Instead of using a function table use the generated switch statement
> macros for calling the appropriate hypercall handlers.
>
> This is beneficial to performance and avoids speculation issues.
>
> With calling the handlers using the correct number of parameters now
> it is possible to do the parameter register clobbering in the NDEBUG
> case after returning from the handler. This in turn removes the only
> users of hypercall_args_table[] which can be removed now.
"removed" reads misleading to me: You really replace it by new tables,
using script-generated initializers. Also it looks like you're doubling
the data, as the same sets were previously used by pv64/hvm64 and
pv32/hvm32 respectively.
> --- a/xen/arch/x86/hvm/hypercall.c
> +++ b/xen/arch/x86/hvm/hypercall.c
> @@ -108,56 +108,10 @@ long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
> return compat_physdev_op(cmd, arg);
> }
>
> -#define HYPERCALL(x) \
> - [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x, \
> - (hypercall_fn_t *) do_ ## x }
> -
> -#define HVM_CALL(x) \
> - [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) hvm_ ## x, \
> - (hypercall_fn_t *) hvm_ ## x }
> -
> -#define COMPAT_CALL(x) \
> - [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x, \
> - (hypercall_fn_t *) compat_ ## x }
> -
> -static const struct {
> - hypercall_fn_t *native, *compat;
> -} hvm_hypercall_table[] = {
> - HVM_CALL(memory_op),
> - COMPAT_CALL(multicall),
> -#ifdef CONFIG_GRANT_TABLE
> - HVM_CALL(grant_table_op),
> -#endif
> - HYPERCALL(vm_assist),
> - COMPAT_CALL(vcpu_op),
> - HVM_CALL(physdev_op),
> - COMPAT_CALL(xen_version),
> - HYPERCALL(console_io),
> - HYPERCALL(event_channel_op),
> - COMPAT_CALL(sched_op),
> - COMPAT_CALL(set_timer_op),
> - COMPAT_CALL(xsm_op),
> - HYPERCALL(hvm_op),
> - HYPERCALL(sysctl),
> - HYPERCALL(domctl),
> -#ifdef CONFIG_ARGO
> - COMPAT_CALL(argo_op),
> -#endif
> - COMPAT_CALL(platform_op),
> -#ifdef CONFIG_PV
> - COMPAT_CALL(mmuext_op),
> -#endif
> - HYPERCALL(xenpmu_op),
> - COMPAT_CALL(dm_op),
> -#ifdef CONFIG_HYPFS
> - HYPERCALL(hypfs_op),
> +#ifndef NDEBUG
> +static unsigned char hypercall_args_64[] = hypercall_args_hvm64;
> +static unsigned char hypercall_args_32[] = hypercall_args_hvm32;
Irrespective of this being debugging-only: Const?
> @@ -239,33 +176,11 @@ int hvm_hypercall(struct cpu_user_regs *regs)
> HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%lu(%lx, %lx, %lx, %lx, %lx)",
> eax, rdi, rsi, rdx, r10, r8);
>
> -#ifndef NDEBUG
> - /* Deliberately corrupt parameter regs not used by this hypercall. */
> - switch ( hypercall_args_table[eax].native )
> - {
> - case 0: rdi = 0xdeadbeefdeadf00dUL; fallthrough;
> - case 1: rsi = 0xdeadbeefdeadf00dUL; fallthrough;
> - case 2: rdx = 0xdeadbeefdeadf00dUL; fallthrough;
> - case 3: r10 = 0xdeadbeefdeadf00dUL; fallthrough;
> - case 4: r8 = 0xdeadbeefdeadf00dUL;
> - }
> -#endif
> -
> - regs->rax = hvm_hypercall_table[eax].native(rdi, rsi, rdx, r10, r8);
> + call_handlers_hvm64(eax, regs->rax, rdi, rsi, rdx, r10, r8);
>
> #ifndef NDEBUG
> - if ( !curr->hcall_preempted )
> - {
> - /* Deliberately corrupt parameter regs used by this hypercall. */
> - switch ( hypercall_args_table[eax].native )
> - {
> - case 5: regs->r8 = 0xdeadbeefdeadf00dUL; fallthrough;
> - case 4: regs->r10 = 0xdeadbeefdeadf00dUL; fallthrough;
> - case 3: regs->rdx = 0xdeadbeefdeadf00dUL; fallthrough;
> - case 2: regs->rsi = 0xdeadbeefdeadf00dUL; fallthrough;
> - case 1: regs->rdi = 0xdeadbeefdeadf00dUL;
> - }
> - }
> + if ( !curr->hcall_preempted && regs->rax != -ENOSYS )
> + clobber_regs(regs, hypercall_args_64[eax]);
I'm not fundamentally opposed, but sadly -ENOSYS comes back also in undue
situations, e.g. various hypercalls still produce this for "unknown
sub-function". Hence the weakened clobbering wants at least mentioning,
perhaps also justifying, in the description.
> @@ -55,4 +42,34 @@ compat_common_vcpu_op(
>
> #endif /* CONFIG_COMPAT */
>
> +#ifndef NDEBUG
Hmm, I was actually hoping for the conditional to live ...
> +static inline void clobber_regs(struct cpu_user_regs *regs,
> + unsigned int nargs)
> +{
... here and ...
> + /* Deliberately corrupt used parameter regs. */
> + switch ( nargs )
> + {
> + case 5: regs->r8 = 0xdeadbeefdeadf00dUL; fallthrough;
> + case 4: regs->r10 = 0xdeadbeefdeadf00dUL; fallthrough;
> + case 3: regs->rdx = 0xdeadbeefdeadf00dUL; fallthrough;
> + case 2: regs->rsi = 0xdeadbeefdeadf00dUL; fallthrough;
> + case 1: regs->rdi = 0xdeadbeefdeadf00dUL;
> + }
> +}
> +
> +static inline void clobber_regs32(struct cpu_user_regs *regs,
> + unsigned int nargs)
> +{
... here, such that the conditionals in the .c files could go away
altogether.
> + /* Deliberately corrupt used parameter regs. */
> + switch ( nargs )
> + {
> + case 5: regs->edi = 0xdeadf00dUL; fallthrough;
> + case 4: regs->esi = 0xdeadf00dUL; fallthrough;
> + case 3: regs->edx = 0xdeadf00dUL; fallthrough;
> + case 2: regs->ecx = 0xdeadf00dUL; fallthrough;
> + case 1: regs->ebx = 0xdeadf00dUL;
No need for the UL suffixes here afaics; U ones may want to be there.
Overall, besides these mainly cosmetic aspects the main thing missing
is an approach to prioritize the handful most frequently used functions,
for them to be pulled out of the switch() so we don't depend on the
compiler's choice for the order of comparisons done.
Jan
On 21.10.21 16:41, Jan Beulich wrote:
> On 15.10.2021 14:51, Juergen Gross wrote:
>> Instead of using a function table use the generated switch statement
>> macros for calling the appropriate hypercall handlers.
>>
>> This is beneficial to performance and avoids speculation issues.
>>
>> With calling the handlers using the correct number of parameters now
>> it is possible to do the parameter register clobbering in the NDEBUG
>> case after returning from the handler. This in turn removes the only
>> users of hypercall_args_table[] which can be removed now.
>
> "removed" reads misleading to me: You really replace it by new tables,
> using script-generated initializers. Also it looks like you're doubling
> the data, as the same sets were previously used by pv64/hvm64 and
> pv32/hvm32 respectively.
Yes, I'll change that paragraph.
Regarding having 4 tables on x86 now: merging the pv/hvm tables would be
possible, but this would add some complexity to the script generating
the tables (it should test whether the number of parameters of pv and
hvm match). As the tables are present in debug build only I don't think
this is a real issue.
>
>> --- a/xen/arch/x86/hvm/hypercall.c
>> +++ b/xen/arch/x86/hvm/hypercall.c
>> @@ -108,56 +108,10 @@ long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>> return compat_physdev_op(cmd, arg);
>> }
>>
>> -#define HYPERCALL(x) \
>> - [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x, \
>> - (hypercall_fn_t *) do_ ## x }
>> -
>> -#define HVM_CALL(x) \
>> - [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) hvm_ ## x, \
>> - (hypercall_fn_t *) hvm_ ## x }
>> -
>> -#define COMPAT_CALL(x) \
>> - [ __HYPERVISOR_ ## x ] = { (hypercall_fn_t *) do_ ## x, \
>> - (hypercall_fn_t *) compat_ ## x }
>> -
>> -static const struct {
>> - hypercall_fn_t *native, *compat;
>> -} hvm_hypercall_table[] = {
>> - HVM_CALL(memory_op),
>> - COMPAT_CALL(multicall),
>> -#ifdef CONFIG_GRANT_TABLE
>> - HVM_CALL(grant_table_op),
>> -#endif
>> - HYPERCALL(vm_assist),
>> - COMPAT_CALL(vcpu_op),
>> - HVM_CALL(physdev_op),
>> - COMPAT_CALL(xen_version),
>> - HYPERCALL(console_io),
>> - HYPERCALL(event_channel_op),
>> - COMPAT_CALL(sched_op),
>> - COMPAT_CALL(set_timer_op),
>> - COMPAT_CALL(xsm_op),
>> - HYPERCALL(hvm_op),
>> - HYPERCALL(sysctl),
>> - HYPERCALL(domctl),
>> -#ifdef CONFIG_ARGO
>> - COMPAT_CALL(argo_op),
>> -#endif
>> - COMPAT_CALL(platform_op),
>> -#ifdef CONFIG_PV
>> - COMPAT_CALL(mmuext_op),
>> -#endif
>> - HYPERCALL(xenpmu_op),
>> - COMPAT_CALL(dm_op),
>> -#ifdef CONFIG_HYPFS
>> - HYPERCALL(hypfs_op),
>> +#ifndef NDEBUG
>> +static unsigned char hypercall_args_64[] = hypercall_args_hvm64;
>> +static unsigned char hypercall_args_32[] = hypercall_args_hvm32;
>
> Irrespective of this being debugging-only: Const?
Yes.
>
>> @@ -239,33 +176,11 @@ int hvm_hypercall(struct cpu_user_regs *regs)
>> HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%lu(%lx, %lx, %lx, %lx, %lx)",
>> eax, rdi, rsi, rdx, r10, r8);
>>
>> -#ifndef NDEBUG
>> - /* Deliberately corrupt parameter regs not used by this hypercall. */
>> - switch ( hypercall_args_table[eax].native )
>> - {
>> - case 0: rdi = 0xdeadbeefdeadf00dUL; fallthrough;
>> - case 1: rsi = 0xdeadbeefdeadf00dUL; fallthrough;
>> - case 2: rdx = 0xdeadbeefdeadf00dUL; fallthrough;
>> - case 3: r10 = 0xdeadbeefdeadf00dUL; fallthrough;
>> - case 4: r8 = 0xdeadbeefdeadf00dUL;
>> - }
>> -#endif
>> -
>> - regs->rax = hvm_hypercall_table[eax].native(rdi, rsi, rdx, r10, r8);
>> + call_handlers_hvm64(eax, regs->rax, rdi, rsi, rdx, r10, r8);
>>
>> #ifndef NDEBUG
>> - if ( !curr->hcall_preempted )
>> - {
>> - /* Deliberately corrupt parameter regs used by this hypercall. */
>> - switch ( hypercall_args_table[eax].native )
>> - {
>> - case 5: regs->r8 = 0xdeadbeefdeadf00dUL; fallthrough;
>> - case 4: regs->r10 = 0xdeadbeefdeadf00dUL; fallthrough;
>> - case 3: regs->rdx = 0xdeadbeefdeadf00dUL; fallthrough;
>> - case 2: regs->rsi = 0xdeadbeefdeadf00dUL; fallthrough;
>> - case 1: regs->rdi = 0xdeadbeefdeadf00dUL;
>> - }
>> - }
>> + if ( !curr->hcall_preempted && regs->rax != -ENOSYS )
>> + clobber_regs(regs, hypercall_args_64[eax]);
>
> I'm not fundamentally opposed, but sadly -ENOSYS comes back also in undue
> situations, e.g. various hypercalls still produce this for "unknown
> sub-function". Hence the weakened clobbering wants at least mentioning,
> perhaps also justifying, in the description.
Okay.
>
>> @@ -55,4 +42,34 @@ compat_common_vcpu_op(
>>
>> #endif /* CONFIG_COMPAT */
>>
>> +#ifndef NDEBUG
>
> Hmm, I was actuall hoping for the conditional to actually live ...
>
>> +static inline void clobber_regs(struct cpu_user_regs *regs,
>> + unsigned int nargs)
>> +{
>
> ... here and ...
>
>> + /* Deliberately corrupt used parameter regs. */
>> + switch ( nargs )
>> + {
>> + case 5: regs->r8 = 0xdeadbeefdeadf00dUL; fallthrough;
>> + case 4: regs->r10 = 0xdeadbeefdeadf00dUL; fallthrough;
>> + case 3: regs->rdx = 0xdeadbeefdeadf00dUL; fallthrough;
>> + case 2: regs->rsi = 0xdeadbeefdeadf00dUL; fallthrough;
>> + case 1: regs->rdi = 0xdeadbeefdeadf00dUL;
>> + }
>> +}
>> +
>> +static inline void clobber_regs32(struct cpu_user_regs *regs,
>> + unsigned int nargs)
>> +{
>
> ... here, such that the conditionals in the .c files could go away
> altogether.
I didn't do that in order to be able to have the tables with the
number of parameters inside #ifndef NDEBUG sections.
I think I can change that by using a macro for reading the table
values.
>
>> + /* Deliberately corrupt used parameter regs. */
>> + switch ( nargs )
>> + {
>> + case 5: regs->edi = 0xdeadf00dUL; fallthrough;
>> + case 4: regs->esi = 0xdeadf00dUL; fallthrough;
>> + case 3: regs->edx = 0xdeadf00dUL; fallthrough;
>> + case 2: regs->ecx = 0xdeadf00dUL; fallthrough;
>> + case 1: regs->ebx = 0xdeadf00dUL;
>
> No need for the UL suffixes here afaics; U ones may want to be there.
Okay.
> Overall, besides these mainly cosmetic aspects the main thing missing
> is an approach to prioritize the handful most frequently used functions,
> for them to be pulled out of the switch() so we don't depend on the
> compiler's choice for the order of comparisons done.
I have already prepared that step by generating the complete call
sequence, so any change for prioritizing some hypercalls can be local to
the generator script and the used input data.
The main question is how to do that. I've collected some hypercall
statistics data for PV and PVH guests running some simple tests (once a
build of the Xen hypervisor, and once a scp of a large file). The data
is split between guest and dom0 (PV) counts. There is no clear "winner"
which hypercall should be fastest, but several hypercalls are clearly
not important.
Here is the data:
PV-hypercall PV-guest build PV-guest scp dom0 build dom0 scp
mmu_update 186175729 2865 20936 33725
stack_switch 1273311 62381 108589 270764
multicall 2182803 50 302 524
update_va_mapping 571868 10 60 80
xen_version 73061 850 859 5432
grant_table_op 0 0 35557 139110
iret 75673006 484132 268157 757958
vcpu_op 453037 71199 138224 334988
set_segment_base 1650249 62387 108645 270823
mmuext_op 11225681 188 7239 3426
sched_op 280153 134645 70729 137943
event_channel_op 192327 66204 71409 214191
physdev_op 0 0 7721 4315
(the dom0 values are for the guest running the build or scp test, so
dom0 acting as backend)
HVM-hypercall PVH-guest build PVH-guest scp
vcpu_op 277684 2324
event_channel_op 350233 57383
(the related dom0 counter values are in the same range as with the test
running in the PV guest)
It should be noted that during boot of the guests the numbers for the PV
guest are more like the ones for the build test with the exception of
iret and sched_op being higher, while for PVH sched_op is by far the
most often used hypercall.
I'm not sure how to translate those numbers into a good algorithm for
generating the call sequence.
I could add priorities to each hypercall in hypercall-defs.c and have a
cascade of if (likely(foo)) call_foo; else if (likely(bla)) ... else
switch(rest).
Or I could have groups of hypercalls with a priority for each group and:
mask = 1ULL << num;
if (likely(mask & prio_1_mask)) switch(num) ...
else if (likely(mask & prio_2_mask)) switch (num) ...
...
else switch (num) ...
Or I could combine those approaches using the mask variant for cases of
multiple entries having the same priority and the direct call variant
for the cases of only a single entry having a specific priority.
And then there is the problem of setting the priorities (fairly simple for
HVM, PV is more difficult).
Juergen
On 28.10.2021 16:32, Juergen Gross wrote: > On 21.10.21 16:41, Jan Beulich wrote: >> On 15.10.2021 14:51, Juergen Gross wrote: >>> Instead of using a function table use the generated switch statement >>> macros for calling the appropriate hypercall handlers. >>> >>> This is beneficial to performance and avoids speculation issues. >>> >>> With calling the handlers using the correct number of parameters now >>> it is possible to do the parameter register clobbering in the NDEBUG >>> case after returning from the handler. This in turn removes the only >>> users of hypercall_args_table[] which can be removed now. >> >> "removed" reads misleading to me: You really replace it by new tables, >> using script-generated initializers. Also it looks like you're doubling >> the data, as the same sets were previously used by pv64/hvm64 and >> pv32/hvm32 respectively. > > Yes, I'll change that paragraph. > > Regarding having 4 tables on x86 now: merging the pv/hvm tables would be > possible, but this would add some complexity to the script generating > the tables (it should test whether the number of parameters of pv and > hvm match). As the tables are present in debug build only I don't think > this is a real issue. Sure, but that imo wants saying in the description. >> Overall, besides these mainly cosmetic aspects the main thing missing >> is an approach to prioritize the handful most frequently used functions, >> for them to be pulled out of the switch() so we don't depend on the >> compiler's choice for the order of comparisons done. > > I have already prepared that step by generating the complete call > sequence, so any change for prioritizing some hypercalls can be local to > the generator script and the used input data. > > The main question is how to do that. I've collected some hypercall > statistics data for PV and PVH guests running some simple tests (once a > build of the Xen hypervisor, and once a scp of a large file). 
The data > is split between guest and dom0 (PV) counts. There is no clear "winner" > which hypercall should be fastest, but several hypercalls are clearly > not important. > > Here is the data: > > PV-hypercall PV-guest build PV-guest scp dom0 build dom0 scp > mmu_update 186175729 2865 20936 33725 Builds should be local to the guest and I/O should involve gnttab ops but no mmu-update. Hence I have a hard time seeing where the huge difference here would be coming from. Did you have any thoughts here? > stack_switch 1273311 62381 108589 270764 > multicall 2182803 50 302 524 A fair amount of the mmu-updates is going to be coming through muticalls, I would guess. Priorities therefore may even differ for the two separate dispatch points. > update_va_mapping 571868 10 60 80 > xen_version 73061 850 859 5432 > grant_table_op 0 0 35557 139110 > iret 75673006 484132 268157 757958 The huge differences for builds is puzzling mere here ... > vcpu_op 453037 71199 138224 334988 > set_segment_base 1650249 62387 108645 270823 > mmuext_op 11225681 188 7239 3426 ... and here as well. Did Dom0 and DomU use identical numbers of vCPU-s and identical -j make option values? > sched_op 280153 134645 70729 137943 > event_channel_op 192327 66204 71409 214191 > physdev_op 0 0 7721 4315 > (the dom0 values are for the guest running the build or scp test, so > dom0 acting as backend) > > HVM-hypercall PVH-guest build PVH-guest scp > vcpu_op 277684 2324 > event_channel_op 350233 57383 > (the related dom0 counter values are in the same range as with the test > running in the PV guest) > > It should be noted that during boot of the guests the numbers for the PV > guest are more like the ones for the build test with the exception of > iret and sched_op being higher, while for PVH sched_op is by far the > most often used hypercall. > > I'm not sure how to translate those numbers into a good algorithm for > generating the call sequence. 
Well, there's never going to be a clear cut fitting everything, I suppose. > I could add priorities to each hypercall in hypercall-defs.c and have a > cascade of if (likely(foo)) call_foo; else if (likely(bla)) ... else > switch(rest). Personally I'd lean to an approach like this one; perhaps there's not even a need to specify priorities for every hypercall, but just the ones we deem most frequently used? Jan > Or I could have groups of hypercalls with a priority for each group and: > > mask = 1ULL << num; > if (likely(mask & prio_1_mask)) switch(num) ... > else if (likely(mask & prio_2_mask)) switch (num) ... > ... > else switch (num) ... > > Or I could combine those approaches using the mask variant for cases of > multiple entries having the same priority and the direct call variant > for the cases of only a single entry having a specific priority. > > And then there is the problem to set the priorities (fairly simple for > HVM, PV is more diffcult). > > > Juergen >
On 02.11.21 10:54, Jan Beulich wrote: > On 28.10.2021 16:32, Juergen Gross wrote: >> On 21.10.21 16:41, Jan Beulich wrote: >>> On 15.10.2021 14:51, Juergen Gross wrote: >>>> Instead of using a function table use the generated switch statement >>>> macros for calling the appropriate hypercall handlers. >>>> >>>> This is beneficial to performance and avoids speculation issues. >>>> >>>> With calling the handlers using the correct number of parameters now >>>> it is possible to do the parameter register clobbering in the NDEBUG >>>> case after returning from the handler. This in turn removes the only >>>> users of hypercall_args_table[] which can be removed now. >>> >>> "removed" reads misleading to me: You really replace it by new tables, >>> using script-generated initializers. Also it looks like you're doubling >>> the data, as the same sets were previously used by pv64/hvm64 and >>> pv32/hvm32 respectively. >> >> Yes, I'll change that paragraph. >> >> Regarding having 4 tables on x86 now: merging the pv/hvm tables would be >> possible, but this would add some complexity to the script generating >> the tables (it should test whether the number of parameters of pv and >> hvm match). As the tables are present in debug build only I don't think >> this is a real issue. > > Sure, but that imo wants saying in the description. > >>> Overall, besides these mainly cosmetic aspects the main thing missing >>> is an approach to prioritize the handful most frequently used functions, >>> for them to be pulled out of the switch() so we don't depend on the >>> compiler's choice for the order of comparisons done. >> >> I have already prepared that step by generating the complete call >> sequence, so any change for prioritizing some hypercalls can be local to >> the generator script and the used input data. >> >> The main question is how to do that. 
I've collected some hypercall >> statistics data for PV and PVH guests running some simple tests (once a >> build of the Xen hypervisor, and once a scp of a large file). The data >> is split between guest and dom0 (PV) counts. There is no clear "winner" >> which hypercall should be fastest, but several hypercalls are clearly >> not important. >> >> Here is the data: >> >> PV-hypercall PV-guest build PV-guest scp dom0 build dom0 scp >> mmu_update 186175729 2865 20936 33725 > > Builds should be local to the guest and I/O should involve gnttab ops > but no mmu-update. Hence I have a hard time seeing where the huge > difference here would be coming from. Did you have any thoughts here? I think you misunderstood the columns. The first column of data is the build job running in domU and the number of hypercalls done by that domU. The 3rd data column is the same test (build running in domU), but the number of hypercalls done by dom0 (so pure backend hypercall activity). The missing gnttab ops on domU side are fine, as granting a page doesn't require a hypercall. > >> stack_switch 1273311 62381 108589 270764 >> multicall 2182803 50 302 524 > > A fair amount of the mmu-updates is going to be coming through > muticalls, I would guess. Priorities therefore may even differ for > the two separate dispatch points. I can look into collecting some data here. > >> update_va_mapping 571868 10 60 80 >> xen_version 73061 850 859 5432 >> grant_table_op 0 0 35557 139110 >> iret 75673006 484132 268157 757958 > > The huge differences for builds is puzzling mere here ... > >> vcpu_op 453037 71199 138224 334988 >> set_segment_base 1650249 62387 108645 270823 >> mmuext_op 11225681 188 7239 3426 > > ... and here as well. Did Dom0 and DomU use identical numbers of > vCPU-s and identical -j make option values? 
> >> sched_op 280153 134645 70729 137943 >> event_channel_op 192327 66204 71409 214191 >> physdev_op 0 0 7721 4315 >> (the dom0 values are for the guest running the build or scp test, so >> dom0 acting as backend) >> >> HVM-hypercall PVH-guest build PVH-guest scp >> vcpu_op 277684 2324 >> event_channel_op 350233 57383 >> (the related dom0 counter values are in the same range as with the test >> running in the PV guest) >> >> It should be noted that during boot of the guests the numbers for the PV >> guest are more like the ones for the build test with the exception of >> iret and sched_op being higher, while for PVH sched_op is by far the >> most often used hypercall. >> >> I'm not sure how to translate those numbers into a good algorithm for >> generating the call sequence. > > Well, there's never going to be a clear cut fitting everything, I > suppose. > >> I could add priorities to each hypercall in hypercall-defs.c and have a >> cascade of if (likely(foo)) call_foo; else if (likely(bla)) ... else >> switch(rest). > > Personally I'd lean to an approach like this one; perhaps there's not > even a need to specify priorities for every hypercall, but just the > ones we deem most frequently used? See my new series. Juergen
© 2016 - 2026 Red Hat, Inc.