xen/arch/x86/pv/hypercall.c | 24 +++++++++++++++++++----- xen/arch/x86/pv/traps.c | 11 ----------- 2 files changed, 19 insertions(+), 16 deletions(-)
The is_pv_32bit_vcpu() conditionals hide four lfences, with two taken on any
individual path through the function. There is very little code common
between compat and native, and context-dependent conditionals predict very
badly for a period of time after context switch.
Move do_entry_int82() from pv/traps.c into pv/hypercall.c, allowing
_pv_hypercall() to be static and forced inline. The delta is:
add/remove: 0/0 grow/shrink: 1/1 up/down: 300/-282 (18)
Function old new delta
do_entry_int82 50 350 +300
pv_hypercall 579 297 -282
which is tiny, but the perf implications are large:
Guest | Naples | Milan | SKX | CFL-R |
------+--------+--------+--------+--------+
pv64 | 17.4% | 15.5% | 2.6% | 4.5% |
pv32 | 1.9% | 10.9% | 1.4% | 2.5% |
These are percentage improvements in raw TSC detlas for a xen_version
hypercall, with obvious outliers excluded. Therefore, it is an idealised best
case improvement.
The pv64 path uses `syscall`, while the pv32 path uses `int $0x82` so
necessarily has higher overhead. Therefore, dropping the lfences is less over
an overall improvement.
I don't know why the Naples pv32 improvement is so small, but I've double
checked the numbers and they're correct. There's something we're doing which
is a large overhead in the pipeline.
On the Intel side, both systems are writing to MSR_SPEC_CTRL on
entry/exit (SKX using the retrofitted microcode implementation, CFL-R using
the hardware implementation), while SKX is suffering further from XPTI for
Meltdown protection.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
---
xen/arch/x86/pv/hypercall.c | 24 +++++++++++++++++++-----
xen/arch/x86/pv/traps.c | 11 -----------
2 files changed, 19 insertions(+), 16 deletions(-)
diff --git a/xen/arch/x86/pv/hypercall.c b/xen/arch/x86/pv/hypercall.c
index 9765e674cf60..3579ba905c1c 100644
--- a/xen/arch/x86/pv/hypercall.c
+++ b/xen/arch/x86/pv/hypercall.c
@@ -23,6 +23,7 @@
#include <xen/hypercall.h>
#include <xen/nospec.h>
#include <xen/trace.h>
+#include <asm/apic.h>
#include <asm/multicall.h>
#include <irq_vectors.h>
@@ -109,15 +110,15 @@ const pv_hypercall_table_t pv_hypercall_table[] = {
#undef COMPAT_CALL
#undef HYPERCALL
-void pv_hypercall(struct cpu_user_regs *regs)
+/* Forced inline to cause 'compat' to be evaluated at compile time. */
+static void always_inline
+_pv_hypercall(struct cpu_user_regs *regs, bool compat)
{
struct vcpu *curr = current;
- unsigned long eax;
+ unsigned long eax = compat ? regs->eax : regs->rax;
ASSERT(guest_kernel_mode(curr, regs));
- eax = is_pv_32bit_vcpu(curr) ? regs->eax : regs->rax;
-
BUILD_BUG_ON(ARRAY_SIZE(pv_hypercall_table) >
ARRAY_SIZE(hypercall_args_table));
@@ -137,7 +138,7 @@ void pv_hypercall(struct cpu_user_regs *regs)
curr->hcall_preempted = false;
- if ( !is_pv_32bit_vcpu(curr) )
+ if ( !compat )
{
unsigned long rdi = regs->rdi;
unsigned long rsi = regs->rsi;
@@ -348,8 +349,21 @@ void pv_ring1_init_hypercall_page(void *p)
*(u8 *)(p+ 7) = 0xc3; /* ret */
}
}
+
+void do_entry_int82(struct cpu_user_regs *regs)
+{
+ if ( unlikely(untrusted_msi) )
+ check_for_unexpected_msi((uint8_t)regs->entry_vector);
+
+ _pv_hypercall(regs, true /* compat */);
+}
#endif
+void pv_hypercall(struct cpu_user_regs *regs)
+{
+ _pv_hypercall(regs, false /* native */);
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
index 764773c02104..1e05a9f1cdad 100644
--- a/xen/arch/x86/pv/traps.c
+++ b/xen/arch/x86/pv/traps.c
@@ -24,22 +24,11 @@
#include <xen/lib.h>
#include <xen/softirq.h>
-#include <asm/apic.h>
#include <asm/pv/trace.h>
#include <asm/shared.h>
#include <asm/traps.h>
#include <irq_vectors.h>
-#ifdef CONFIG_PV32
-void do_entry_int82(struct cpu_user_regs *regs)
-{
- if ( unlikely(untrusted_msi) )
- check_for_unexpected_msi((uint8_t)regs->entry_vector);
-
- pv_hypercall(regs);
-}
-#endif
-
void pv_inject_event(const struct x86_event *event)
{
struct vcpu *curr = current;
--
2.11.0
On 11.10.2021 20:05, Andrew Cooper wrote: > The is_pv_32bit_vcpu() conditionals hide four lfences, with two taken on any > individual path through the function. There is very little code common > between compat and native, and context-dependent conditionals predict very > badly for a period of time after context switch. > > Move do_entry_int82() from pv/traps.c into pv/hypercall.c, allowing > _pv_hypercall() to be static and forced inline. The delta is: > > add/remove: 0/0 grow/shrink: 1/1 up/down: 300/-282 (18) > Function old new delta > do_entry_int82 50 350 +300 > pv_hypercall 579 297 -282 > > which is tiny, but the perf implications are large: > > Guest | Naples | Milan | SKX | CFL-R | > ------+--------+--------+--------+--------+ > pv64 | 17.4% | 15.5% | 2.6% | 4.5% | > pv32 | 1.9% | 10.9% | 1.4% | 2.5% | > > These are percentage improvements in raw TSC detlas for a xen_version > hypercall, with obvious outliers excluded. Therefore, it is an idealised best > case improvement. > > The pv64 path uses `syscall`, while the pv32 path uses `int $0x82` so > necessarily has higher overhead. Therefore, dropping the lfences is less over > an overall improvement. > > I don't know why the Naples pv32 improvement is so small, but I've double > checked the numbers and they're correct. There's something we're doing which > is a large overhead in the pipeline. > > On the Intel side, both systems are writing to MSR_SPEC_CTRL on > entry/exit (SKX using the retrofitted microcode implementation, CFL-R using > the hardware implementation), while SKX is suffering further from XPTI for > Meltdown protection. > > Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> Reviewed-by: Jan Beulich <jbeulich@suse.com>
© 2016 - 2024 Red Hat, Inc.