[v2] x86/kvm/emulate: Avoid RET for FASTOPs

[PATCH v2 11/12] x86/kvm/emulate: Implement test_cc() in C

Posted by Peter Zijlstra 1 year, 2 months ago

Current test_cc() uses the fastop infrastructure to test flags using
SETcc instructions. However, int3_emulate_jcc() already fully
implements the flags->CC mapping, use that.

Removes a pile of gnarly asm.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/text-patching.h |   20 +++++++++++++-------
 arch/x86/kvm/emulate.c               |   34 ++--------------------------------
 2 files changed, 15 insertions(+), 39 deletions(-)

--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -176,9 +176,9 @@ void int3_emulate_ret(struct pt_regs *re
 }
 
 static __always_inline
-void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+bool __emulate_cc(unsigned long flags, u8 cc)
 {
-	static const unsigned long jcc_mask[6] = {
+	static const unsigned long cc_mask[6] = {
 		[0] = X86_EFLAGS_OF,
 		[1] = X86_EFLAGS_CF,
 		[2] = X86_EFLAGS_ZF,
@@ -191,15 +191,21 @@ void int3_emulate_jcc(struct pt_regs *re
 	bool match;
 
 	if (cc < 0xc) {
-		match = regs->flags & jcc_mask[cc >> 1];
+		match = flags & cc_mask[cc >> 1];
 	} else {
-		match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
-			((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+		match = ((flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+			((flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
 		if (cc >= 0xe)
-			match = match || (regs->flags & X86_EFLAGS_ZF);
+			match = match || (flags & X86_EFLAGS_ZF);
 	}
 
-	if ((match && !invert) || (!match && invert))
+	return (match && !invert) || (!match && invert);
+}
+
+static __always_inline
+void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+{
+	if (__emulate_cc(regs->flags, cc))
 		ip += disp;
 
 	int3_emulate_jmp(regs, ip);
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -26,6 +26,7 @@
 #include <asm/debugreg.h>
 #include <asm/nospec-branch.h>
 #include <asm/ibt.h>
+#include <asm/text-patching.h>
 
 #include "x86.h"
 #include "tss.h"
@@ -416,31 +417,6 @@ static int fastop(struct x86_emulate_ctx
 	ON64(FOP3E(op##q, rax, rdx, cl)) \
 	FOP_END
 
-/* Special case for SETcc - 1 instruction per cc */
-#define FOP_SETCC(op) \
-	FOP_FUNC(op) \
-	#op " %al \n\t" \
-	FOP_RET(op)
-
-FOP_START(setcc)
-FOP_SETCC(seto)
-FOP_SETCC(setno)
-FOP_SETCC(setc)
-FOP_SETCC(setnc)
-FOP_SETCC(setz)
-FOP_SETCC(setnz)
-FOP_SETCC(setbe)
-FOP_SETCC(setnbe)
-FOP_SETCC(sets)
-FOP_SETCC(setns)
-FOP_SETCC(setp)
-FOP_SETCC(setnp)
-FOP_SETCC(setl)
-FOP_SETCC(setnl)
-FOP_SETCC(setle)
-FOP_SETCC(setnle)
-FOP_END;
-
 FOP_START(salc)
 FOP_FUNC(salc)
 "pushf; sbb %al, %al; popf \n\t"
@@ -1064,13 +1040,7 @@ static int em_bsr_c(struct x86_emulate_c
 
 static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
 {
-	u8 rc;
-	void (*fop)(void) = (void *)em_setcc + FASTOP_SIZE * (condition & 0xf);
-
-	flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
-	asm("push %[flags]; popf; " CALL_NOSPEC
-	    : "=a"(rc), ASM_CALL_CONSTRAINT : [thunk_target]"r"(fop), [flags]"r"(flags));
-	return rc;
+	return __emulate_cc(flags, condition & 0xf);
 }
 
 static void fetch_register_operand(struct operand *op)

Re: [PATCH v2 11/12] x86/kvm/emulate: Implement test_cc() in C

Posted by Sean Christopherson 1 year, 2 months ago

Can you use "KVM: x86" for the scope?  "x86/kvm" is used for guest changes, i.e.
for paravirt code when running as a KVM guest.

On Mon, Nov 11, 2024, Peter Zijlstra wrote:
> Current test_cc() uses the fastop infrastructure to test flags using
> SETcc instructions. However, int3_emulate_jcc() already fully
> implements the flags->CC mapping, use that.

I think it's worth presenting this as a revert of sorts, even though it's not a
strict revert.  KVM also emulated jcc-like operations in software prior to commit
9ae9febae950 ("KVM: x86 emulator: covert SETCC to fastop"), i.e. the fastop
madness was introduced for performance reasons, not because writing the code was
hard.

Looking at the output of the fastop versus __emulate_cc(), the worst case cost is
two extra conditional branches.  Assuming that eliminating the test_cc() fastop
code avoids having to add another special case in objtool, I am a-ok with the
tradeoff.  Especially since emulating instructions that use test_cc() is largely
limited to older hardware running guest firmware that uses (emulated) SMM, and
maybe a few bespoke use cases.  E.g. I get literally zero hits on test_cc() when
booting a 24 vCPU VM (64-bit or 32-bit kernel) with EPT and unrestricted guest
disabled, as the OVMF build I use doesn't rely on SMM.

And FWIW, a straight revert appears to generate worse code.  I see no reason to
bring it back.

With a massaged shortlog+changelog,

Acked-by: Sean Christopherson <seanjc@google.com>


fastop:
   0x0000000000042c41 <+1537>:  movzbl 0x61(%rbp),%eax
   0x0000000000042c45 <+1541>:  mov    0x10(%rbp),%rdx
   0x0000000000042c49 <+1545>:  shl    $0x4,%rax
   0x0000000000042c4d <+1549>:  and    $0xf0,%eax
   0x0000000000042c52 <+1554>:  and    $0x8d5,%edx
   0x0000000000042c58 <+1560>:  or     $0x2,%dh
   0x0000000000042c5b <+1563>:  add    $0x0,%rax
   0x0000000000042c61 <+1569>:  push   %rdx
   0x0000000000042c62 <+1570>:  popf   
   0x0000000000042c63 <+1571>:  call   0x42c68 <x86_emulate_insn+1576>
   0x0000000000042c68 <+1576>:  mov    %eax,%edx
   0x0000000000042c6a <+1578>:  xor    %eax,%eax
   0x0000000000042c6c <+1580>:  test   %dl,%dl
   0x0000000000042c6e <+1582>:  jne    0x42f24 <x86_emulate_insn+2276>

__emulate_cc:
   0x0000000000042b95 <+1541>:	movzbl 0x61(%rbp),%eax
   0x0000000000042b99 <+1545>:	mov    0x10(%rbp),%rcx
   0x0000000000042b9d <+1549>:	and    $0xf,%eax
   0x0000000000042ba0 <+1552>:	cmp    $0xb,%al
   0x0000000000042ba2 <+1554>:	ja     0x42e90 <x86_emulate_insn+2304>
        0x0000000000042e90 <+2304>:  mov    %rcx,%rdx
        0x0000000000042e93 <+2307>:  mov    %rcx,%rsi
        0x0000000000042e96 <+2310>:  shr    $0x7,%rdx
        0x0000000000042e9a <+2314>:  shr    $0xb,%rsi
        0x0000000000042e9e <+2318>:  xor    %rsi,%rdx
        0x0000000000042ea1 <+2321>:  cmp    $0xd,%al
        0x0000000000042ea3 <+2323>:  ja     0x4339a <x86_emulate_insn+3594>
                0x000000000004339a <+3594>:  and    $0x1,%edx
                0x000000000004339d <+3597>:  and    $0x40,%ecx
                0x00000000000433a0 <+3600>:  or     %rcx,%rdx
                0x00000000000433a3 <+3603>:  setne  %dl
                0x00000000000433a6 <+3606>:  jmp    0x42bba <x86_emulate_insn+1578>
        0x0000000000042ea9 <+2329>:  and    $0x1,%edx
        0x0000000000042eac <+2332>:  jmp    0x42bba <x86_emulate_insn+1578>
   0x0000000000042ba8 <+1560>:	mov    %eax,%edx
   0x0000000000042baa <+1562>:	shr    %dl
   0x0000000000042bac <+1564>:	and    $0x7,%edx
   0x0000000000042baf <+1567>:	and    0x0(,%rdx,8),%rcx
   0x0000000000042bb7 <+1575>:	setne  %dl
   0x0000000000042bba <+1578>:	test   $0x1,%al
   0x0000000000042bbc <+1580>:	jne    0x43323 <x86_emulate_insn+3475>
        0x0000000000043323 <+3475>:  test   %dl,%dl
        0x0000000000043325 <+3477>:  jne    0x4332f <x86_emulate_insn+3487>
        0x0000000000043327 <+3479>:  test   $0x1,%al
        0x0000000000043329 <+3481>:  jne    0x42bca <x86_emulate_insn+1594>
        0x000000000004332f <+3487>:  xor    %eax,%eax
        0x0000000000043331 <+3489>:  jmp    0x42be0 <x86_emulate_insn+1616>
   0x0000000000042bc2 <+1586>:	test   %dl,%dl
   0x0000000000042bc4 <+1588>:	je     0x43327 <x86_emulate_insn+3479>
        0x0000000000043327 <+3479>:  test   $0x1,%al
        0x0000000000043329 <+3481>:  jne    0x42bca <x86_emulate_insn+1594>
        0x000000000004332f <+3487>:  xor    %eax,%eax
        0x0000000000043331 <+3489>:  jmp    0x42be0 <x86_emulate_insn+1616>
   0x0000000000042bca <+1594>:	movslq 0xd0(%rbp),%rsi
   0x0000000000042bd1 <+1601>:	mov    %rbp,%rdi
   0x0000000000042bd4 <+1604>:	add    0x90(%rbp),%rsi
   0x0000000000042bdb <+1611>:	call   0x3a7c0 <assign_eip>

revert:
   0x0000000000042bc9 <+1545>:  movzbl 0x61(%rbp),%esi
   0x0000000000042bcd <+1549>:  mov    0x10(%rbp),%rdx
   0x0000000000042bd1 <+1553>:  mov    %esi,%eax
   0x0000000000042bd3 <+1555>:  shr    %eax
   0x0000000000042bd5 <+1557>:  and    $0x7,%eax
   0x0000000000042bd8 <+1560>:  cmp    $0x4,%eax
   0x0000000000042bdb <+1563>:  je     0x42ed4 <x86_emulate_insn+2324>
        0x0000000000042ed4 <+2324>:  mov    %edx,%ecx
        0x0000000000042ed6 <+2326>:  and    $0x80,%ecx
        0x0000000000042edc <+2332>:  jmp    0x42bff <x86_emulate_insn+1599>
   0x0000000000042be1 <+1569>:  ja     0x43225 <x86_emulate_insn+3173>
        0x0000000000043225 <+3173>:  cmp    $0x6,%eax
        0x0000000000043228 <+3176>:  je     0x434ec <x86_emulate_insn+3884>
                0x00000000000434ec <+3884>:  xor    %edi,%edi
                0x00000000000434ee <+3886>:  jmp    0x43238 <x86_emulate_insn+3192>
        0x000000000004322e <+3182>:  mov    %edx,%edi
        0x0000000000043230 <+3184>:  and    $0x40,%edi
        0x0000000000043233 <+3187>:  cmp    $0x7,%eax
        0x0000000000043236 <+3190>:  jne    0x4325c <x86_emulate_insn+3228>
                0x000000000004325c <+3228>:  mov    %edx,%ecx
                0x000000000004325e <+3230>:  and    $0x4,%ecx
                0x0000000000043261 <+3233>:  cmp    $0x5,%eax
                0x0000000000043264 <+3236>:  je     0x42bff <x86_emulate_insn+1599>
                0x000000000004326a <+3242>:  jmp    0x43218 <x86_emulate_insn+3160>
        0x0000000000043238 <+3192>:  mov    %rdx,%rcx
        0x000000000004323b <+3195>:  shr    $0xb,%rdx
        0x000000000004323f <+3199>:  shr    $0x7,%rcx
        0x0000000000043243 <+3203>:  xor    %edx,%ecx
        0x0000000000043245 <+3205>:  and    $0x1,%ecx
        0x0000000000043248 <+3208>:  or     %edi,%ecx
        0x000000000004324a <+3210>:  jmp    0x42bff <x86_emulate_insn+1599>
   0x0000000000042be7 <+1575>:  mov    %edx,%ecx
   0x0000000000042be9 <+1577>:  and    $0x40,%ecx
   0x0000000000042bec <+1580>:  cmp    $0x2,%eax
   0x0000000000042bef <+1583>:  je     0x42bff <x86_emulate_insn+1599>
   0x0000000000042bf1 <+1585>:  mov    %edx,%ecx
   0x0000000000042bf3 <+1587>:  and    $0x41,%ecx
   0x0000000000042bf6 <+1590>:  cmp    $0x3,%eax
   0x0000000000042bf9 <+1593>:  jne    0x4320a <x86_emulate_insn+3146>
        0x000000000004320a <+3146>:  mov    %edx,%ecx
        0x000000000004320c <+3148>:  and    $0x1,%ecx
        0x000000000004320f <+3151>:  cmp    $0x1,%eax
        0x0000000000043212 <+3154>:  je     0x42bff <x86_emulate_insn+1599>
        0x0000000000043218 <+3160>:  mov    %edx,%ecx
        0x000000000004321a <+3162>:  and    $0x800,%ecx
        0x0000000000043220 <+3168>:  jmp    0x42bff <x86_emulate_insn+1599>
   0x0000000000042bff <+1599>:  test   %ecx,%ecx
   0x0000000000042c01 <+1601>:  setne  %dl
   0x0000000000042c04 <+1604>:  and    $0x1,%esi
   0x0000000000042c07 <+1607>:  xor    %eax,%eax
   0x0000000000042c09 <+1609>:  cmp    %sil,%dl
   0x0000000000042c0c <+1612>:  je     0x42c24 <x86_emulate_insn+1636>
   0x0000000000042c0e <+1614>:  movslq 0xd0(%rbp),%rsi
   0x0000000000042c15 <+1621>:  mov    %rbp,%rdi
   0x0000000000042c18 <+1624>:  add    0x90(%rbp),%rsi
   0x0000000000042c1f <+1631>:  call   0x3a7c0 <assign_eip>