From: Ammar Faizi <ammarfaizi2@gnuweeb.org>
The current selftest asserts (%r11 == %rflags) after the 'syscall'
returns to user. Such an assertion doesn't apply to the FRED system
because in that system the 'syscall' instruction does not set
%r11=%rflags and %rcx=%rip.
Handle the FRED case. Now, test that:
- "syscall" in a FRED system doesn't clobber %rcx and %r11.
- "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
The 'raise()' function from libc can't be used to control those
registers. Therefore, create a syscall wrapper in inline Assembly to
fully control them.
Fixes: 660602140103 ("selftests/x86: Add a selftest for SYSRET to noncanonical addresses")
Link: https://lore.kernel.org/lkml/25b96960-a07e-a952-5c23-786b55054126@zytor.com
Reported-by: Xin Li <xin3.li@intel.com>
Co-developed-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org>
---
tools/testing/selftests/x86/sysret_rip.c | 126 +++++++++++++++++++++--
1 file changed, 119 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c
index 84d74be1d90207ab..d3dc5ec496854179 100644
--- a/tools/testing/selftests/x86/sysret_rip.c
+++ b/tools/testing/selftests/x86/sysret_rip.c
@@ -39,6 +39,112 @@ asm (
extern const char test_page[];
static void const *current_test_page_addr = test_page;
+/* Arbitrary values */
+static const unsigned long r11_sentinel = 0xfeedfacedeadbeef;
+static const unsigned long rcx_sentinel = 0x5ca1ab1e0b57ac1e;
+
+/* An arbitrary *valid* RFLAGS value */
+static const unsigned long rflags_sentinel = 0x200a93;
+
+enum regs_ok {
+ REGS_UNDEFINED = -2, /* For consistency checker init, never returned */
+ REGS_ERROR = -1, /* Invalid register contents */
+ REGS_SAVED = 0, /* Registers properly preserved */
+ REGS_SYSRET = 1 /* Registers match syscall/sysret */
+};
+
+/*
+ * Returns:
+ * 0 = %rcx and %r11 preserved.
+ * 1 = %rcx and %r11 set to %rflags and %rip.
+ * -1 = %rcx and/or %r11 set to any other values.
+ *
+ * @rbx should be set to the syscall return %rip.
+ */
+static enum regs_ok check_regs_result(unsigned long r11, unsigned long rcx,
+ unsigned long rbx)
+{
+ if (r11 == r11_sentinel && rcx == rcx_sentinel)
+ return REGS_SAVED;
+
+ if (r11 == rflags_sentinel && rcx == rbx)
+ return REGS_SYSRET;
+
+ printf("[FAIL] check_regs_result\n");
+ printf(" r11_sentinel = %#lx; %%r11 = %#lx;\n", r11_sentinel, r11);
+ printf(" rcx_sentinel = %#lx; %%rcx = %#lx;\n", rcx_sentinel, rcx);
+ printf(" rflags_sentinel = %#lx\n", rflags_sentinel);
+ return REGS_ERROR;
+}
+
+static long do_syscall(long nr_syscall, unsigned long arg1, unsigned long arg2,
+ unsigned long arg3, unsigned long arg4,
+ unsigned long arg5, unsigned long arg6)
+{
+ static enum regs_ok regs_ok_state = REGS_UNDEFINED;
+ register unsigned long r11 asm("%r11");
+ register unsigned long r10 asm("%r10");
+ register unsigned long r8 asm("%r8");
+ register unsigned long r9 asm("%r9");
+ register void *rsp asm("%rsp");
+ unsigned long rcx, rbx;
+ enum regs_ok ret;
+
+ r11 = r11_sentinel;
+ rcx = rcx_sentinel;
+ r10 = arg4;
+ r8 = arg5;
+ r9 = arg6;
+
+ asm volatile (
+ "pushq %[rflags_sentinel]\n\t"
+ "popf\n\t"
+ "leaq 1f(%%rip), %[rbx]\n\t"
+ "syscall\n"
+ "1:"
+
+ : "+a" (nr_syscall),
+ "+r" (r11),
+ "+c" (rcx),
+ [rbx] "=b" (rbx),
+ "+r" (rsp) /* Clobber the redzone */
+
+ : [rflags_sentinel] "g" (rflags_sentinel),
+ "D" (arg1), /* %rdi */
+ "S" (arg2), /* %rsi */
+ "d" (arg3), /* %rdx */
+ "r" (r10),
+ "r" (r8),
+ "r" (r9)
+
+ : "memory"
+ );
+
+ /*
+ * Test that:
+ *
+ * - "syscall" in a FRED system doesn't clobber %rcx and %r11.
+ * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags.
+ */
+ ret = check_regs_result(r11, rcx, rbx);
+ assert(ret != REGS_ERROR);
+
+ /*
+ * Test that we don't get a mix of REGS_SAVED and REGS_SYSRET.
+ * It needs at least calling do_syscall() twice to assert.
+ */
+ if (regs_ok_state == REGS_UNDEFINED) {
+ /*
+ * First time calling do_syscall().
+ */
+ regs_ok_state = ret;
+ } else {
+ assert(regs_ok_state == ret);
+ }
+
+ return nr_syscall;
+}
+
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
int flags)
{
@@ -85,27 +191,33 @@ static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void)
static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
{
ucontext_t *ctx = (ucontext_t*)ctx_void;
+ enum regs_ok ret;
memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+ ret = check_regs_result(ctx->uc_mcontext.gregs[REG_R11],
+ ctx->uc_mcontext.gregs[REG_RCX],
+ ctx->uc_mcontext.gregs[REG_RBX]);
+
+ assert(ret != REGS_ERROR);
+
/* Set IP and CX to match so that SYSRET can happen. */
ctx->uc_mcontext.gregs[REG_RIP] = rip;
ctx->uc_mcontext.gregs[REG_RCX] = rip;
-
- /* R11 and EFLAGS should already match. */
- assert(ctx->uc_mcontext.gregs[REG_EFL] ==
- ctx->uc_mcontext.gregs[REG_R11]);
-
sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND);
- return;
+}
+
+static void __raise(int sig)
+{
+ do_syscall(__NR_kill, getpid(), sig, 0, 0, 0, 0);
}
static void test_sigreturn_to(unsigned long ip)
{
rip = ip;
printf("[RUN]\tsigreturn to 0x%lx\n", ip);
- raise(SIGUSR1);
+ __raise(SIGUSR1);
}
static jmp_buf jmpbuf;
--
Ammar Faizi
On 1/24/23 19:49, Ammar Faizi wrote: > + > + /* > + * Test that: > + * > + * - "syscall" in a FRED system doesn't clobber %rcx and %r11. > + * - "syscall" in a non-FRED system sets %rcx=%rip and %r11=%rflags. > + */ > + ret = check_regs_result(r11, rcx, rbx); > + assert(ret != REGS_ERROR); > + > + /* > + * Test that we don't get a mix of REGS_SAVED and REGS_SYSRET. > + * It needs at least calling do_syscall() twice to assert. > + */ > + if (regs_ok_state == REGS_UNDEFINED) { > + /* > + * First time calling do_syscall(). > + */ > + regs_ok_state = ret; > + } else { > + assert(regs_ok_state == ret); > + } > + [...] > + ret = check_regs_result(ctx->uc_mcontext.gregs[REG_R11], > + ctx->uc_mcontext.gregs[REG_RCX], > + ctx->uc_mcontext.gregs[REG_RBX]); > + > + assert(ret != REGS_ERROR); > + This instance, too, needs to be checked against regs_ok_result. It would make most sense to move that handling, and the assert() into check_regs_result() or into a separate function around it. > /* Set IP and CX to match so that SYSRET can happen. */ > ctx->uc_mcontext.gregs[REG_RIP] = rip; > ctx->uc_mcontext.gregs[REG_RCX] = rip; It would be interesting to have the syscall handler try both with and without this (so it would end up doing both IRET and SYSCALL on legacy.) Perhaps SIGUSR1 versus SIGUSR2... -hpa
On Wed, Jan 25, 2023 at 12:39:26AM -0800, H. Peter Anvin wrote: > > /* Set IP and CX to match so that SYSRET can happen. */ > > ctx->uc_mcontext.gregs[REG_RIP] = rip; > > ctx->uc_mcontext.gregs[REG_RCX] = rip; > > It would be interesting to have the syscall handler try both with and > without this (so it would end up doing both IRET and SYSCALL on legacy.) > Perhaps SIGUSR1 versus SIGUSR2... Just to clarify this more so I am sure I understand it correctly. Did you mean to have the same signal handler without modifiying 'REG_RCX' but still change 'REG_RIP'? IOW, we want to only *remove*: ctx->uc_mcontext.gregs[REG_RCX] = rip; and *keep*: ctx->uc_mcontext.gregs[REG_RIP] = rip; for the SIGUSR2 handler. Thus, inside the entry64 we will jump to the iret path because %rcx != %r11 upon rt_sigreturn()? -- Ammar Faizi
On January 25, 2023 1:57:15 AM PST, Ammar Faizi <ammarfaizi2@gnuweeb.org> wrote: >On Wed, Jan 25, 2023 at 12:39:26AM -0800, H. Peter Anvin wrote: >> > /* Set IP and CX to match so that SYSRET can happen. */ >> > ctx->uc_mcontext.gregs[REG_RIP] = rip; >> > ctx->uc_mcontext.gregs[REG_RCX] = rip; >> >> It would be interesting to have the syscall handler try both with and >> without this (so it would end up doing both IRET and SYSCALL on legacy.) >> Perhaps SIGUSR1 versus SIGUSR2... > >Just to clarify this more so I am sure I understand it correctly. > >Did you mean to have the same signal handler without modifiying >'REG_RCX' but still change 'REG_RIP'? > >IOW, we want to only *remove*: > > ctx->uc_mcontext.gregs[REG_RCX] = rip; > >and *keep*: > > ctx->uc_mcontext.gregs[REG_RIP] = rip; > >for the SIGUSR2 handler. Thus, inside the entry64 we will jump to the >iret path because %rcx != %r11 upon rt_sigreturn()? > I guess it would depend on what they "normally" are. My #1 impulse would be to leave them both unchanged.
On Wed, Jan 25, 2023 at 02:17:41AM -0800, H. Peter Anvin wrote: > I guess it would depend on what they "normally" are. My #1 impulse would be to leave them both unchanged. Ah okay... I think I understand now. My confusion came from a comment in that code. The current SIGUSR1 handler has a comment: /* Set IP and CX to match so that SYSRET can happen. */ ctx->uc_mcontext.gregs[REG_RIP] = rip; ctx->uc_mcontext.gregs[REG_RCX] = rip; So I thought if we leave them both unchanged, then SYSRET can happen too, because IP and CX match. My initial confusion about that was: Where do we actually exercise IRET if the SIGUSR2 handler exercises SYSRET then? I realized my assumption was wrong. The current SIGUSR1 handler actually forces the kernel to use IRET, not SYSRET. Because the %rip is set to a non-canonical address. So that's the place where it exercises IRET. IOW, my understanding now: The current SIGUSR1 handler exercises the SYSRET-appropriate condition detector in the kernel. It doesn't actually go to the SYSRET path despite the comment saying "SYSRET can happen". That detector must take us to the IRET path or we will #GP in kernel space on Intel CPUs. In short, the SIGUSR1 handler asserts that "SYSRET must *not* happen". The expected SIGUSR2 handler addition exercises the SYSRET path by leaving REG_IP and REG_CX unchanged. Am I correct? -- Ammar Faizi
On January 25, 2023 3:37:23 AM PST, Ammar Faizi <ammarfaizi2@gnuweeb.org> wrote: >On Wed, Jan 25, 2023 at 02:17:41AM -0800, H. Peter Anvin wrote: >> I guess it would depend on what they "normally" are. My #1 impulse would be to leave them both unchanged. > >Ah okay... I think I understand now. My confusion came from a comment >in that code. > >The current SIGUSR1 handler has a comment: > > /* Set IP and CX to match so that SYSRET can happen. */ > ctx->uc_mcontext.gregs[REG_RIP] = rip; > ctx->uc_mcontext.gregs[REG_RCX] = rip; > >So I thought if we leave them both unchanged, then SYSRET can happen >too, because IP and CX match. My initial confusion about that was: > > Where do we actually exercise IRET if the SIGUSR2 handler > exercises SYSRET then? > >I realized my assumption was wrong. The current SIGUSR1 handler >actually forces the kernel to use IRET, not SYSRET. Because the %rip >is set to a non-canonical address. So that's the place where it >exercises IRET. > >IOW, my understanding now: > >The current SIGUSR1 handler exercises the SYSRET-appropriate condition >detector in the kernel. It doesn't actually go to the SYSRET path >despite the comment saying "SYSRET can happen". That detector must take >us to the IRET path or we will #GP in kernel space on Intel CPUs. > >In short, the SIGUSR1 handler asserts that "SYSRET must *not* happen". > >The expected SIGUSR2 handler addition exercises the SYSRET path by >leaving REG_IP and REG_CX unchanged. > >Am I correct? > That's the idea.
On 1/25/23 4:57 PM, Ammar Faizi wrote: > On Wed, Jan 25, 2023 at 12:39:26AM -0800, H. Peter Anvin wrote: >>> /* Set IP and CX to match so that SYSRET can happen. */ >>> ctx->uc_mcontext.gregs[REG_RIP] = rip; >>> ctx->uc_mcontext.gregs[REG_RCX] = rip; >> >> It would be interesting to have the syscall handler try both with and >> without this (so it would end up doing both IRET and SYSCALL on legacy.) >> Perhaps SIGUSR1 versus SIGUSR2... > > Just to clarify this more so I am sure I understand it correctly. > > Did you mean to have the same signal handler without modifiying > 'REG_RCX' but still change 'REG_RIP'? > > IOW, we want to only *remove*: > > ctx->uc_mcontext.gregs[REG_RCX] = rip; > > and *keep*: > > ctx->uc_mcontext.gregs[REG_RIP] = rip; > > for the SIGUSR2 handler. Thus, inside the entry64 we will jump to the > iret path because %rcx != %r11 upon rt_sigreturn()? s/%rcx != %r11/%rcx != %rip/ -- Ammar Faizi
On Wed, Jan 25, 2023 at 12:39:26AM -0800, H. Peter Anvin wrote: > > + ret = check_regs_result(ctx->uc_mcontext.gregs[REG_R11], > > + ctx->uc_mcontext.gregs[REG_RCX], > > + ctx->uc_mcontext.gregs[REG_RBX]); > > + > > + assert(ret != REGS_ERROR); > > + > > This instance, too, needs to be checked against regs_ok_result. It would > make most sense to move that handling, and the assert() into > check_regs_result() or into a separate function around it. OK. Sounds better. > > /* Set IP and CX to match so that SYSRET can happen. */ > > ctx->uc_mcontext.gregs[REG_RIP] = rip; > > ctx->uc_mcontext.gregs[REG_RCX] = rip; > > It would be interesting to have the syscall handler try both with and > without this (so it would end up doing both IRET and SYSCALL on legacy.) > Perhaps SIGUSR1 versus SIGUSR2... We will have a new separate patch for that. -- Ammar Faizi
© 2016 - 2025 Red Hat, Inc.