tools/testing/selftests/x86/Makefile | 3 tools/testing/selftests/x86/nx_stack.c | 167 +++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+)
Here is how it works:
* fault and fill the stack from rsp with int3 down until rlimit allows,
* fill upwards with int3 too, overwrite libc stuff, argv, envp,
* try to exec int3 on each page and catch it with either SIGSEGV or
SIGTRAP handler.
Note: trying to execute _every_ int3 takes 30-40 seconds even on fast
machine, so only 1 int3 per page is tried.
Tested on F37 kernel and on custom kernel which did
vm_flags |= VM_EXEC;
to stack VMA.
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
tools/testing/selftests/x86/Makefile | 3
tools/testing/selftests/x86/nx_stack.c | 167 +++++++++++++++++++++++++++++++++
2 files changed, 170 insertions(+)
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -19,6 +19,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
vdso_restorer
TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \
corrupt_xstate_header amx lam test_shadow_stack
+TARGETS_C_64BIT_ONLY += nx_stack
# Some selftests require 32bit support enabled also on 64bit systems
TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
@@ -109,3 +110,5 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S
# state.
$(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
$(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
+
+$(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/x86/nx_stack.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2023 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that userspace stack is NX.
+ * Requires linking with -Wl,-z,noexecstack .
+ *
+ * Fill the stack with int3's and then try to execute some of them:
+ * SIGSEGV -- good, SIGTRAP -- bad.
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+#undef NDEBUG
+#include <assert.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+#define PAGE_SIZE 4096
+
+/*
+ * This is memset(rsp, 0xcc, -1); but down.
+ * It will SIGSEGV when bottom of the stack is reached.
+ * Byte-size access is important! (see rdi tweaking in the signal handler).
+ */
+void make_stack1(void);
+asm(
+".pushsection .text\n"
+".globl make_stack1\n"
+".align 16\n"
+"make_stack1:\n"
+ "mov %rsp, %rdi\n"
+ "mov $0xcc, %eax\n"
+ "mov $-1, %rcx\n"
+ "std\n"
+ "rep stosb\n"
+ "hlt\n"
+".type make_stack1,@function\n"
+".size make_stack1,.-make_stack1\n"
+".popsection\n"
+);
+
+/*
+ * memset(p, 0xcc, -1);
+ * It will SIGSEGV when top of the stack is reached.
+ * Byte-size access is important! (see rdi tweaking in the signal handler).
+ */
+void make_stack2(uint64_t p);
+asm(
+".pushsection .text\n"
+".globl make_stack2\n"
+".align 16\n"
+"make_stack2:\n"
+ "mov $0xcc, %eax\n"
+ "mov $-1, %rcx\n"
+ "cld\n"
+ "rep stosb\n"
+ "hlt\n"
+".type make_stack2,@function\n"
+".size make_stack2,.-make_stack2\n"
+".popsection\n"
+);
+
+static volatile int test_state = 0;
+static volatile uint64_t stack_min_addr;
+
+static void sigsegv(int _, siginfo_t *__, void *uc_)
+{
+ ucontext_t *uc = uc_;
+
+ if (test_state == 0) {
+ /* Stack is faulted and cleared from rsp to the lowest address. */
+ stack_min_addr = ++uc->uc_mcontext.gregs[REG_RDI];
+ if (1) {
+ printf("stack min %016lx\n", stack_min_addr);
+ }
+ uc->uc_mcontext.gregs[REG_RIP] = (uintptr_t)&make_stack2;
+ test_state = 1;
+ } else if (test_state == 1) {
+ /* Stack has been cleared from top to bottom. */
+ uint64_t stack_max_addr = uc->uc_mcontext.gregs[REG_RDI];
+ if (1) {
+ printf("stack max %016lx\n", stack_max_addr);
+ }
+ uc->uc_mcontext.gregs[REG_RIP] = stack_max_addr - PAGE_SIZE;
+ test_state = 2;
+ } else if (test_state == 2) {
+ /* SIGSEGV while trying to execute int3 on stack -- PASS. */
+ uc->uc_mcontext.gregs[REG_RIP] -= PAGE_SIZE;
+ if (uc->uc_mcontext.gregs[REG_RIP] == stack_min_addr) {
+ /* One more SIGSEGV and test ends. */
+ test_state = 3;
+ }
+ } else {
+ _exit(EXIT_SUCCESS);
+ }
+}
+
+static void sigtrap(int _, siginfo_t *__, void *uc_)
+{
+ /* SIGTRAP while trying to execute int3 on stack -- FAIL. */
+ _exit(EXIT_FAILURE);
+}
+
+int main(void)
+{
+ {
+ struct sigaction act = {};
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_sigaction = &sigsegv;
+ int rv = sigaction(SIGSEGV, &act, NULL);
+ assert(rv == 0);
+ }
+ {
+ struct sigaction act = {};
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_sigaction = &sigtrap;
+ int rv = sigaction(SIGTRAP, &act, NULL);
+ assert(rv == 0);
+ }
+ {
+ struct rlimit rlim;
+ int rv = getrlimit(RLIMIT_STACK, &rlim);
+ assert(rv == 0);
+ // Cap stack at time-honored 8 MiB value.
+ rlim.rlim_max = rlim.rlim_cur;
+ if (rlim.rlim_max > 8 * 1024 * 1024) {
+ rlim.rlim_max = 8 * 1024 * 1024;
+ }
+ rv = setrlimit(RLIMIT_STACK, &rlim);
+ assert(rv == 0);
+ }
+ {
+ /*
+ * Regular stack is overwritten completely during testing.
+ * All the useful work is done in the signal handlers.
+ */
+ const size_t len = (MINSIGSTKSZ + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
+ void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ assert(p != MAP_FAILED);
+ stack_t ss = {};
+ ss.ss_sp = p;
+ ss.ss_size = len;
+ int rv = sigaltstack(&ss, NULL);
+ assert(rv == 0);
+ }
+ make_stack1();
+ __builtin_trap();
+}
On 10/1/23 09:31, Alexey Dobriyan wrote: > Here is how it works: > > * fault and fill the stack from rsp with int3 down until rlimit allows, > * fill upwards with int3 too, overwrite libc stuff, argv, envp, > * try to exec int3 on each page and catch it with either SIGSEGV or > SIGTRAP handler. > > Note: trying to execute _every_ int3 takes 30-40 seconds even on fast > machine, so only 1 int3 per page is tried. > > Tested on F37 kernel and on custom kernel which did > > vm_flags |= VM_EXEC; > > to stack VMA. I guess the subject implies it, but it's probably worth a sentence or two in the changelog about this being 64-bit only. IIRC, there _are_ x86_64 CPUs that don't support NX. It's also entirely possible for a hypervisor to disable NX enumeration for a guest. Those two are (probably) rare enough that they can be ignored for now. But it might mean adding a CPUID check at some point. Basically, could you spend a moment in the changelog to talk about: 1. 32-bit kernels on NX hardware and 2. 64-bit kernels on non-NX hardware ?
On Mon, Oct 02, 2023 at 07:23:10AM -0700, Dave Hansen wrote: > On 10/1/23 09:31, Alexey Dobriyan wrote: > > Here is how it works: > > > > * fault and fill the stack from rsp with int3 down until rlimit allows, > > * fill upwards with int3 too, overwrite libc stuff, argv, envp, > > * try to exec int3 on each page and catch it with either SIGSEGV or > > SIGTRAP handler. > > > > Note: trying to execute _every_ int3 takes 30-40 seconds even on fast > > machine, so only 1 int3 per page is tried. > > > > Tested on F37 kernel and on custom kernel which did > > > > vm_flags |= VM_EXEC; > > > > to stack VMA. > > I guess the subject implies it, but it's probably worth a sentence or > two in the changelog about this being 64-bit only. > > IIRC, there _are_ x86_64 CPUs that don't support NX. It's also entirely > possible for a hypervisor to disable NX enumeration for a guest. Those > two are (probably) rare enough that they can be ignored for now. But it > might mean adding a CPUID check at some point. > > Basically, could you spend a moment in the changelog to talk about: > > 1. 32-bit kernels on NX hardware > and > 2. 64-bit kernels on non-NX hardware Sure. My logic whas that i386 is dead arch, but this test is easy to port to i386, only 2 simple functions. I don't want to parse /proc/cpuinfo. If someone knows they're shipping NX-incapable hardware, just let them disable the test.
On 10/3/23 06:00, Alexey Dobriyan wrote: > On Mon, Oct 02, 2023 at 07:23:10AM -0700, Dave Hansen wrote: >> Basically, could you spend a moment in the changelog to talk about: >> >> 1. 32-bit kernels on NX hardware >> and >> 2. 64-bit kernels on non-NX hardware > > Sure. My logic whas that i386 is dead arch, but this test is easy to > port to i386, only 2 simple functions. I honestly don't feel strongly about it one way or the other. But whatever we do, let's explain it, please. > I don't want to parse /proc/cpuinfo. If someone knows they're shipping > NX-incapable hardware, just let them disable the test. Other than clearcpuid=nx, I don't _think_ we have any way to clear the X86_FEATURE_NX bit right now. That should mean that you can use regular old CPUID to see if the booted kernel supports NX. Perhaps something like what: tools/testing/selftests/x86/amx.c does with CPUID_LEAF1_ECX_XSAVE_MASK. That should be quite a bit easier than parsing /proc/cpuinfo. If someone does use clearcpuid, then I think it's perfectly reasonable to fail the selftest.
* Dave Hansen <dave.hansen@intel.com> wrote: > On 10/3/23 06:00, Alexey Dobriyan wrote: > > On Mon, Oct 02, 2023 at 07:23:10AM -0700, Dave Hansen wrote: > >> Basically, could you spend a moment in the changelog to talk about: > >> > >> 1. 32-bit kernels on NX hardware > >> and > >> 2. 64-bit kernels on non-NX hardware > > > > Sure. My logic whas that i386 is dead arch, but this test is easy to > > port to i386, only 2 simple functions. > > I honestly don't feel strongly about it one way or the other. But > whatever we do, let's explain it, please. > > > I don't want to parse /proc/cpuinfo. If someone knows they're shipping > > NX-incapable hardware, just let them disable the test. > > Other than clearcpuid=nx, I don't _think_ we have any way to clear the > X86_FEATURE_NX bit right now. That should mean that you can use regular > old CPUID to see if the booted kernel supports NX. [...] I think that's probably overkill - the test should report a failure if NX is not available for whatever reason. Because not having NX in 2023 on any system that is threatened is a big security vulnerability in itself, and whether the vendor or owner intentionally did that or not doesn't really matter, and a failing kernel testcase will be the least of their problems. In fact I'd argue that we should fail this testcase in that situation as a matter of principle: NX clearly doesn't work and there's very few situations where that's acceptable. Anyone who doesn't want or have NX can skip paying attention to this failing testcase just fine. Thanks, Ingo
* Ingo Molnar <mingo@kernel.org> wrote: > Because not having NX in 2023 on any system that is threatened is a > big security vulnerability in itself, and whether the vendor or owner > intentionally did that or not doesn't really matter, and a failing > kernel testcase will be the least of their problems. BTW., it's also questionable whether the owner is *aware* of the fact that NX is not available: what if some kernel debug option cleared the NX flag, unintended, or there's some serious firmware bug? However unlikely those situations might be, I think unconditionally warning about NX not available is a very 2023 thing to do. Thanks, Ingo
On 10/3/23 12:30, Ingo Molnar wrote: > * Ingo Molnar <mingo@kernel.org> wrote: >> Because not having NX in 2023 on any system that is threatened is a >> big security vulnerability in itself, and whether the vendor or owner >> intentionally did that or not doesn't really matter, and a failing >> kernel testcase will be the least of their problems. > BTW., it's also questionable whether the owner is *aware* of the fact that > NX is not available: what if some kernel debug option cleared the NX flag, > unintended, or there's some serious firmware bug? > > However unlikely those situations might be, I think unconditionally warning > about NX not available is a very 2023 thing to do. 100% agree for x86_64. Any sane x86_64 system has NX and the rest are noise that can live with the error message, unless someone shows up with a compelling reason why not. For 32-bit, the situation is reversed. The majority of 32-bit-only CPUs never had NX. The only reason to even *do* this check on 32-bit is that we think folks are running i386 kernels on x86_64 hardware _or_ we just don't care about 32-bit in the first place. In the end, I think if we're going to do this test on i386, we should _also_ do the 5-lines-of-code CPUID check. But I honestly don't care that much. I wouldn't NAK (or not merge) this patch over it.
On October 3, 2023 1:46:20 PM PDT, Dave Hansen <dave.hansen@intel.com> wrote: >On 10/3/23 12:30, Ingo Molnar wrote: >> * Ingo Molnar <mingo@kernel.org> wrote: >>> Because not having NX in 2023 on any system that is threatened is a >>> big security vulnerability in itself, and whether the vendor or owner >>> intentionally did that or not doesn't really matter, and a failing >>> kernel testcase will be the least of their problems. >> BTW., it's also questionable whether the owner is *aware* of the fact that >> NX is not available: what if some kernel debug option cleared the NX flag, >> unintended, or there's some serious firmware bug? >> >> However unlikely those situations might be, I think unconditionally warning >> about NX not available is a very 2023 thing to do. > >100% agree for x86_64. Any sane x86_64 system has NX and the rest are >noise that can live with the error message, unless someone shows up with >a compelling reason why not. > >For 32-bit, the situation is reversed. The majority of 32-bit-only CPUs >never had NX. The only reason to even *do* this check on 32-bit is that >we think folks are running i386 kernels on x86_64 hardware _or_ we just >don't care about 32-bit in the first place. > >In the end, I think if we're going to do this test on i386, we should >_also_ do the 5-lines-of-code CPUID check. But I honestly don't care >that much. I wouldn't NAK (or not merge) this patch over it. Perhaps we should also complain at people who are still running 32-bit kernels on 64-bit hardware? It has been 20 years...
* Alexey Dobriyan <adobriyan@gmail.com> wrote: > Here is how it works: > > * fault and fill the stack from rsp with int3 down until rlimit allows, > * fill upwards with int3 too, overwrite libc stuff, argv, envp, > * try to exec int3 on each page and catch it with either SIGSEGV or > SIGTRAP handler. > > Note: trying to execute _every_ int3 takes 30-40 seconds even on fast > machine, so only 1 int3 per page is tried. > > Tested on F37 kernel and on custom kernel which did > > vm_flags |= VM_EXEC; > > to stack VMA. > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> > --- > > tools/testing/selftests/x86/Makefile | 3 > tools/testing/selftests/x86/nx_stack.c | 167 +++++++++++++++++++++++++++++++++ > 2 files changed, 170 insertions(+) Ok, that's a good idea, but could the test case please output something human-readable that indicates whether the test was a success or not? A typical testcase output is like: kepler:~/tip/tools/testing/selftests/x86> ./sigaltstack_32 [RUN] Test an alternate signal stack of sufficient size. Raise SIGALRM. It is expected to be delivered. [OK] SIGALRM signal delivered. Or: kepler:~/tip/tools/testing/selftests/x86> ./test_vsyscall_64 ... [OK] vsyscalls are emulated (1 instructions in vsyscall page) ... ... where the 'OK' denotes success of a test. The nx_stack testcase only outputs: stack min 00007fffd75b5000 stack max 00007fffd7db5000 ... with only the exit code denoting success/failure. Thanks, Ingo
On Mon, Oct 02, 2023 at 03:12:35PM +0200, Ingo Molnar wrote: > > * Alexey Dobriyan <adobriyan@gmail.com> wrote: > > > Here is how it works: > > > > * fault and fill the stack from rsp with int3 down until rlimit allows, > > * fill upwards with int3 too, overwrite libc stuff, argv, envp, > > * try to exec int3 on each page and catch it with either SIGSEGV or > > SIGTRAP handler. > > > > Note: trying to execute _every_ int3 takes 30-40 seconds even on fast > > machine, so only 1 int3 per page is tried. > > > > Tested on F37 kernel and on custom kernel which did > > > > vm_flags |= VM_EXEC; > > > > to stack VMA. > > > > Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> > > --- > > > > tools/testing/selftests/x86/Makefile | 3 > > tools/testing/selftests/x86/nx_stack.c | 167 +++++++++++++++++++++++++++++++++ > > 2 files changed, 170 insertions(+) > > Ok, that's a good idea, but could the test case please output something > human-readable that indicates whether the test was a success or not? > > A typical testcase output is like: > > kepler:~/tip/tools/testing/selftests/x86> ./sigaltstack_32 > [RUN] Test an alternate signal stack of sufficient size. > Raise SIGALRM. It is expected to be delivered. > [OK] SIGALRM signal delivered. > > Or: > > kepler:~/tip/tools/testing/selftests/x86> ./test_vsyscall_64 > ... > [OK] vsyscalls are emulated (1 instructions in vsyscall page) > ... > > ... where the 'OK' denotes success of a test. > > The nx_stack testcase only outputs: > > stack min 00007fffd75b5000 > stack max 00007fffd7db5000 > > ... with only the exit code denoting success/failure. The way Unix Founding Fathers intented! :-) OK, I'll add something.
Here is how it works:
* fault and fill the stack from rsp with int3 down until rlimit allows,
* fill upwards with int3 too, overwrite libc stuff, argv, envp,
* try to exec int3 on each page and catch it in either SIGSEGV or
SIGTRAP handler.
Note: trying to execute _every_ int3 on a 8 MiB stack takes 30-40 seconds
even on fast machine which is too much for kernel selftesting
(not for LTP!) so only 1 int3 per page is tried.
Tested on F37 kernel and on a custom kernel which does
vm_flags |= VM_EXEC;
to stack VMA.
Report from the buggy kernel:
$ ./nx_stack_32
stack min ff007000
stack max ff807000
FAIL executable page on the stack: eip ff806001
$ ./nx_stack_64
stack min 7ffe65bb0000
stack max 7ffe663b0000
FAIL executable page on the stack: rip 7ffe663af001
Changes since v1:
i386 support
nice pretty printing of test result
cld in the SIGSEGV handler for robustness
SIGSTKSZ is recommended not MINSIGSTKSZ
better comments
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
tools/testing/selftests/x86/Makefile | 4
tools/testing/selftests/x86/nx_stack.c | 212 +++++++++++++++++++++++++++++++++
2 files changed, 216 insertions(+)
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -14,6 +14,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap
check_initial_reg_state sigreturn iopl ioperm \
test_vsyscall mov_ss_trap \
syscall_arg_fault fsgsbase_restore sigaltstack
+TARGETS_C_BOTHBITS += nx_stack
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
@@ -109,3 +110,6 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S
# state.
$(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
$(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
+
+$(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack
+$(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack
new file mode 100644
--- /dev/null
+++ b/tools/testing/selftests/x86/nx_stack.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2023 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that userspace stack is NX. Requires linking with -Wl,-z,noexecstack
+ * because I don't want to bother with PT_GNU_STACK detection.
+ *
+ * Fill the stack with int3's and then try to execute some of them:
+ * SIGSEGV -- good, SIGTRAP -- bad.
+ *
+ * Regular stack is completely overwritten before testing.
+ * Test doesn't exit SIGSEGV handler after first fault at int3.
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+#undef NDEBUG
+#include <assert.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+#define PAGE_SIZE 4096
+
+/*
+ * This is memset(rsp, 0xcc, -1); but down.
+ * It will SIGSEGV when bottom of the stack is reached.
+ * Byte-size access is important! (see rdi tweak in the signal handler).
+ */
+void make_stack1(void);
+asm(
+".pushsection .text\n"
+".globl make_stack1\n"
+".align 16\n"
+"make_stack1:\n"
+ "mov $0xcc, %al\n"
+#if defined __amd64__
+ "mov %rsp, %rdi\n"
+ "mov $-1, %rcx\n"
+#elif defined __i386__
+ "mov %esp, %edi\n"
+ "mov $-1, %ecx\n"
+#else
+#error
+#endif
+ "std\n"
+ "rep stosb\n"
+ /* unreachable */
+ "hlt\n"
+".type make_stack1,@function\n"
+".size make_stack1,.-make_stack1\n"
+".popsection\n"
+);
+
+/*
+ * memset(p, 0xcc, -1);
+ * It will SIGSEGV when top of the stack is reached.
+ */
+void make_stack2(uint64_t p);
+asm(
+".pushsection .text\n"
+".globl make_stack2\n"
+".align 16\n"
+"make_stack2:\n"
+ "mov $0xcc, %al\n"
+#if defined __amd64__
+ "mov $-1, %rcx\n"
+#elif defined __i386__
+ "mov $-1, %ecx\n"
+#else
+#error
+#endif
+ "cld\n"
+ "rep stosb\n"
+ /* unreachable */
+ "hlt\n"
+".type make_stack2,@function\n"
+".size make_stack2,.-make_stack2\n"
+".popsection\n"
+);
+
+static volatile int test_state = 0;
+static volatile unsigned long stack_min_addr;
+
+#if defined __amd64__
+#define RDI REG_RDI
+#define RIP REG_RIP
+#define RIP_STRING "rip"
+#elif defined __i386__
+#define RDI REG_EDI
+#define RIP REG_EIP
+#define RIP_STRING "eip"
+#else
+#error
+#endif
+
+static void sigsegv(int _, siginfo_t *__, void *uc_)
+{
+ /*
+ * Some Linux versions didn't clear DF before entering signal
+ * handler. make_stack1() doesn't have a chance to clear DF
+ * either so we clear it by hand here.
+ */
+ asm volatile ("cld" ::: "memory");
+
+ ucontext_t *uc = uc_;
+
+ if (test_state == 0) {
+ /* Stack is faulted and cleared from rsp to the lowest address. */
+ stack_min_addr = ++uc->uc_mcontext.gregs[RDI];
+ if (1) {
+ printf("stack min %lx\n", stack_min_addr);
+ }
+ uc->uc_mcontext.gregs[RIP] = (uintptr_t)&make_stack2;
+ test_state = 1;
+ } else if (test_state == 1) {
+ /* Stack has been cleared from top to bottom. */
+ unsigned long stack_max_addr = uc->uc_mcontext.gregs[RDI];
+ if (1) {
+ printf("stack max %lx\n", stack_max_addr);
+ }
+ /* Start faulting pages on stack and see what happens. */
+ uc->uc_mcontext.gregs[RIP] = stack_max_addr - PAGE_SIZE;
+ test_state = 2;
+ } else if (test_state == 2) {
+ /* Stack page is NX -- good, test next page. */
+ uc->uc_mcontext.gregs[RIP] -= PAGE_SIZE;
+ if (uc->uc_mcontext.gregs[RIP] == stack_min_addr) {
+ /* One more SIGSEGV and test ends. */
+ test_state = 3;
+ }
+ } else {
+ printf("PASS\tAll stack pages are NX\n");
+ _exit(EXIT_SUCCESS);
+ }
+}
+
+static void sigtrap(int _, siginfo_t *__, void *uc_)
+{
+ const ucontext_t *uc = uc_;
+ unsigned long rip = uc->uc_mcontext.gregs[RIP];
+ printf("FAIL\texecutable page on the stack: " RIP_STRING " %lx\n", rip);
+ _exit(EXIT_FAILURE);
+}
+
+int main(void)
+{
+ {
+ struct sigaction act = {};
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_sigaction = &sigsegv;
+ int rv = sigaction(SIGSEGV, &act, NULL);
+ assert(rv == 0);
+ }
+ {
+ struct sigaction act = {};
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_sigaction = &sigtrap;
+ int rv = sigaction(SIGTRAP, &act, NULL);
+ assert(rv == 0);
+ }
+ {
+ struct rlimit rlim;
+ int rv = getrlimit(RLIMIT_STACK, &rlim);
+ assert(rv == 0);
+ /* Cap stack at time-honored 8 MiB value. */
+ rlim.rlim_max = rlim.rlim_cur;
+ if (rlim.rlim_max > 8 * 1024 * 1024) {
+ rlim.rlim_max = 8 * 1024 * 1024;
+ }
+ rv = setrlimit(RLIMIT_STACK, &rlim);
+ assert(rv == 0);
+ }
+ {
+ /*
+ * We don't know now much stack SIGSEGV handler uses.
+ * Bump this by 1 page every time someone complains,
+ * or rewrite it in assembly.
+ */
+ const size_t len = SIGSTKSZ;
+ void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ assert(p != MAP_FAILED);
+ stack_t ss = {};
+ ss.ss_sp = p;
+ ss.ss_size = len;
+ int rv = sigaltstack(&ss, NULL);
+ assert(rv == 0);
+ }
+ make_stack1();
+ /*
+ * Unreachable, but if _this_ int3 is ever reached, it's a bug somewhere.
+ * Fold it into main SIGTRAP pathway.
+ */
+ __builtin_trap();
+}
* Alexey Dobriyan <adobriyan@gmail.com> wrote: > Here is how it works: > > * fault and fill the stack from rsp with int3 down until rlimit allows, > * fill upwards with int3 too, overwrite libc stuff, argv, envp, > * try to exec int3 on each page and catch it in either SIGSEGV or > SIGTRAP handler. > > Note: trying to execute _every_ int3 on a 8 MiB stack takes 30-40 seconds > even on fast machine which is too much for kernel selftesting > (not for LTP!) so only 1 int3 per page is tried. > > Tested on F37 kernel and on a custom kernel which does > > vm_flags |= VM_EXEC; > > to stack VMA. > > Report from the buggy kernel: > > $ ./nx_stack_32 > stack min ff007000 > stack max ff807000 > FAIL executable page on the stack: eip ff806001 > > $ ./nx_stack_64 > stack min 7ffe65bb0000 > stack max 7ffe663b0000 > FAIL executable page on the stack: rip 7ffe663af001 Nice, thanks! Ingo
The following commit has been merged into the x86/mm branch of tip:
Commit-ID: 802e87cc464613441f9098ebf940b1895fe3f5e5
Gitweb: https://git.kernel.org/tip/802e87cc464613441f9098ebf940b1895fe3f5e5
Author: Alexey Dobriyan <adobriyan@gmail.com>
AuthorDate: Tue, 03 Oct 2023 19:18:43 +03:00
Committer: Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 03 Oct 2023 21:00:45 +02:00
selftests/x86/mm: Add new test that userspace stack is in fact NX
Here is how it works:
* fault and fill the stack from RSP with INT3 down until rlimit allows,
* fill upwards with INT3 too, overwrite libc stuff, argv, envp,
* try to exec INT3 on each page and catch it in either SIGSEGV or
SIGTRAP handler.
Note: trying to execute _every_ INT3 on a 8 MiB stack takes 30-40 seconds
even on fast machine which is too much for kernel selftesting
(not for LTP!) so only 1 INT3 per page is tried.
Tested on F37 kernel and on a custom kernel which does:
vm_flags |= VM_EXEC;
to stack VMA.
Report from the buggy kernel:
$ ./nx_stack_32
stack min ff007000
stack max ff807000
FAIL executable page on the stack: eip ff806001
$ ./nx_stack_64
stack min 7ffe65bb0000
stack max 7ffe663b0000
FAIL executable page on the stack: rip 7ffe663af001
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/4cef8266-ad6d-48af-a5f1-fc2b6a8eb422@p183
---
tools/testing/selftests/x86/Makefile | 4 +-
tools/testing/selftests/x86/nx_stack.c | 212 ++++++++++++++++++++++++-
2 files changed, 216 insertions(+)
create mode 100644 tools/testing/selftests/x86/nx_stack.c
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 7e8c937..0b872c0 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -14,6 +14,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap
check_initial_reg_state sigreturn iopl ioperm \
test_vsyscall mov_ss_trap \
syscall_arg_fault fsgsbase_restore sigaltstack
+TARGETS_C_BOTHBITS += nx_stack
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
@@ -109,3 +110,6 @@ $(OUTPUT)/test_syscall_vdso_32: thunks_32.S
# state.
$(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
$(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
+
+$(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack
+$(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack
diff --git a/tools/testing/selftests/x86/nx_stack.c b/tools/testing/selftests/x86/nx_stack.c
new file mode 100644
index 0000000..ea4a4e2
--- /dev/null
+++ b/tools/testing/selftests/x86/nx_stack.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2023 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that userspace stack is NX. Requires linking with -Wl,-z,noexecstack
+ * because I don't want to bother with PT_GNU_STACK detection.
+ *
+ * Fill the stack with INT3's and then try to execute some of them:
+ * SIGSEGV -- good, SIGTRAP -- bad.
+ *
+ * Regular stack is completely overwritten before testing.
+ * Test doesn't exit SIGSEGV handler after first fault at INT3.
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+#undef NDEBUG
+#include <assert.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+#define PAGE_SIZE 4096
+
+/*
+ * This is memset(rsp, 0xcc, -1); but down.
+ * It will SIGSEGV when bottom of the stack is reached.
+ * Byte-size access is important! (see rdi tweak in the signal handler).
+ */
+void make_stack1(void);
+asm(
+".pushsection .text\n"
+".globl make_stack1\n"
+".align 16\n"
+"make_stack1:\n"
+ "mov $0xcc, %al\n"
+#if defined __amd64__
+ "mov %rsp, %rdi\n"
+ "mov $-1, %rcx\n"
+#elif defined __i386__
+ "mov %esp, %edi\n"
+ "mov $-1, %ecx\n"
+#else
+#error
+#endif
+ "std\n"
+ "rep stosb\n"
+ /* unreachable */
+ "hlt\n"
+".type make_stack1,@function\n"
+".size make_stack1,.-make_stack1\n"
+".popsection\n"
+);
+
+/*
+ * memset(p, 0xcc, -1);
+ * It will SIGSEGV when top of the stack is reached.
+ */
+void make_stack2(uint64_t p);
+asm(
+".pushsection .text\n"
+".globl make_stack2\n"
+".align 16\n"
+"make_stack2:\n"
+ "mov $0xcc, %al\n"
+#if defined __amd64__
+ "mov $-1, %rcx\n"
+#elif defined __i386__
+ "mov $-1, %ecx\n"
+#else
+#error
+#endif
+ "cld\n"
+ "rep stosb\n"
+ /* unreachable */
+ "hlt\n"
+".type make_stack2,@function\n"
+".size make_stack2,.-make_stack2\n"
+".popsection\n"
+);
+
+static volatile int test_state = 0;
+static volatile unsigned long stack_min_addr;
+
+#if defined __amd64__
+#define RDI REG_RDI
+#define RIP REG_RIP
+#define RIP_STRING "rip"
+#elif defined __i386__
+#define RDI REG_EDI
+#define RIP REG_EIP
+#define RIP_STRING "eip"
+#else
+#error
+#endif
+
+static void sigsegv(int _, siginfo_t *__, void *uc_)
+{
+ /*
+ * Some Linux versions didn't clear DF before entering signal
+ * handler. make_stack1() doesn't have a chance to clear DF
+ * either so we clear it by hand here.
+ */
+ asm volatile ("cld" ::: "memory");
+
+ ucontext_t *uc = uc_;
+
+ if (test_state == 0) {
+ /* Stack is faulted and cleared from RSP to the lowest address. */
+ stack_min_addr = ++uc->uc_mcontext.gregs[RDI];
+ if (1) {
+ printf("stack min %lx\n", stack_min_addr);
+ }
+ uc->uc_mcontext.gregs[RIP] = (uintptr_t)&make_stack2;
+ test_state = 1;
+ } else if (test_state == 1) {
+ /* Stack has been cleared from top to bottom. */
+ unsigned long stack_max_addr = uc->uc_mcontext.gregs[RDI];
+ if (1) {
+ printf("stack max %lx\n", stack_max_addr);
+ }
+ /* Start faulting pages on stack and see what happens. */
+ uc->uc_mcontext.gregs[RIP] = stack_max_addr - PAGE_SIZE;
+ test_state = 2;
+ } else if (test_state == 2) {
+ /* Stack page is NX -- good, test next page. */
+ uc->uc_mcontext.gregs[RIP] -= PAGE_SIZE;
+ if (uc->uc_mcontext.gregs[RIP] == stack_min_addr) {
+ /* One more SIGSEGV and test ends. */
+ test_state = 3;
+ }
+ } else {
+ printf("PASS\tAll stack pages are NX\n");
+ _exit(EXIT_SUCCESS);
+ }
+}
+
+static void sigtrap(int _, siginfo_t *__, void *uc_)
+{
+ const ucontext_t *uc = uc_;
+ unsigned long rip = uc->uc_mcontext.gregs[RIP];
+ printf("FAIL\texecutable page on the stack: " RIP_STRING " %lx\n", rip);
+ _exit(EXIT_FAILURE);
+}
+
+int main(void)
+{
+ {
+ struct sigaction act = {};
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_sigaction = &sigsegv;
+ int rv = sigaction(SIGSEGV, &act, NULL);
+ assert(rv == 0);
+ }
+ {
+ struct sigaction act = {};
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_sigaction = &sigtrap;
+ int rv = sigaction(SIGTRAP, &act, NULL);
+ assert(rv == 0);
+ }
+ {
+ struct rlimit rlim;
+ int rv = getrlimit(RLIMIT_STACK, &rlim);
+ assert(rv == 0);
+ /* Cap stack at time-honored 8 MiB value. */
+ rlim.rlim_max = rlim.rlim_cur;
+ if (rlim.rlim_max > 8 * 1024 * 1024) {
+ rlim.rlim_max = 8 * 1024 * 1024;
+ }
+ rv = setrlimit(RLIMIT_STACK, &rlim);
+ assert(rv == 0);
+ }
+ {
+ /*
+ * We don't know now much stack SIGSEGV handler uses.
+ * Bump this by 1 page every time someone complains,
+ * or rewrite it in assembly.
+ */
+ const size_t len = SIGSTKSZ;
+ void *p = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ assert(p != MAP_FAILED);
+ stack_t ss = {};
+ ss.ss_sp = p;
+ ss.ss_size = len;
+ int rv = sigaltstack(&ss, NULL);
+ assert(rv == 0);
+ }
+ make_stack1();
+ /*
+ * Unreachable, but if _this_ INT3 is ever reached, it's a bug somewhere.
+ * Fold it into main SIGTRAP pathway.
+ */
+ __builtin_trap();
+}
© 2016 - 2025 Red Hat, Inc.