[PATCH v12 10/28] riscv/mm: Implement map_shadow_stack() syscall

Posted by Deepak Gupta 11 months ago
As discussed extensively in the changelog for the addition of this
syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the
existing mmap() and madvise() syscalls do not map entirely well onto the
security requirements for shadow stack memory since they lead to windows
where memory is allocated but not yet protected or stacks which are not
properly and safely initialised. Instead a new syscall map_shadow_stack()
has been defined which allocates and initialises a shadow stack page.

This patch implements this syscall for riscv. riscv doesn't require the
token to be set up by the kernel because user mode can do that by itself.
However, to provide compatibility and portability with other architectures,
user mode can specify the token set flag.
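
For illustration, a minimal user-space caller could look like the sketch
below (not part of the patch; the fallback macro definitions are assumptions
for the case where installed uapi headers predate the syscall and the flag):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_map_shadow_stack
#define __NR_map_shadow_stack	453		/* assumption: generic syscall number */
#endif
#ifndef SHADOW_STACK_SET_TOKEN
#define SHADOW_STACK_SET_TOKEN	(1ULL << 0)	/* assumption: matches the uapi flag */
#endif

int main(void)
{
	unsigned long size = 4 * 4096;

	/* Let the kernel pick the address and place a restore token. */
	long ssp = syscall(__NR_map_shadow_stack, 0UL, size, SHADOW_STACK_SET_TOKEN);

	if (ssp == -1) {
		perror("map_shadow_stack");
		return 1;
	}

	/* The returned value is the address of the token, one entry below the stack top. */
	printf("shadow stack token at %#lx\n", (unsigned long)ssp);
	return 0;
}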

Reviewed-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Deepak Gupta <debug@rivosinc.com>
---
 arch/riscv/kernel/Makefile  |   1 +
 arch/riscv/kernel/usercfi.c | 144 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)

diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 8d186bfced45..3a861d320654 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_ACPI)		+= acpi.o
 obj-$(CONFIG_ACPI_NUMA)	+= acpi_numa.o
 
 obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += bugs.o
+obj-$(CONFIG_RISCV_USER_CFI) += usercfi.o
diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
new file mode 100644
index 000000000000..24022809a7b5
--- /dev/null
+++ b/arch/riscv/kernel/usercfi.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Rivos, Inc.
+ * Deepak Gupta <debug@rivosinc.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <linux/syscalls.h>
+#include <linux/prctl.h>
+#include <asm/csr.h>
+#include <asm/usercfi.h>
+
+#define SHSTK_ENTRY_SIZE sizeof(void *)
+
+/*
+ * Writes to the shadow stack are done either with `sspush` or `ssamoswap`. `sspush` writes
+ * implicitly to the current shadow stack pointed to by CSR_SSP, while `ssamoswap` takes a
+ * pointer to the shadow stack. To keep it simple, we use `ssamoswap` to perform writes on
+ * the shadow stack.
+ */
+static noinline unsigned long amo_user_shstk(unsigned long *addr, unsigned long val)
+{
+	/*
+	 * Never expect -1 on shadow stack. Expect return addresses and zero
+	 */
+	unsigned long swap = -1;
+
+	__enable_user_access();
+	asm goto(
+		".option push\n"
+		".option arch, +zicfiss\n"
+		"1: ssamoswap.d %[swap], %[val], %[addr]\n"
+		_ASM_EXTABLE(1b, %l[fault])
+		RISCV_ACQUIRE_BARRIER
+		".option pop\n"
+		: [swap] "=r" (swap), [addr] "+A" (*addr)
+		: [val] "r" (val)
+		: "memory"
+		: fault
+		);
+	__disable_user_access();
+	return swap;
+fault:
+	__disable_user_access();
+	return -1;
+}
+
+/*
+ * Create a restore token on the shadow stack.  A token is always XLEN wide
+ * and aligned to XLEN.
+ */
+static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
+{
+	unsigned long addr;
+
+	/* Token must be aligned */
+	if (!IS_ALIGNED(ssp, SHSTK_ENTRY_SIZE))
+		return -EINVAL;
+
+	/* On RISC-V we're constructing token to be function of address itself */
+	addr = ssp - SHSTK_ENTRY_SIZE;
+
+	if (amo_user_shstk((unsigned long __user *)addr, (unsigned long)ssp) == -1)
+		return -EFAULT;
+
+	if (token_addr)
+		*token_addr = addr;
+
+	return 0;
+}
+
+static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size,
+					   unsigned long token_offset, bool set_tok)
+{
+	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+	struct mm_struct *mm = current->mm;
+	unsigned long populate, tok_loc = 0;
+
+	if (addr)
+		flags |= MAP_FIXED_NOREPLACE;
+
+	mmap_write_lock(mm);
+	addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+		       VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
+	mmap_write_unlock(mm);
+
+	if (!set_tok || IS_ERR_VALUE(addr))
+		goto out;
+
+	if (create_rstor_token(addr + token_offset, &tok_loc)) {
+		vm_munmap(addr, size);
+		return -EINVAL;
+	}
+
+	addr = tok_loc;
+
+out:
+	return addr;
+}
+
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
+{
+	bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
+	unsigned long aligned_size = 0;
+
+	if (!cpu_supports_shadow_stack())
+		return -EOPNOTSUPP;
+
+	/* Anything other than set token should result in invalid param */
+	if (flags & ~SHADOW_STACK_SET_TOKEN)
+		return -EINVAL;
+
+	/*
+	 * Unlike other architectures, on RISC-V the shadow stack pointer is held in CSR_SSP, a
+	 * CSR available in all modes. CSR accesses use a 12-bit index encoded in the instruction
+	 * itself. This makes register selection static, so writes to the CSR can't be
+	 * unintentional from the programmer's perspective. As long as the programmer guards the
+	 * code paths that write CSR_SSP properly, shadow stack pivoting is not possible. Since
+	 * CSR_SSP is writeable by user mode, user mode can itself set up a shadow stack token
+	 * after allocation. However, to provide portability with other architectures (because
+	 * `map_shadow_stack` is an arch-agnostic syscall), RISC-V follows the convention of a
+	 * token flag in flags and, if it is provided, sets up a token at the base.
+	 */
+
+	/* If there isn't space for a token */
+	if (set_tok && size < SHSTK_ENTRY_SIZE)
+		return -ENOSPC;
+
+	if (addr && (addr & (PAGE_SIZE - 1)))
+		return -EINVAL;
+
+	aligned_size = PAGE_ALIGN(size);
+	if (aligned_size < size)
+		return -EOVERFLOW;
+
+	return allocate_shadow_stack(addr, aligned_size, size, set_tok);
+}

-- 
2.34.1
Re: [PATCH v12 10/28] riscv/mm: Implement map_shadow_stack() syscall
Posted by Radim Krčmář 10 months ago
2025-03-14T14:39:29-07:00, Deepak Gupta <debug@rivosinc.com>:
> As discussed extensively in the changelog for the addition of this
> syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the
> existing mmap() and madvise() syscalls do not map entirely well onto the
> security requirements for shadow stack memory since they lead to windows
> where memory is allocated but not yet protected or stacks which are not
> properly and safely initialised. Instead a new syscall map_shadow_stack()
> has been defined which allocates and initialises a shadow stack page.
>
> This patch implements this syscall for riscv. riscv doesn't require token
> to be setup by kernel because user mode can do that by itself. However to
> provide compatibility and portability with other architectues, user mode
> can specify token set flag.

RISC-V shadow stack could use mmap() and madvise() perfectly well.
Userspace can always initialize the shadow stack properly and the shadow
stack memory is never protected from other malicious threads.

I think that the compatibility argument is reasonable.  We'd need to
modify the other syscalls to allow a write-only mapping anyway.
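
For illustration, the user-mode-only setup mentioned above could look roughly
like this sketch, which mirrors create_rstor_token() from the patch and
assumes Zicfiss is enabled for the task and the mapping is already
shadow-stack memory:

/*
 * Sketch: user mode placing its own restore token on an already-mapped
 * shadow stack.  ssp is the (8-byte aligned) top of the shadow stack.
 */
static unsigned long user_set_ss_token(unsigned long ssp)
{
	unsigned long *slot = (unsigned long *)(ssp - sizeof(unsigned long));
	unsigned long old;

	asm volatile(
		".option push\n"
		".option arch, +zicfiss\n"
		"ssamoswap.d %0, %2, %1\n"
		".option pop\n"
		: "=r" (old), "+A" (*slot)
		: "r" (ssp)
		: "memory");

	return (unsigned long)slot;	/* address of the token */
}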

> diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
> +static noinline unsigned long amo_user_shstk(unsigned long *addr, unsigned long val)
> +{
> +	/*
> +	 * Never expect -1 on shadow stack. Expect return addresses and zero
> +	 */
> +	unsigned long swap = -1;
> +	__enable_user_access();
> +	asm goto(
> +		".option push\n"
> +		".option arch, +zicfiss\n"

Shouldn't compiler accept ssamoswap.d opcode even without zicfiss arch?

> +		"1: ssamoswap.d %[swap], %[val], %[addr]\n"
> +		_ASM_EXTABLE(1b, %l[fault])
> +		RISCV_ACQUIRE_BARRIER

Why is the barrier here?

> +		".option pop\n"
> +		: [swap] "=r" (swap), [addr] "+A" (*addr)
> +		: [val] "r" (val)
> +		: "memory"
> +		: fault
> +		);
> +	__disable_user_access();
> +	return swap;
> +fault:
> +	__disable_user_access();
> +	return -1;

I think we should return 0 and -EFAULT.
We can ignore the swapped value, or return it through a pointer.

> +}
> +
> +static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size,
> +					   unsigned long token_offset, bool set_tok)
> +{
> +	int flags = MAP_ANONYMOUS | MAP_PRIVATE;

Is MAP_GROWSDOWN pointless?

> +	struct mm_struct *mm = current->mm;
> +	unsigned long populate, tok_loc = 0;
> +
> +	if (addr)
> +		flags |= MAP_FIXED_NOREPLACE;
> +
> +	mmap_write_lock(mm);
> +	addr = do_mmap(NULL, addr, size, PROT_READ, flags,

PROT_READ implies VM_READ, so won't this select PAGE_COPY in the
protection_map instead of PAGE_SHADOWSTACK?

Wouldn't avoiding VM_READ also allow us to get rid of the ugly hack in
pte_mkwrite?  (VM_WRITE would naturally select the right XWR flags.)

> +		       VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
> +	mmap_write_unlock(mm);
> +
> +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
> +{
> [...]
> +	if (addr && (addr & (PAGE_SIZE - 1)))

if (!PAGE_ALIGNED(addr))
Re: [PATCH v12 10/28] riscv/mm: Implement map_shadow_stack() syscall
Posted by Deepak Gupta 9 months, 3 weeks ago
On Thu, Apr 10, 2025 at 11:56:44AM +0200, Radim Krčmář wrote:
>2025-03-14T14:39:29-07:00, Deepak Gupta <debug@rivosinc.com>:
>> As discussed extensively in the changelog for the addition of this
>> syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the
>> existing mmap() and madvise() syscalls do not map entirely well onto the
>> security requirements for shadow stack memory since they lead to windows
>> where memory is allocated but not yet protected or stacks which are not
>> properly and safely initialised. Instead a new syscall map_shadow_stack()
>> has been defined which allocates and initialises a shadow stack page.
>>
>> This patch implements this syscall for riscv. riscv doesn't require token
>> to be setup by kernel because user mode can do that by itself. However to
>> provide compatibility and portability with other architectues, user mode
>> can specify token set flag.
>
>RISC-V shadow stack could use mmap() and madvise() perfectly well.

Deviating from what other arches are doing will create more churn. I expect
there will be merging of common logic between x86, arm64 and riscv. In fact I
did post one such RFC patch set last year (didn't follow up on it). Using
`mmap/madvise` defeats the purpose of creating common logic between arches.

There are pitfalls, as mentioned, with respect to mmap/madvise because of the
unique nature of shadow stack. That is why it was accepted to create a new
syscall for such mappings. RISC-V will stick to that.

>Userspace can always initialize the shadow stack properly and the shadow
>stack memory is never protected from other malicious threads.

Shadow stack memory is protected from inadvertent stores (be it the same thread
or a different thread in the same address space). Malicious code which can
execute `sspush`/`ssamoswap` already implies that code integrity policies are broken.

>
>I think that the compatibility argument is reasonable.  We'd need to
>modify the other syscalls to allow a write-only mapping anyway.


>
>> diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
>> +static noinline unsigned long amo_user_shstk(unsigned long *addr, unsigned long val)
>> +{
>> +	/*
>> +	 * Never expect -1 on shadow stack. Expect return addresses and zero
>> +	 */
>> +	unsigned long swap = -1;
>> +	__enable_user_access();
>> +	asm goto(
>> +		".option push\n"
>> +		".option arch, +zicfiss\n"
>
>Shouldn't compiler accept ssamoswap.d opcode even without zicfiss arch?

It's an illegal instruction if shadow stacks aren't available. The current
toolchain emits it only if zicfiss is specified in -march.

>
>> +		"1: ssamoswap.d %[swap], %[val], %[addr]\n"
>> +		_ASM_EXTABLE(1b, %l[fault])
>> +		RISCV_ACQUIRE_BARRIER
>
>Why is the barrier here?

IIRC, I was following `arch_cmpxchg_acquire`.
But I think that's not needed.
What we are doing is effectively `arch_xchg_relaxed`, so the barrier is not needed.

I did consider adding it to arch/riscv/include/asm/cmpxchg.h, but there is
limited usage of this primitive, so I kept it local to usercfi.c.

Anyway, I'll re-spin with the barrier removed.
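
For reference, the relaxed variant just drops RISCV_ACQUIRE_BARRIER from the
existing helper; the asm statement would then read:

	__enable_user_access();
	asm goto(
		".option push\n"
		".option arch, +zicfiss\n"
		"1: ssamoswap.d %[swap], %[val], %[addr]\n"
		_ASM_EXTABLE(1b, %l[fault])
		".option pop\n"
		: [swap] "=r" (swap), [addr] "+A" (*addr)
		: [val] "r" (val)
		: "memory"
		: fault
		);
	__disable_user_access();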

>
>> +		".option pop\n"
>> +		: [swap] "=r" (swap), [addr] "+A" (*addr)
>> +		: [val] "r" (val)
>> +		: "memory"
>> +		: fault
>> +		);
>> +	__disable_user_access();
>> +	return swap;
>> +fault:
>> +	__disable_user_access();
>> +	return -1;
>
>I think we should return 0 and -EFAULT.
>We can ignore the swapped value, or return it through a pointer.

The consumer of this detects -1 and then returns -EFAULT.
We would eventually need this when creating shadow stack tokens for the
kernel shadow stack. I believe `-1` is a safe return value which can't
be construed as a negative kernel address (-EFAULT would be).


>
>> +}
>> +
>> +static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size,
>> +					   unsigned long token_offset, bool set_tok)
>> +{
>> +	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
>
>Is MAP_GROWSDOWN pointless?

Not sure. I didn't see that in x86 or arm64 shadow stack creation.
Let me know if it's useful.

>
>> +	struct mm_struct *mm = current->mm;
>> +	unsigned long populate, tok_loc = 0;
>> +
>> +	if (addr)
>> +		flags |= MAP_FIXED_NOREPLACE;
>> +
>> +	mmap_write_lock(mm);
>> +	addr = do_mmap(NULL, addr, size, PROT_READ, flags,
>
>PROT_READ implies VM_READ, so won't this select PAGE_COPY in the
>protection_map instead of PAGE_SHADOWSTACK?

PROT_READ is pointless and redundant here. I haven't checked what happens
if I remove it.

`VM_SHADOW_STACK` takes precedence (take a look at pte_mkwrite and pmd_mkwrite).
The only way `VM_SHADOW_STACK` can end up in the vma flags is via
`map_shadow_stack` or `fork/clone` on an existing task with shadow stack enabled.

In a nutshell, the user can't specify `VM_SHADOW_STACK` directly (only
indirectly via the map_shadow_stack syscall or fork/clone). But if it is set
in the vma flags then it takes precedence.
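
For readers not following the rest of the series, the special case under
discussion has roughly this shape (a sketch following the pattern x86/arm64
use, not the literal code from this series):

pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	/* Shadow stack VMAs get the dedicated W=1,R=0 PTE encoding ... */
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	/* ... everything else gets a normal writable PTE. */
	return pte_mkwrite_novma(pte);
}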

>
>Wouldn't avoiding VM_READ also allow us to get rid of the ugly hack in
>pte_mkwrite?  (VM_WRITE would naturally select the right XWR flags.)

>
>> +		       VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
>> +	mmap_write_unlock(mm);
>> +
>> +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
>> +{
>> [...]
>> +	if (addr && (addr & (PAGE_SIZE - 1)))
>
>if (!PAGE_ALIGNED(addr))
Re: [PATCH v12 10/28] riscv/mm: Implement map_shadow_stack() syscall
Posted by Radim Krčmář 9 months, 3 weeks ago
2025-04-23T20:16:58-07:00, Deepak Gupta <debug@rivosinc.com>:
> On Thu, Apr 10, 2025 at 11:56:44AM +0200, Radim Krčmář wrote:
>>2025-03-14T14:39:29-07:00, Deepak Gupta <debug@rivosinc.com>:
>>> As discussed extensively in the changelog for the addition of this
>>> syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the
>>> existing mmap() and madvise() syscalls do not map entirely well onto the
>>> security requirements for shadow stack memory since they lead to windows
>>> where memory is allocated but not yet protected or stacks which are not
>>> properly and safely initialised. Instead a new syscall map_shadow_stack()
>>> has been defined which allocates and initialises a shadow stack page.
>>>
>>> This patch implements this syscall for riscv. riscv doesn't require token
>>> to be setup by kernel because user mode can do that by itself. However to
>>> provide compatibility and portability with other architectues, user mode
>>> can specify token set flag.
>>
>>RISC-V shadow stack could use mmap() and madvise() perfectly well.
>
> Deviating from what other arches are doing will create more thrash. I expect
> there will be merging of common logic between x86, arm64 and riscv. Infact I
> did post one such RFC patch set last year (didn't follow up on it). Using
> `mmap/madvise` defeats that purpose of creating common logic between arches.
>
> There are pitfalls as mentioned with respect to mmap/madivse because of
> unique nature of shadow stack. And thus it was accepted to create a new syscall
> to create such mappings. RISC-V will stick to that.

Ok.

>>> diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
>>> +static noinline unsigned long amo_user_shstk(unsigned long *addr, unsigned long val)
>>> +{
>>> +	/*
>>> +	 * Never expect -1 on shadow stack. Expect return addresses and zero
>>> +	 */
>>> +	unsigned long swap = -1;
>>> +	__enable_user_access();
>>> +	asm goto(
>>> +		".option push\n"
>>> +		".option arch, +zicfiss\n"
>>
>>Shouldn't compiler accept ssamoswap.d opcode even without zicfiss arch?
>
> Its illegal instruction if shadow stack aren't available. Current toolchain
> emits it only if zicfiss is specified in march.

Oof, I'll look into why arch is being used like that, thanks.

(I thought arch was only for compiler-generated code, so assembly
 mnemonics would always be accepted if the compiler knows them.)

>>
>>> +		".option pop\n"
>>> +		: [swap] "=r" (swap), [addr] "+A" (*addr)
>>> +		: [val] "r" (val)
>>> +		: "memory"
>>> +		: fault
>>> +		);
>>> +	__disable_user_access();
>>> +	return swap;
>>> +fault:
>>> +	__disable_user_access();
>>> +	return -1;
>>
>>I think we should return 0 and -EFAULT.
>>We can ignore the swapped value, or return it through a pointer.
>
> Consumer of this detects -1 and then return -EFAULT.
> We would eventually need this when creating shadow stack tokens for
> kernel shadow stack. I believe `-1` is safe return value which can't
> be construed as negative kernel address (-EFAULT will be)

I believe it as well, but I don't see a reason why we need to risk it
when we can return the stack value through a pointer and have a simple
success/failure return value.
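
A sketch of the calling convention being suggested (illustrative only, not
code from the series): amo_user_shstk() would return 0 / -EFAULT and hand the
old value back via a pointer argument, so the caller becomes:

static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
	unsigned long addr, old;
	int ret;

	if (!IS_ALIGNED(ssp, SHSTK_ENTRY_SIZE))
		return -EINVAL;

	addr = ssp - SHSTK_ENTRY_SIZE;

	ret = amo_user_shstk((unsigned long __user *)addr, (unsigned long)ssp, &old);
	if (ret)
		return ret;

	if (token_addr)
		*token_addr = addr;

	return 0;
}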

>>> +}
>>> +
>>> +static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size,
>>> +					   unsigned long token_offset, bool set_tok)
>>> +{
>>> +	int flags = MAP_ANONYMOUS | MAP_PRIVATE;
>>
>>Is MAP_GROWSDOWN pointless?
>
> Not sure. Didn't see that in x86 or arm64 shadow stack creation.
> Let me know if its useful.

It is for automatic growing of the stack.  I think that the default
stack is pointlessly large already, and other arches don't do it, so
we can probably follow their design here as well...

>>> +	struct mm_struct *mm = current->mm;
>>> +	unsigned long populate, tok_loc = 0;
>>> +
>>> +	if (addr)
>>> +		flags |= MAP_FIXED_NOREPLACE;
>>> +
>>> +	mmap_write_lock(mm);
>>> +	addr = do_mmap(NULL, addr, size, PROT_READ, flags,
>>
>>PROT_READ implies VM_READ, so won't this select PAGE_COPY in the
>>protection_map instead of PAGE_SHADOWSTACK?
>
> PROT_READ is pointless here and redundant. I haven't checked if I remove it
> what happens.
>
> `VM_SHADOW_STACK` takes precedence (take a look at pte_mkwrite and pmd_mkwrite.
> Only way `VM_SHADOW_STACK` is possible in vmflags is via `map_shadow_stack` or
> `fork/clone` on existing task with shadow stack enabled.
>
> In a nutshell user can't specify `VM_SHADOW_STACK` directly (indirectly via
> map_shadow_stack syscall or fork/clone) . But if set in vmaflags then it'll
> take precedence.

Yeah, I don't like that ugly special case at all, so I was hoping we
could somehow avoid it. :)
Re: [PATCH v12 10/28] riscv/mm: Implement map_shadow_stack() syscall
Posted by Zong Li 10 months, 1 week ago
On Sat, Mar 15, 2025 at 5:39 AM Deepak Gupta <debug@rivosinc.com> wrote:
>
> As discussed extensively in the changelog for the addition of this
> syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the
> existing mmap() and madvise() syscalls do not map entirely well onto the
> security requirements for shadow stack memory since they lead to windows
> where memory is allocated but not yet protected or stacks which are not
> properly and safely initialised. Instead a new syscall map_shadow_stack()
> has been defined which allocates and initialises a shadow stack page.
>
> This patch implements this syscall for riscv. riscv doesn't require token
> to be setup by kernel because user mode can do that by itself. However to
> provide compatibility and portability with other architectues, user mode
> can specify token set flag.
>
> Reviewed-by: Zong Li <zong.li@sifive.com>
> Signed-off-by: Deepak Gupta <debug@rivosinc.com>
> ---
>  arch/riscv/kernel/Makefile  |   1 +
>  arch/riscv/kernel/usercfi.c | 144 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 145 insertions(+)
>
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index 8d186bfced45..3a861d320654 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -125,3 +125,4 @@ obj-$(CONFIG_ACPI)          += acpi.o
>  obj-$(CONFIG_ACPI_NUMA)        += acpi_numa.o
>
>  obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += bugs.o
> +obj-$(CONFIG_RISCV_USER_CFI) += usercfi.o
> diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
> new file mode 100644
> index 000000000000..24022809a7b5
> --- /dev/null
> +++ b/arch/riscv/kernel/usercfi.c
> @@ -0,0 +1,144 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2024 Rivos, Inc.
> + * Deepak Gupta <debug@rivosinc.com>
> + */
> +
> +#include <linux/sched.h>
> +#include <linux/bitops.h>
> +#include <linux/types.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/uaccess.h>
> +#include <linux/sizes.h>
> +#include <linux/user.h>
> +#include <linux/syscalls.h>
> +#include <linux/prctl.h>
> +#include <asm/csr.h>
> +#include <asm/usercfi.h>
> +
> +#define SHSTK_ENTRY_SIZE sizeof(void *)
> +
> +/*
> + * Writes on shadow stack can either be `sspush` or `ssamoswap`. `sspush` can happen
> + * implicitly on current shadow stack pointed to by CSR_SSP. `ssamoswap` takes pointer to
> + * shadow stack. To keep it simple, we plan to use `ssamoswap` to perform writes on shadow
> + * stack.
> + */
> +static noinline unsigned long amo_user_shstk(unsigned long *addr, unsigned long val)
> +{
> +       /*
> +        * Never expect -1 on shadow stack. Expect return addresses and zero
> +        */
> +       unsigned long swap = -1;
> +
> +       __enable_user_access();
> +       asm goto(
> +               ".option push\n"
> +               ".option arch, +zicfiss\n"
> +               "1: ssamoswap.d %[swap], %[val], %[addr]\n"

Hi Deepak,
It just came to my mind, do we need to ensure that menvcfg.SSE is not
zero before executing the ssamoswap instruction? Since ssamoswap is
not encoded using MOP, I’m wondering if we should make sure that
executing ssamoswap won’t accidentally trigger an illegal instruction
exception. Thanks.

> +               _ASM_EXTABLE(1b, %l[fault])
> +               RISCV_ACQUIRE_BARRIER
> +               ".option pop\n"
> +               : [swap] "=r" (swap), [addr] "+A" (*addr)
> +               : [val] "r" (val)
> +               : "memory"
> +               : fault
> +               );
> +       __disable_user_access();
> +       return swap;
> +fault:
> +       __disable_user_access();
> +       return -1;
> +}
> +
> +/*
> + * Create a restore token on the shadow stack.  A token is always XLEN wide
> + * and aligned to XLEN.
> + */
> +static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
> +{
> +       unsigned long addr;
> +
> +       /* Token must be aligned */
> +       if (!IS_ALIGNED(ssp, SHSTK_ENTRY_SIZE))
> +               return -EINVAL;
> +
> +       /* On RISC-V we're constructing token to be function of address itself */
> +       addr = ssp - SHSTK_ENTRY_SIZE;
> +
> +       if (amo_user_shstk((unsigned long __user *)addr, (unsigned long)ssp) == -1)
> +               return -EFAULT;
> +
> +       if (token_addr)
> +               *token_addr = addr;
> +
> +       return 0;
> +}
> +
> +static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size,
> +                                          unsigned long token_offset, bool set_tok)
> +{
> +       int flags = MAP_ANONYMOUS | MAP_PRIVATE;
> +       struct mm_struct *mm = current->mm;
> +       unsigned long populate, tok_loc = 0;
> +
> +       if (addr)
> +               flags |= MAP_FIXED_NOREPLACE;
> +
> +       mmap_write_lock(mm);
> +       addr = do_mmap(NULL, addr, size, PROT_READ, flags,
> +                      VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
> +       mmap_write_unlock(mm);
> +
> +       if (!set_tok || IS_ERR_VALUE(addr))
> +               goto out;
> +
> +       if (create_rstor_token(addr + token_offset, &tok_loc)) {
> +               vm_munmap(addr, size);
> +               return -EINVAL;
> +       }
> +
> +       addr = tok_loc;
> +
> +out:
> +       return addr;
> +}
> +
> +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
> +{
> +       bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
> +       unsigned long aligned_size = 0;
> +
> +       if (!cpu_supports_shadow_stack())
> +               return -EOPNOTSUPP;
> +
> +       /* Anything other than set token should result in invalid param */
> +       if (flags & ~SHADOW_STACK_SET_TOKEN)
> +               return -EINVAL;
> +
> +       /*
> +        * Unlike other architectures, on RISC-V, SSP pointer is held in CSR_SSP and is available
> +        * CSR in all modes. CSR accesses are performed using 12bit index programmed in instruction
> +        * itself. This provides static property on register programming and writes to CSR can't
> +        * be unintentional from programmer's perspective. As long as programmer has guarded areas
> +        * which perform writes to CSR_SSP properly, shadow stack pivoting is not possible. Since
> +        * CSR_SSP is writeable by user mode, it itself can setup a shadow stack token subsequent
> +        * to allocation. Although in order to provide portablity with other architecture (because
> +        * `map_shadow_stack` is arch agnostic syscall), RISC-V will follow expectation of a token
> +        * flag in flags and if provided in flags, setup a token at the base.
> +        */
> +
> +       /* If there isn't space for a token */
> +       if (set_tok && size < SHSTK_ENTRY_SIZE)
> +               return -ENOSPC;
> +
> +       if (addr && (addr & (PAGE_SIZE - 1)))
> +               return -EINVAL;
> +
> +       aligned_size = PAGE_ALIGN(size);
> +       if (aligned_size < size)
> +               return -EOVERFLOW;
> +
> +       return allocate_shadow_stack(addr, aligned_size, size, set_tok);
> +}
>
> --
> 2.34.1
>
Re: [PATCH v12 10/28] riscv/mm: Implement map_shadow_stack() syscall
Posted by Deepak Gupta 10 months ago
On Mon, Apr 07, 2025 at 12:50:35PM +0800, Zong Li wrote:
>On Sat, Mar 15, 2025 at 5:39 AM Deepak Gupta <debug@rivosinc.com> wrote:
>>
>> As discussed extensively in the changelog for the addition of this
>> syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the
>> existing mmap() and madvise() syscalls do not map entirely well onto the
>> security requirements for shadow stack memory since they lead to windows
>> where memory is allocated but not yet protected or stacks which are not
>> properly and safely initialised. Instead a new syscall map_shadow_stack()
>> has been defined which allocates and initialises a shadow stack page.
>>
>> This patch implements this syscall for riscv. riscv doesn't require token
>> to be setup by kernel because user mode can do that by itself. However to
>> provide compatibility and portability with other architectues, user mode
>> can specify token set flag.
>>
>> Reviewed-by: Zong Li <zong.li@sifive.com>
>> Signed-off-by: Deepak Gupta <debug@rivosinc.com>
>> ---
>>  arch/riscv/kernel/Makefile  |   1 +
>>  arch/riscv/kernel/usercfi.c | 144 ++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 145 insertions(+)
>>
>> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
>> index 8d186bfced45..3a861d320654 100644
>> --- a/arch/riscv/kernel/Makefile
>> +++ b/arch/riscv/kernel/Makefile
>> @@ -125,3 +125,4 @@ obj-$(CONFIG_ACPI)          += acpi.o
>>  obj-$(CONFIG_ACPI_NUMA)        += acpi_numa.o
>>
>>  obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += bugs.o
>> +obj-$(CONFIG_RISCV_USER_CFI) += usercfi.o
>> diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
>> new file mode 100644
>> index 000000000000..24022809a7b5
>> --- /dev/null
>> +++ b/arch/riscv/kernel/usercfi.c
>> @@ -0,0 +1,144 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright (C) 2024 Rivos, Inc.
>> + * Deepak Gupta <debug@rivosinc.com>
>> + */
>> +
>> +#include <linux/sched.h>
>> +#include <linux/bitops.h>
>> +#include <linux/types.h>
>> +#include <linux/mm.h>
>> +#include <linux/mman.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/sizes.h>
>> +#include <linux/user.h>
>> +#include <linux/syscalls.h>
>> +#include <linux/prctl.h>
>> +#include <asm/csr.h>
>> +#include <asm/usercfi.h>
>> +
>> +#define SHSTK_ENTRY_SIZE sizeof(void *)
>> +
>> +/*
>> + * Writes on shadow stack can either be `sspush` or `ssamoswap`. `sspush` can happen
>> + * implicitly on current shadow stack pointed to by CSR_SSP. `ssamoswap` takes pointer to
>> + * shadow stack. To keep it simple, we plan to use `ssamoswap` to perform writes on shadow
>> + * stack.
>> + */
>> +static noinline unsigned long amo_user_shstk(unsigned long *addr, unsigned long val)
>> +{
>> +       /*
>> +        * Never expect -1 on shadow stack. Expect return addresses and zero
>> +        */
>> +       unsigned long swap = -1;
>> +
>> +       __enable_user_access();
>> +       asm goto(
>> +               ".option push\n"
>> +               ".option arch, +zicfiss\n"
>> +               "1: ssamoswap.d %[swap], %[val], %[addr]\n"
>
>Hi Deepak,
>It just came to my mind, do we need to ensure that menvcfg.SSE is not
>zero before executing the ssamoswap instruction? Since ssamoswap is
>not encoded using MOP, I’m wondering if we should make sure that
>executing ssamoswap won’t accidentally trigger an illegal instruction
>exception. Thanks.

The FWFT patches turn SSE on during early boot. There is a bug there though:
I need to check whether the FWFT SBI call succeeded or not. If it failed,
then it should set a global variable indicating that shadow stack can't be
turned on, and in that case this flow wouldn't be reachable. I will post
v13 with these changes soon.

Thanks for noticing.
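
A sketch of the guard described above; all identifiers and the FWFT plumbing
here are assumptions for illustration, not the actual v13 code:

static bool riscv_nousercfi __ro_after_init;

static int __init usercfi_fwft_init(void)
{
	/*
	 * enable_shstk_envcfg() stands in for whatever FWFT SBI call the
	 * series uses to turn on SSE; only the error handling matters here.
	 */
	if (enable_shstk_envcfg())
		riscv_nousercfi = true;	/* shadow stack can never be enabled */

	return 0;
}

bool cpu_supports_shadow_stack(void)
{
	return riscv_isa_extension_available(NULL, ZICFISS) && !riscv_nousercfi;
}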
>
>> +               _ASM_EXTABLE(1b, %l[fault])
>> +               RISCV_ACQUIRE_BARRIER
>> +               ".option pop\n"
>> +               : [swap] "=r" (swap), [addr] "+A" (*addr)
>> +               : [val] "r" (val)
>> +               : "memory"
>> +               : fault
>> +               );
>> +       __disable_user_access();
>> +       return swap;
>> +fault:
>> +       __disable_user_access();
>> +       return -1;
>> +}
>> +
>> +/*
>> + * Create a restore token on the shadow stack.  A token is always XLEN wide
>> + * and aligned to XLEN.
>> + */
>> +static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
>> +{
>> +       unsigned long addr;
>> +
>> +       /* Token must be aligned */
>> +       if (!IS_ALIGNED(ssp, SHSTK_ENTRY_SIZE))
>> +               return -EINVAL;
>> +
>> +       /* On RISC-V we're constructing token to be function of address itself */
>> +       addr = ssp - SHSTK_ENTRY_SIZE;
>> +
>> +       if (amo_user_shstk((unsigned long __user *)addr, (unsigned long)ssp) == -1)
>> +               return -EFAULT;
>> +
>> +       if (token_addr)
>> +               *token_addr = addr;
>> +
>> +       return 0;
>> +}
>> +
>> +static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size,
>> +                                          unsigned long token_offset, bool set_tok)
>> +{
>> +       int flags = MAP_ANONYMOUS | MAP_PRIVATE;
>> +       struct mm_struct *mm = current->mm;
>> +       unsigned long populate, tok_loc = 0;
>> +
>> +       if (addr)
>> +               flags |= MAP_FIXED_NOREPLACE;
>> +
>> +       mmap_write_lock(mm);
>> +       addr = do_mmap(NULL, addr, size, PROT_READ, flags,
>> +                      VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
>> +       mmap_write_unlock(mm);
>> +
>> +       if (!set_tok || IS_ERR_VALUE(addr))
>> +               goto out;
>> +
>> +       if (create_rstor_token(addr + token_offset, &tok_loc)) {
>> +               vm_munmap(addr, size);
>> +               return -EINVAL;
>> +       }
>> +
>> +       addr = tok_loc;
>> +
>> +out:
>> +       return addr;
>> +}
>> +
>> +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
>> +{
>> +       bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
>> +       unsigned long aligned_size = 0;
>> +
>> +       if (!cpu_supports_shadow_stack())
>> +               return -EOPNOTSUPP;
>> +
>> +       /* Anything other than set token should result in invalid param */
>> +       if (flags & ~SHADOW_STACK_SET_TOKEN)
>> +               return -EINVAL;
>> +
>> +       /*
>> +        * Unlike other architectures, on RISC-V, SSP pointer is held in CSR_SSP and is available
>> +        * CSR in all modes. CSR accesses are performed using 12bit index programmed in instruction
>> +        * itself. This provides static property on register programming and writes to CSR can't
>> +        * be unintentional from programmer's perspective. As long as programmer has guarded areas
>> +        * which perform writes to CSR_SSP properly, shadow stack pivoting is not possible. Since
>> +        * CSR_SSP is writeable by user mode, it itself can setup a shadow stack token subsequent
>> +        * to allocation. Although in order to provide portablity with other architecture (because
>> +        * `map_shadow_stack` is arch agnostic syscall), RISC-V will follow expectation of a token
>> +        * flag in flags and if provided in flags, setup a token at the base.
>> +        */
>> +
>> +       /* If there isn't space for a token */
>> +       if (set_tok && size < SHSTK_ENTRY_SIZE)
>> +               return -ENOSPC;
>> +
>> +       if (addr && (addr & (PAGE_SIZE - 1)))
>> +               return -EINVAL;
>> +
>> +       aligned_size = PAGE_ALIGN(size);
>> +       if (aligned_size < size)
>> +               return -EOVERFLOW;
>> +
>> +       return allocate_shadow_stack(addr, aligned_size, size, set_tok);
>> +}
>>
>> --
>> 2.34.1
>>