Hook up the generic vDSO implementation to the aarch64 vDSO data page.
The data required by _vdso_rng_data is placed within the _vdso_data vvar
page, using an offset larger than the vdso_data itself.

The vDSO function requires a ChaCha20 implementation that does not write
to the stack and that can perform an entire ChaCha20 permutation. The
one provided uses NEON for the permute operation, with a fallback to the
syscall for chips that do not support AdvSIMD.

This also passes the vdso_test_chacha test along with
vdso_test_getrandom. The vdso_test_getrandom bench-single result on
Neoverse-N1 shows:

vdso: 25000000 times in 0.783884250 seconds
libc: 25000000 times in 8.780275399 seconds
syscall: 25000000 times in 8.786581518 seconds

A small fixup to arch/arm64/include/asm/mman.h was required to avoid
pulling kernel code into the vDSO, similar to what's already done in
arch/arm64/include/asm/rwonce.h.
Signed-off-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
---
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/mman.h | 6 +-
arch/arm64/include/asm/vdso.h | 6 +
arch/arm64/include/asm/vdso/getrandom.h | 50 ++++++
arch/arm64/include/asm/vdso/vsyscall.h | 10 ++
arch/arm64/kernel/vdso.c | 6 -
arch/arm64/kernel/vdso/Makefile | 25 ++-
arch/arm64/kernel/vdso/vdso | 1 +
arch/arm64/kernel/vdso/vdso.lds.S | 4 +
arch/arm64/kernel/vdso/vgetrandom-chacha.S | 178 +++++++++++++++++++++
arch/arm64/kernel/vdso/vgetrandom.c | 15 ++
tools/arch/arm64/vdso | 1 +
tools/include/linux/compiler.h | 4 +
tools/testing/selftests/vDSO/Makefile | 3 +-
14 files changed, 294 insertions(+), 16 deletions(-)
create mode 100644 arch/arm64/include/asm/vdso/getrandom.h
create mode 120000 arch/arm64/kernel/vdso/vdso
create mode 100644 arch/arm64/kernel/vdso/vgetrandom-chacha.S
create mode 100644 arch/arm64/kernel/vdso/vgetrandom.c
create mode 120000 tools/arch/arm64/vdso
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2f8ff354ca6..7f7424d1b3b8 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -262,6 +262,7 @@ config ARM64
select TRACE_IRQFLAGS_NMI_SUPPORT
select HAVE_SOFTIRQ_ON_OWN_STACK
select USER_STACKTRACE_SUPPORT
+ select VDSO_GETRANDOM
help
ARM 64-bit (AArch64) Linux support.
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 5966ee4a6154..ceb4370a69c5 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -2,9 +2,11 @@
#ifndef __ASM_MMAN_H__
#define __ASM_MMAN_H__
+#include <uapi/asm/mman.h>
+
+#ifndef BUILD_VDSO
#include <linux/compiler.h>
#include <linux/types.h>
-#include <uapi/asm/mman.h>
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
unsigned long pkey __always_unused)
@@ -60,4 +62,6 @@ static inline bool arch_validate_flags(unsigned long vm_flags)
}
#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)
+#endif /* !BUILD_VDSO */
+
#endif /* ! __ASM_MMAN_H__ */
diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h
index 4305995c8f82..18407b757c95 100644
--- a/arch/arm64/include/asm/vdso.h
+++ b/arch/arm64/include/asm/vdso.h
@@ -16,6 +16,12 @@
#ifndef __ASSEMBLY__
+enum vvar_pages {
+ VVAR_DATA_PAGE_OFFSET,
+ VVAR_TIMENS_PAGE_OFFSET,
+ VVAR_NR_PAGES,
+};
+
#include <generated/vdso-offsets.h>
#define VDSO_SYMBOL(base, name) \
diff --git a/arch/arm64/include/asm/vdso/getrandom.h b/arch/arm64/include/asm/vdso/getrandom.h
new file mode 100644
index 000000000000..6cda64e6da83
--- /dev/null
+++ b/arch/arm64/include/asm/vdso/getrandom.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_VDSO_GETRANDOM_H
+#define __ASM_VDSO_GETRANDOM_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/vdso.h>
+#include <asm/unistd.h>
+#include <vdso/datapage.h>
+
+/**
+ * getrandom_syscall - Invoke the getrandom() syscall.
+ * @buffer: Destination buffer to fill with random bytes.
+ * @len: Size of @buffer in bytes.
+ * @flags: Zero or more GRND_* flags.
+ * Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
+ */
+static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags)
+{
+ register void *buffer asm ("x0") = _buffer;
+ register size_t len asm ("x1") = _len;
+ register unsigned int flags asm ("x2") = _flags;
+ register long ret asm ("x0");
+ register long nr asm ("x8") = __NR_getrandom;
+
+ asm volatile(
+ " svc #0\n"
+ : "=r" (ret)
+ : "r" (buffer), "r" (len), "r" (flags), "r" (nr)
+ : "memory");
+
+ return ret;
+}
+
+static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
+{
+ /*
+ * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace
+ * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_
+ * PAGE_OFFSET points to the real VVAR page.
+ */
+ if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
+ return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * (1UL << CONFIG_PAGE_SHIFT);
+ return &_vdso_rng_data;
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h
index f94b1457c117..b4241510dc36 100644
--- a/arch/arm64/include/asm/vdso/vsyscall.h
+++ b/arch/arm64/include/asm/vdso/vsyscall.h
@@ -2,8 +2,11 @@
#ifndef __ASM_VDSO_VSYSCALL_H
#define __ASM_VDSO_VSYSCALL_H
+#define __VDSO_RND_DATA_OFFSET 480
+
#ifndef __ASSEMBLY__
+#include <asm/vdso.h>
#include <linux/timekeeper_internal.h>
#include <vdso/datapage.h>
@@ -21,6 +24,13 @@ struct vdso_data *__arm64_get_k_vdso_data(void)
}
#define __arch_get_k_vdso_data __arm64_get_k_vdso_data
+static __always_inline
+struct vdso_rng_data *__arm64_get_k_vdso_rnd_data(void)
+{
+ return (void *)vdso_data + __VDSO_RND_DATA_OFFSET;
+}
+#define __arch_get_k_vdso_rng_data __arm64_get_k_vdso_rnd_data
+
static __always_inline
void __arm64_update_vsyscall(struct vdso_data *vdata, struct timekeeper *tk)
{
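For reference, the offset 480 does not appear to be arbitrary: it matches
CS_BASES * sizeof(struct vdso_data), so _vdso_rng_data starts immediately
after the vdso_data array at the head of the vvar page. A hypothetical
compile-time check of that assumption (not part of this patch; CS_BASES,
struct vdso_data and struct vdso_rng_data come from the generic vDSO
headers) could look like:

/* Hypothetical sanity checks, not in the patch: the RNG data must start
 * past the vdso_data array and must still fit within the vvar page. */
static_assert(__VDSO_RND_DATA_OFFSET >= CS_BASES * sizeof(struct vdso_data));
static_assert(__VDSO_RND_DATA_OFFSET +
	      sizeof(struct vdso_rng_data) <= PAGE_SIZE);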
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 89b6e7840002..706c9c3a7a50 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -34,12 +34,6 @@ enum vdso_abi {
VDSO_ABI_AA32,
};
-enum vvar_pages {
- VVAR_DATA_PAGE_OFFSET,
- VVAR_TIMENS_PAGE_OFFSET,
- VVAR_NR_PAGES,
-};
-
struct vdso_abi_info {
const char *name;
const char *vdso_code_start;
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index d11da6461278..35685c036044 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -9,7 +9,7 @@
# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
-obj-vdso := vgettimeofday.o note.o sigreturn.o
+obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o
# Build rules
targets := $(obj-vdso) vdso.so vdso.so.dbg
@@ -34,19 +34,28 @@ ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18
ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
# -Wmissing-prototypes and -Wmissing-declarations are removed from
-# the CFLAGS of vgettimeofday.c to make possible to build the
-# kernel with CONFIG_WERROR enabled.
-CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
- $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \
- $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
- -Wmissing-prototypes -Wmissing-declarations
+# the CFLAGS to make it possible to build the kernel with CONFIG_WERROR enabled.
+CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
+ $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \
+ $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
+ -Wmissing-prototypes -Wmissing-declarations
-CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables
+CC_FLAGS_ADD_VDSO := -O2 -mcmodel=tiny -fasynchronous-unwind-tables
+
+CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_REMOVE_VDSO)
+CFLAGS_REMOVE_vgetrandom.o = $(CC_FLAGS_REMOVE_VDSO)
+
+CFLAGS_vgettimeofday.o = $(CC_FLAGS_ADD_VDSO)
+CFLAGS_vgetrandom.o = $(CC_FLAGS_ADD_VDSO)
ifneq ($(c-gettimeofday-y),)
CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y)
endif
+ifneq ($(c-getrandom-y),)
+ CFLAGS_vgetrandom.o += -include $(c-getrandom-y)
+endif
+
targets += vdso.lds
CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
diff --git a/arch/arm64/kernel/vdso/vdso b/arch/arm64/kernel/vdso/vdso
new file mode 120000
index 000000000000..233c7a26f6e5
--- /dev/null
+++ b/arch/arm64/kernel/vdso/vdso
@@ -0,0 +1 @@
+../../../arch/arm64/kernel/vdso
\ No newline at end of file
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
index 45354f2ddf70..f204a9ddc833 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -11,7 +11,9 @@
#include <linux/const.h>
#include <asm/page.h>
#include <asm/vdso.h>
+#include <asm/vdso/vsyscall.h>
#include <asm-generic/vmlinux.lds.h>
+#include <vdso/datapage.h>
OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64")
OUTPUT_ARCH(aarch64)
@@ -19,6 +21,7 @@ OUTPUT_ARCH(aarch64)
SECTIONS
{
PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+ PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET);
#ifdef CONFIG_TIME_NS
PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
#endif
@@ -102,6 +105,7 @@ VERSION
__kernel_gettimeofday;
__kernel_clock_gettime;
__kernel_clock_getres;
+ __kernel_getrandom;
local: *;
};
}
diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..4e5f9c349522
--- /dev/null
+++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/linkage.h>
+#include <asm/cache.h>
+#include <asm/assembler.h>
+
+ .text
+
+#define state0 v0
+#define state1 v1
+#define state2 v2
+#define state3 v3
+#define copy0 v4
+#define copy0_q q4
+#define copy1 v5
+#define copy2 v6
+#define copy3 v7
+#define copy3_d d7
+#define one_d d16
+#define one_q q16
+#define one_v v16
+#define tmp v17
+#define rot8 v18
+
+/*
+ * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
+ * number of blocks of output with nonce 0, taking an input key and an
+ * 8-byte counter. Importantly, it does not spill to the stack.
+ *
+ * This implementation avoids d8-d15 because they are callee-save in user
+ * space.
+ *
+ * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
+ * const uint8_t *key,
+ * uint32_t *counter,
+ * size_t nblocks)
+ *
+ * x0: output bytes
+ * x1: 32-byte key input
+ * x2: 8-byte counter input/output
+ * x3: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+ /* copy0 = "expand 32-byte k" */
+ mov_q x8, 0x3320646e61707865
+ mov_q x9, 0x6b20657479622d32
+ mov copy0.d[0], x8
+ mov copy0.d[1], x9
+
+ /* copy1,copy2 = key */
+ ld1 { copy1.4s, copy2.4s }, [x1]
+ /* copy3 = counter || zero nonce */
+ ldr copy3_d, [x2]
+CPU_BE( rev64 copy3.4s, copy3.4s)
+
+ movi one_v.2s, #1
+ uzp1 one_v.4s, one_v.4s, one_v.4s
+
+.Lblock:
+ /* copy state to auxiliary vectors for the final add after the permute. */
+ mov state0.16b, copy0.16b
+ mov state1.16b, copy1.16b
+ mov state2.16b, copy2.16b
+ mov state3.16b, copy3.16b
+
+ mov w4, 20
+.Lpermute:
+ /*
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers state0-state3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ */
+
+.Ldoubleround:
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ add state0.4s, state0.4s, state1.4s
+ eor state3.16b, state3.16b, state0.16b
+ rev32 state3.8h, state3.8h
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ add state2.4s, state2.4s, state3.4s
+ eor tmp.16b, state1.16b, state2.16b
+ shl state1.4s, tmp.4s, #12
+ sri state1.4s, tmp.4s, #20
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ add state0.4s, state0.4s, state1.4s
+ eor tmp.16b, state3.16b, state0.16b
+ shl state3.4s, tmp.4s, #8
+ sri state3.4s, tmp.4s, #24
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ add state2.4s, state2.4s, state3.4s
+ eor tmp.16b, state1.16b, state2.16b
+ shl state1.4s, tmp.4s, #7
+ sri state1.4s, tmp.4s, #25
+
+ /* state1[0,1,2,3] = state1[1,2,3,0] */
+ ext state1.16b, state1.16b, state1.16b, #4
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ ext state2.16b, state2.16b, state2.16b, #8
+ /* state3[0,1,2,3] = state3[1,2,3,0] */
+ ext state3.16b, state3.16b, state3.16b, #12
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ add state0.4s, state0.4s, state1.4s
+ eor state3.16b, state3.16b, state0.16b
+ rev32 state3.8h, state3.8h
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ add state2.4s, state2.4s, state3.4s
+ eor tmp.16b, state1.16b, state2.16b
+ shl state1.4s, tmp.4s, #12
+ sri state1.4s, tmp.4s, #20
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ add state0.4s, state0.4s, state1.4s
+ eor tmp.16b, state3.16b, state0.16b
+ shl state3.4s, tmp.4s, #8
+ sri state3.4s, tmp.4s, #24
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ add state2.4s, state2.4s, state3.4s
+ eor tmp.16b, state1.16b, state2.16b
+ shl state1.4s, tmp.4s, #7
+ sri state1.4s, tmp.4s, #25
+
+ /* state1[0,1,2,3] = state1[3,0,1,2] */
+ ext state1.16b, state1.16b, state1.16b, #12
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ ext state2.16b, state2.16b, state2.16b, #8
+ /* state3[0,1,2,3] = state3[1,2,3,0] */
+ ext state3.16b, state3.16b, state3.16b, #4
+
+ subs w4, w4, #2
+ b.ne .Ldoubleround
+
+ /* output0 = state0 + copy0 */
+ add state0.4s, state0.4s, copy0.4s
+CPU_BE( rev32 state0.16b, state0.16b)
+ /* output1 = state1 + copy1 */
+ add state1.4s, state1.4s, copy1.4s
+CPU_BE( rev32 state1.16b, state1.16b)
+ /* output2 = state2 + copy2 */
+ add state2.4s, state2.4s, copy2.4s
+CPU_BE( rev32 state2.16b, state2.16b)
+ /* output3 = state3 + copy3 */
+ add state3.4s, state3.4s, copy3.4s
+CPU_BE( rev32 state3.16b, state3.16b)
+ st1 { state0.4s - state3.4s }, [x0]
+
+ /*
+ * ++copy3.counter; the 'add' clears the upper half of the SIMD register,
+ * which is the expected behaviour here.
+ */
+ add copy3_d, copy3_d, one_d
+
+ /* output += 64, --nblocks */
+ add x0, x0, 64
+ subs x3, x3, #1
+ b.ne .Lblock
+
+ /* counter = copy3.counter */
+CPU_BE( rev64 copy3.4s, copy3.4s)
+ str copy3_d, [x2]
+
+ /* Zero out the potentially sensitive regs, in case nothing uses these again. */
+ movi state0.16b, #0
+ movi state1.16b, #0
+ movi state2.16b, #0
+ movi state3.16b, #0
+ movi copy1.16b, #0
+ movi copy2.16b, #0
+ ret
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
+
+emit_aarch64_feature_1_and
diff --git a/arch/arm64/kernel/vdso/vgetrandom.c b/arch/arm64/kernel/vdso/vgetrandom.c
new file mode 100644
index 000000000000..832fe195292b
--- /dev/null
+++ b/arch/arm64/kernel/vdso/vgetrandom.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <uapi/asm-generic/errno.h>
+
+typeof(__cvdso_getrandom) __kernel_getrandom;
+
+ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
+{
+ if (alternative_has_cap_likely(ARM64_HAS_FPSIMD))
+ return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
+
+ if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags))
+ return -ENOSYS;
+ return getrandom_syscall(buffer, len, flags);
+}
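For context, a hedged sketch of how userspace is expected to drive this
entry point, following the generic vgetrandom ABI (the layout of struct
vgetrandom_opaque_params is assumed from the uapi headers; the vDSO symbol
lookup itself is elided, and getrandom_via_vdso is an illustrative name,
not part of this patch):

#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/types.h>

struct vgetrandom_opaque_params {
	uint32_t size_of_opaque_states;
	uint32_t mmap_prot;
	uint32_t mmap_flags;
	uint32_t reserved[13];
};

typedef ssize_t (*vgetrandom_fn)(void *, size_t, unsigned int, void *, size_t);

static ssize_t getrandom_via_vdso(vgetrandom_fn fn, void *buf, size_t len)
{
	static struct vgetrandom_opaque_params params;
	static void *state;

	if (!state) {
		/* opaque_len == ~0UL asks for the state allocation parameters. */
		if (fn(NULL, 0, 0, &params, ~0UL))
			return -1;	/* e.g. -ENOSYS above: use the syscall */
		state = mmap(NULL, params.size_of_opaque_states,
			     params.mmap_prot, params.mmap_flags, -1, 0);
		if (state == MAP_FAILED)
			return -1;
	}
	return fn(buf, len, 0, state, params.size_of_opaque_states);
}

Note how the probe doubles as a capability check: on cores without FPSIMD
the function above returns -ENOSYS for the probe, so a caller knows to stay
on the plain getrandom() syscall.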
diff --git a/tools/arch/arm64/vdso b/tools/arch/arm64/vdso
new file mode 120000
index 000000000000..233c7a26f6e5
--- /dev/null
+++ b/tools/arch/arm64/vdso
@@ -0,0 +1 @@
+../../../arch/arm64/kernel/vdso
\ No newline at end of file
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 6f7f22ac9da5..4366da278033 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -2,6 +2,8 @@
#ifndef _TOOLS_LINUX_COMPILER_H_
#define _TOOLS_LINUX_COMPILER_H_
+#ifndef __ASSEMBLY__
+
#include <linux/compiler_types.h>
#ifndef __compiletime_error
@@ -224,4 +226,6 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
__asm__ ("" : "=r" (var) : "0" (var))
#endif
+#endif /* __ASSEMBLY__ */
+
#endif /* _TOOLS_LINUX_COMPILER_H */
diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
index 04930125035e..3c6fafbd83a6 100644
--- a/tools/testing/selftests/vDSO/Makefile
+++ b/tools/testing/selftests/vDSO/Makefile
@@ -9,7 +9,7 @@ ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64))
TEST_GEN_PROGS += vdso_standalone_test_x86
endif
TEST_GEN_PROGS += vdso_test_correctness
-ifeq ($(ARCH)$(CONFIG_X86_32),$(filter $(ARCH)$(CONFIG_X86_32),x86 x86_64 loongarch))
+ifeq ($(ARCH)$(CONFIG_X86_32),$(filter $(ARCH)$(CONFIG_X86_32),x86 x86_64 loongarch arm64))
TEST_GEN_PROGS += vdso_test_getrandom
TEST_GEN_PROGS += vdso_test_chacha
endif
@@ -40,5 +40,6 @@ $(OUTPUT)/vdso_test_getrandom: CFLAGS += -isystem $(top_srcdir)/tools/include \
$(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(SRCARCH)/vdso/vgetrandom-chacha.S
$(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \
-idirafter $(top_srcdir)/arch/$(SRCARCH)/include \
+ -idirafter $(top_srcdir)/arch/$(SRCARCH)/include/generated \
-idirafter $(top_srcdir)/include \
-D__ASSEMBLY__ -Wa,--noexecstack
--
2.43.0
Hi Adhemerval,
I have just a couple more points below, on the BE handling in the asm.
On Mon, 2 Sept 2024 at 18:19, Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
...
> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..4e5f9c349522
> --- /dev/null
> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/linkage.h>
> +#include <asm/cache.h>
> +#include <asm/assembler.h>
> +
> + .text
> +
> +#define state0 v0
> +#define state1 v1
> +#define state2 v2
> +#define state3 v3
> +#define copy0 v4
> +#define copy0_q q4
> +#define copy1 v5
> +#define copy2 v6
> +#define copy3 v7
> +#define copy3_d d7
> +#define one_d d16
> +#define one_q q16
> +#define one_v v16
> +#define tmp v17
> +#define rot8 v18
> +
> +/*
> + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
> + * number of blocks of output with nonce 0, taking an input key and an
> + * 8-byte counter. Importantly, it does not spill to the stack.
> + *
> + * This implementation avoids d8-d15 because they are callee-save in user
> + * space.
> + *
> + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
> + * const uint8_t *key,
> + * uint32_t *counter,
> + * size_t nblocks)
> + *
> + * x0: output bytes
> + * x1: 32-byte key input
> + * x2: 8-byte counter input/output
> + * x3: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> + /* copy0 = "expand 32-byte k" */
> + mov_q x8, 0x3320646e61707865
> + mov_q x9, 0x6b20657479622d32
> + mov copy0.d[0], x8
> + mov copy0.d[1], x9
> +
> + /* copy1,copy2 = key */
> + ld1 { copy1.4s, copy2.4s }, [x1]
> + /* copy3 = counter || zero nonce */
> + ldr copy3_d, [x2]
> +CPU_BE( rev64 copy3.4s, copy3.4s)
> +
This loads 2 u32s as a single u64, and then swaps them if we are running on BE.
So better to just use
ld1 {copy3.2s}, [x2]
here, and drop the CPU_BE() special case.
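To spell out the lane semantics in C (an illustrative model only, with
made-up helper names, not kernel code):

#include <stdint.h>
#include <string.h>

/* ld1 {v.2s}: two element-wise u32 loads, so lane[i] is the u32 at
 * src + 4*i on either endianness. */
static void load_2s(uint32_t lane[2], const uint32_t *src)
{
	lane[0] = src[0];	/* the counter word, on LE and BE alike */
	lane[1] = src[1];
}

/* ldr d: one 64-bit load, with the register then viewed as two lanes. */
static void load_d_as_2s(uint32_t lane[2], const void *src)
{
	uint64_t d;

	memcpy(&d, src, sizeof(d));	/* native-endian 64-bit load */
	lane[0] = (uint32_t)d;		/* low 32 bits of the register */
	lane[1] = (uint32_t)(d >> 32);
	/*
	 * On LE, lane[0] == src[0]. On BE the four bytes at the lowest
	 * address land in the high half, so lane[0] == src[1]: the lanes
	 * come out swapped, which is what the rev64 was undoing.
	 */
}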
>
> ...
>
> + /* output0 = state0 + copy0 */
> + add state0.4s, state0.4s, copy0.4s
> +CPU_BE( rev32 state0.16b, state0.16b)
> + /* output1 = state1 + copy1 */
> + add state1.4s, state1.4s, copy1.4s
> +CPU_BE( rev32 state1.16b, state1.16b)
> + /* output2 = state2 + copy2 */
> + add state2.4s, state2.4s, copy2.4s
> +CPU_BE( rev32 state2.16b, state2.16b)
> + /* output3 = state3 + copy3 */
> + add state3.4s, state3.4s, copy3.4s
> +CPU_BE( rev32 state3.16b, state3.16b)
> + st1 { state0.4s - state3.4s }, [x0]
> +
If the u32s shouldn't be swabbed for BE, you should simply be able to do
st1 {state0.16b - state3.16b}, [x0]
here, and drop the CPU_BE(*).
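The underlying reason, modelled in C (store_word_le is an illustrative
name, not kernel code): the ChaCha20 output stream is defined as each
32-bit state word serialized little-endian, which is exactly what a
byte-wise lane store produces on either endianness:

#include <stdint.h>

/* What st1 {v.16b} effectively does for each 32-bit lane: store the
 * least significant byte first, i.e. the little-endian serialization
 * the ChaCha20 output requires, independent of CPU endianness. */
static void store_word_le(uint8_t *dst, uint32_t w)
{
	dst[0] = (uint8_t)w;
	dst[1] = (uint8_t)(w >> 8);
	dst[2] = (uint8_t)(w >> 16);
	dst[3] = (uint8_t)(w >> 24);
}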
> + /*
> + * ++copy3.counter; the 'add' clears the upper half of the SIMD register,
> + * which is the expected behaviour here.
> + */
> + add copy3_d, copy3_d, one_d
> +
> + /* output += 64, --nblocks */
> + add x0, x0, 64
> + subs x3, x3, #1
> + b.ne .Lblock
> +
> + /* counter = copy3.counter */
> +CPU_BE( rev64 copy3.4s, copy3.4s)
> + str copy3_d, [x2]
> +
... and this could be
st1 {copy3.2s}, [x2]
On 02/09/24 17:57, Ard Biesheuvel wrote:
> Hi Adhemerval,
>
> I have just a couple more points below, on the BE handling in the asm.
>
> On Mon, 2 Sept 2024 at 18:19, Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>> ...
> ...
>> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
>> new file mode 100644
>> index 000000000000..4e5f9c349522
>> --- /dev/null
>> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
>> @@ -0,0 +1,178 @@
>> ...
>>
>> + /* copy1,copy2 = key */
>> + ld1 { copy1.4s, copy2.4s }, [x1]
>> + /* copy3 = counter || zero nonce */
>> + ldr copy3_d, [x2]
>> +CPU_BE( rev64 copy3.4s, copy3.4s)
>> +
>
> This loads 2 u32s as a single u64, and then swaps them if we are running on BE.
> So better to just use
>
> ld1 {copy3.2s}, [x2]
>
> here, and drop the CPU_BE() special case.
Ack.
>
>> ...
>>
>> + /* output0 = state0 + copy0 */
>> + add state0.4s, state0.4s, copy0.4s
>> +CPU_BE( rev32 state0.16b, state0.16b)
>> + /* output1 = state1 + copy1 */
>> + add state1.4s, state1.4s, copy1.4s
>> +CPU_BE( rev32 state1.16b, state1.16b)
>> + /* output2 = state2 + copy2 */
>> + add state2.4s, state2.4s, copy2.4s
>> +CPU_BE( rev32 state2.16b, state2.16b)
>> + /* output3 = state3 + copy3 */
>> + add state3.4s, state3.4s, copy3.4s
>> +CPU_BE( rev32 state3.16b, state3.16b)
>> + st1 { state0.4s - state3.4s }, [x0]
>> +
>
> If the u32s shouldn't be swabbed for BE, you should simply be able to do
>
> st1 {state0.16b - state3.16b}, [x0]
>
> here, and drop the CPU_BE(*).
Ack.
>
>> + /*
>> + * ++copy3.counter; the 'add' clears the upper half of the SIMD register,
>> + * which is the expected behaviour here.
>> + */
>> + add copy3_d, copy3_d, one_d
>> +
>> + /* output += 64, --nblocks */
>> + add x0, x0, 64
>> + subs x3, x3, #1
>> + b.ne .Lblock
>> +
>> + /* counter = copy3.counter */
>> +CPU_BE( rev64 copy3.4s, copy3.4s)
>> + str copy3_d, [x2]
>> +
>
> ... and this could be
>
> st1 {copy3.2s}, [x2]
Ack.
I just sent a v5 with your proposed suggestions, thanks!