Hook up the generic vDSO implementation to the LoongArch vDSO data page
by providing the required __arch_chacha20_blocks_nostack,
__arch_get_k_vdso_rng_data, and getrandom_syscall implementations.
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
---
arch/loongarch/Kconfig | 1 +
arch/loongarch/include/asm/vdso/getrandom.h | 38 +++
arch/loongarch/include/asm/vdso/vdso.h | 6 +
arch/loongarch/include/asm/vdso/vsyscall.h | 8 +
arch/loongarch/kernel/vdso.c | 1 +
arch/loongarch/vdso/Makefile | 7 +-
arch/loongarch/vdso/vdso.lds.S | 1 +
arch/loongarch/vdso/vgetrandom-chacha.S | 242 ++++++++++++++++++++
arch/loongarch/vdso/vgetrandom.c | 10 +
9 files changed, 313 insertions(+), 1 deletion(-)
create mode 100644 arch/loongarch/include/asm/vdso/getrandom.h
create mode 100644 arch/loongarch/vdso/vgetrandom-chacha.S
create mode 100644 arch/loongarch/vdso/vgetrandom.c
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 70f169210b52..14821c2aba5b 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -190,6 +190,7 @@ config LOONGARCH
select TRACE_IRQFLAGS_SUPPORT
select USE_PERCPU_NUMA_NODE_ID
select USER_STACKTRACE_SUPPORT
+ select VDSO_GETRANDOM
select ZONE_DMA32
config 32BIT
diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h
new file mode 100644
index 000000000000..f2d17daec1e2
--- /dev/null
+++ b/arch/loongarch/include/asm/vdso/getrandom.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+#ifndef __ASM_VDSO_GETRANDOM_H
+#define __ASM_VDSO_GETRANDOM_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/unistd.h>
+#include <asm/vdso/vdso.h>
+
+static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags)
+{
+ register long ret asm("a0");
+ register long nr asm("a7") = __NR_getrandom;
+ register void *buffer asm("a0") = _buffer;
+ register size_t len asm("a1") = _len;
+ register unsigned int flags asm("a2") = _flags;
+
+ asm volatile(
+ " syscall 0\n"
+ : "+r" (ret)
+ : "r" (nr), "r" (buffer), "r" (len), "r" (flags)
+ : "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8",
+ "memory");
+
+ return ret;
+}
+
+static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
+{
+ return (const struct vdso_rng_data *)(get_vdso_data() + VVAR_LOONGARCH_PAGES_START * PAGE_SIZE + offsetof(struct loongarch_vdso_data, rng_data));
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/loongarch/include/asm/vdso/vdso.h b/arch/loongarch/include/asm/vdso/vdso.h
index 5a12309d9fb5..e31ac7474513 100644
--- a/arch/loongarch/include/asm/vdso/vdso.h
+++ b/arch/loongarch/include/asm/vdso/vdso.h
@@ -4,6 +4,9 @@
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#ifndef _ASM_VDSO_VDSO_H
+#define _ASM_VDSO_VDSO_H
+
#ifndef __ASSEMBLY__
#include <asm/asm.h>
@@ -16,6 +19,7 @@ struct vdso_pcpu_data {
struct loongarch_vdso_data {
struct vdso_pcpu_data pdata[NR_CPUS];
+ struct vdso_rng_data rng_data;
};
/*
@@ -63,3 +67,5 @@ static inline unsigned long get_vdso_data(void)
}
#endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/arch/loongarch/include/asm/vdso/vsyscall.h b/arch/loongarch/include/asm/vdso/vsyscall.h
index 5de615383a22..b1273ce6f140 100644
--- a/arch/loongarch/include/asm/vdso/vsyscall.h
+++ b/arch/loongarch/include/asm/vdso/vsyscall.h
@@ -8,6 +8,7 @@
#include <vdso/datapage.h>
extern struct vdso_data *vdso_data;
+extern struct vdso_rng_data *vdso_rng_data;
/*
* Update the vDSO data page to keep in sync with kernel timekeeping.
@@ -19,6 +20,13 @@ struct vdso_data *__loongarch_get_k_vdso_data(void)
}
#define __arch_get_k_vdso_data __loongarch_get_k_vdso_data
+static __always_inline
+struct vdso_rng_data *__loongarch_get_k_vdso_rng_data(void)
+{
+ return vdso_rng_data;
+}
+#define __arch_get_k_vdso_rng_data __loongarch_get_k_vdso_rng_data
+
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c
index 90dfccb41c14..f6fcc52aefae 100644
--- a/arch/loongarch/kernel/vdso.c
+++ b/arch/loongarch/kernel/vdso.c
@@ -37,6 +37,7 @@ static union {
static struct page *vdso_pages[] = { NULL };
struct vdso_data *vdso_data = generic_vdso_data.data;
struct vdso_pcpu_data *vdso_pdata = loongarch_vdso_data.vdata.pdata;
+struct vdso_rng_data *vdso_rng_data = &loongarch_vdso_data.vdata.rng_data;
static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
{
diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index d724d46b07c8..40c1175823d6 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -4,7 +4,8 @@
# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
-obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o sigreturn.o
+obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o vgetrandom.o \
+ vgetrandom-chacha.o sigreturn.o
# Common compiler flags between ABIs.
ccflags-vdso := \
@@ -29,6 +30,10 @@ ifneq ($(c-gettimeofday-y),)
CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y)
endif
+ifneq ($(c-getrandom-y),)
+ CFLAGS_vgetrandom.o += -include $(c-getrandom-y)
+endif
+
# VDSO linker flags.
ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \
$(filter -E%,$(KBUILD_CFLAGS)) -nostdlib -shared \
diff --git a/arch/loongarch/vdso/vdso.lds.S b/arch/loongarch/vdso/vdso.lds.S
index 56ad855896de..6b441bde4026 100644
--- a/arch/loongarch/vdso/vdso.lds.S
+++ b/arch/loongarch/vdso/vdso.lds.S
@@ -62,6 +62,7 @@ VERSION
__vdso_clock_getres;
__vdso_clock_gettime;
__vdso_gettimeofday;
+ __vdso_getrandom;
__vdso_rt_sigreturn;
local: *;
};
diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..7e86a50f6e85
--- /dev/null
+++ b/arch/loongarch/vdso/vgetrandom-chacha.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+
+#include <asm/asm.h>
+#include <asm/regdef.h>
+#include <linux/linkage.h>
+
+.text
+
+/* ChaCha quarter-round */
+.macro QR a b c d
+ add.w \a, \a, \b
+ xor \d, \d, \a
+ rotri.w \d, \d, 16
+
+ add.w \c, \c, \d
+ xor \b, \b, \c
+ rotri.w \b, \b, 20
+
+ add.w \a, \a, \b
+ xor \d, \d, \a
+ rotri.w \d, \d, 24
+
+ add.w \c, \c, \d
+ xor \b, \b, \c
+ rotri.w \b, \b, 25
+.endm
+
+/*
+ * Very basic LoongArch implementation of ChaCha20. Produces a given positive
+ * number of blocks of output with a nonce of 0, taking an input key and
+ * 8-byte counter. Importantly does not spill to the stack. Its arguments
+ * are:
+ *
+ * a0: output bytes
+ * a1: 32-byte key input
+ * a2: 8-byte counter input/output
+ * a3: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+/* We don't need a frame pointer */
+#define s9 fp
+
+#define output a0
+#define key a1
+#define counter a2
+#define nblocks a3
+#define i a4
+#define state0 s0
+#define state1 s1
+#define state2 s2
+#define state3 s3
+#define state4 s4
+#define state5 s5
+#define state6 s6
+#define state7 s7
+#define state8 s8
+#define state9 s9
+#define state10 a5
+#define state11 a6
+#define state12 a7
+#define state13 t0
+#define state14 t1
+#define state15 t2
+#define cnt_lo t3
+#define cnt_hi t4
+#define copy0 t5
+#define copy1 t6
+#define copy2 t7
+
+/* Reuse i as copy3 */
+#define copy3 i
+
+ /*
+ * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
+ * This does not violate the stack-less requirement: no sensitive data
+ * is spilled onto the stack.
+ */
+ PTR_ADDI sp, sp, (-SZREG * 10) & STACK_ALIGN
+ REG_S s0, sp, 0
+ REG_S s1, sp, SZREG
+ REG_S s2, sp, SZREG * 2
+ REG_S s3, sp, SZREG * 3
+ REG_S s4, sp, SZREG * 4
+ REG_S s5, sp, SZREG * 5
+ REG_S s6, sp, SZREG * 6
+ REG_S s7, sp, SZREG * 7
+ REG_S s8, sp, SZREG * 8
+ REG_S s9, sp, SZREG * 9
+
+ li.w copy0, 0x61707865
+ li.w copy1, 0x3320646e
+ li.w copy2, 0x79622d32
+
+ ld.w cnt_lo, counter, 0
+ ld.w cnt_hi, counter, 4
+
+.Lblock:
+ /* state[0,1,2,3] = "expand 32-byte k" */
+ move state0, copy0
+ move state1, copy1
+ move state2, copy2
+ li.w state3, 0x6b206574
+
+ /* state[4,5,..,11] = key */
+ ld.w state4, key, 0
+ ld.w state5, key, 4
+ ld.w state6, key, 8
+ ld.w state7, key, 12
+ ld.w state8, key, 16
+ ld.w state9, key, 20
+ ld.w state10, key, 24
+ ld.w state11, key, 28
+
+ /* state[12,13] = counter */
+ move state12, cnt_lo
+ move state13, cnt_hi
+
+ /* state[14,15] = 0 */
+ move state14, zero
+ move state15, zero
+
+ li.w i, 10
+.Lpermute:
+ /* odd round */
+ QR state0, state4, state8, state12
+ QR state1, state5, state9, state13
+ QR state2, state6, state10, state14
+ QR state3, state7, state11, state15
+
+ /* even round */
+ QR state0, state5, state10, state15
+ QR state1, state6, state11, state12
+ QR state2, state7, state8, state13
+ QR state3, state4, state9, state14
+
+ addi.w i, i, -1
+ bnez i, .Lpermute
+
+ /*
+ * copy[3] = "expa", materialize it here because copy[3] shares the
+ * same register with i which just became dead.
+ */
+ li.w copy3, 0x6b206574
+
+ /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
+ add.w state0, state0, copy0
+ add.w state1, state1, copy1
+ add.w state2, state2, copy2
+ add.w state3, state3, copy3
+ st.w state0, output, 0
+ st.w state1, output, 4
+ st.w state2, output, 8
+ st.w state3, output, 12
+
+ /* from now on state[0,1,2,3] are scratch registers */
+
+ /* state[0,1,2,3] = lo32(key) */
+ ld.w state0, key, 0
+ ld.w state1, key, 4
+ ld.w state2, key, 8
+ ld.w state3, key, 12
+
+ /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
+ add.w state4, state4, state0
+ add.w state5, state5, state1
+ add.w state6, state6, state2
+ add.w state7, state7, state3
+ st.w state4, output, 16
+ st.w state5, output, 20
+ st.w state6, output, 24
+ st.w state7, output, 28
+
+ /* state[0,1,2,3] = hi32(key) */
+ ld.w state0, key, 16
+ ld.w state1, key, 20
+ ld.w state2, key, 24
+ ld.w state3, key, 28
+
+ /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
+ add.w state8, state8, state0
+ add.w state9, state9, state1
+ add.w state10, state10, state2
+ add.w state11, state11, state3
+ st.w state8, output, 32
+ st.w state9, output, 36
+ st.w state10, output, 40
+ st.w state11, output, 44
+
+ /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
+ add.w state12, state12, cnt_lo
+ add.w state13, state13, cnt_hi
+ st.w state12, output, 48
+ st.w state13, output, 52
+ st.w state14, output, 56
+ st.w state15, output, 60
+
+ /* ++counter */
+ addi.w cnt_lo, cnt_lo, 1
+ sltui state0, cnt_lo, 1
+ add.w cnt_hi, cnt_hi, state0
+
+ /* output += 64 */
+ PTR_ADDI output, output, 64
+ /* --nblocks */
+ PTR_ADDI nblocks, nblocks, -1
+ bnez nblocks, .Lblock
+
+ /* counter = [cnt_lo, cnt_hi] */
+ st.w cnt_lo, counter, 0
+ st.w cnt_hi, counter, 4
+
+ /*
+ * Zero out the potentially sensitive regs, in case nothing uses these
+ * again. As of now copy[0,1,2,3] just contains "expand 32-byte k" and
+ * state[0,...,9] are in s0-s9, which we'll restore in the epilogue, so we
+ * only need to zero state[10,...,15].
+ */
+ move state10, zero
+ move state11, zero
+ move state12, zero
+ move state13, zero
+ move state14, zero
+ move state15, zero
+
+ REG_L s0, sp, 0
+ REG_L s1, sp, SZREG
+ REG_L s2, sp, SZREG * 2
+ REG_L s3, sp, SZREG * 3
+ REG_L s4, sp, SZREG * 4
+ REG_L s5, sp, SZREG * 5
+ REG_L s6, sp, SZREG * 6
+ REG_L s7, sp, SZREG * 7
+ REG_L s8, sp, SZREG * 8
+ REG_L s9, sp, SZREG * 9
+ PTR_ADDI sp, sp, -((-SZREG * 10) & STACK_ALIGN)
+
+ jr ra
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
diff --git a/arch/loongarch/vdso/vgetrandom.c b/arch/loongarch/vdso/vgetrandom.c
new file mode 100644
index 000000000000..d5f258ac4a36
--- /dev/null
+++ b/arch/loongarch/vdso/vgetrandom.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+#include <linux/types.h>
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
+{
+ return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
+}
--
2.46.0
Hi Xi, Le 01/09/2024 à 08:13, Xi Ruoyao a écrit : > Hook up the generic vDSO implementation to the LoongArch vDSO data page > by providing the required __arch_chacha20_blocks_nostack, > __arch_get_k_vdso_rng_data, and getrandom_syscall implementations. > > Signed-off-by: Xi Ruoyao <xry111@xry111.site> > --- ... > diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S > new file mode 100644 > index 000000000000..7e86a50f6e85 > --- /dev/null > +++ b/arch/loongarch/vdso/vgetrandom-chacha.S > @@ -0,0 +1,242 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. > + */ > + > +#include <asm/asm.h> > +#include <asm/regdef.h> > +#include <linux/linkage.h> > + > +.text > + > +/* Salsa20 quarter-round */ > +.macro QR a b c d > + add.w \a, \a, \b > + xor \d, \d, \a > + rotri.w \d, \d, 16 > + > + add.w \c, \c, \d > + xor \b, \b, \c > + rotri.w \b, \b, 20 > + > + add.w \a, \a, \b > + xor \d, \d, \a > + rotri.w \d, \d, 24 > + > + add.w \c, \c, \d > + xor \b, \b, \c > + rotri.w \b, \b, 25 > +.endm > + I know nothing about Loongarch assembly and execution performance, but I see that GCC groups operations by 4 when building reference_chacha20_blocks() from vdso_test_chacha, see below. Shouldn't you do the same and group ROUNDs by 4 just like I did on powerpc ? 
(https://github.com/torvalds/linux/blob/master/arch/powerpc/kernel/vdso/vgetrandom-chacha.S) 0000000000000134 <.L3>: 134: 001061d8 add.w $s1, $t2, $s1 138: 0015c312 xor $t6, $s1, $t4 13c: 26000070 ldptr.d $t4, $sp, 0 140: 001036d6 add.w $fp, $fp, $t1 144: 001065f9 add.w $s2, $t3, $s2 148: 0010335a add.w $s3, $s3, $t0 14c: 00159ad3 xor $t7, $fp, $a2 150: 0015c344 xor $a0, $s3, $t4 154: 0015c731 xor $t5, $s2, $t5 158: 004cc273 rotri.w $t7, $t7, 0x10 15c: 004cc252 rotri.w $t6, $t6, 0x10 160: 004cc231 rotri.w $t5, $t5, 0x10 164: 004cc084 rotri.w $a0, $a0, 0x10 168: 00104766 add.w $a2, $s4, $t5 16c: 00102088 add.w $a4, $a0, $a4 170: 00102669 add.w $a5, $t7, $a5 174: 001048e7 add.w $a3, $a3, $t6 178: 0015b530 xor $t4, $a5, $t1 17c: 0015b10c xor $t0, $a4, $t0 180: 0015b8ee xor $t2, $a3, $t2 184: 0015bccf xor $t3, $a2, $t3 188: 004cd18d rotri.w $t1, $t0, 0x14 18c: 004cd210 rotri.w $t4, $t4, 0x14 190: 004cd1ce rotri.w $t2, $t2, 0x14 194: 004cd1ef rotri.w $t3, $t3, 0x14 198: 001042d6 add.w $fp, $fp, $t4 19c: 00103b18 add.w $s1, $s1, $t2 1a0: 00103f39 add.w $s2, $s2, $t3 1a4: 0010375a add.w $s3, $s3, $t1 1a8: 0015ced3 xor $t7, $fp, $t7 1ac: 0015cb12 xor $t6, $s1, $t6 1b0: 0015c731 xor $t5, $s2, $t5 1b4: 00159344 xor $a0, $s3, $a0 1b8: 004ce274 rotri.w $t8, $t7, 0x18 1bc: 004ce084 rotri.w $a0, $a0, 0x18 1c0: 004ce253 rotri.w $t7, $t6, 0x18 1c4: 004ce232 rotri.w $t6, $t5, 0x18 1c8: 00105129 add.w $a5, $a5, $t8 1cc: 00101111 add.w $t5, $a4, $a0 1d0: 00104ce7 add.w $a3, $a3, $t7 1d4: 001048c6 add.w $a2, $a2, $t6 1d8: 0015c130 xor $t4, $a5, $t4 1dc: 0015b8ee xor $t2, $a3, $t2 1e0: 0015bccf xor $t3, $a2, $t3 1e4: 0015b62d xor $t1, $t5, $t1 1e8: 004ce610 rotri.w $t4, $t4, 0x19 1ec: 004ce5ce rotri.w $t2, $t2, 0x19 1f0: 004ce5ef rotri.w $t3, $t3, 0x19 1f4: 004ce5ad rotri.w $t1, $t1, 0x19 1f8: 00103ad6 add.w $fp, $fp, $t2 1fc: 00103f18 add.w $s1, $s1, $t3 200: 00103739 add.w $s2, $s2, $t1 204: 0010435a add.w $s3, $s3, $t4 208: 001592c4 xor $a0, $fp, $a0 20c: 0015d314 xor $t8, $s1, $t8 
210: 0015cf33 xor $t7, $s2, $t7 214: 0015cb52 xor $t6, $s3, $t6 218: 004cc084 rotri.w $a0, $a0, 0x10 21c: 004cc294 rotri.w $t8, $t8, 0x10 220: 004cc273 rotri.w $t7, $t7, 0x10 224: 004cc252 rotri.w $t6, $t6, 0x10 228: 001010dc add.w $s5, $a2, $a0 22c: 0010523d add.w $s6, $t5, $t8 230: 00104d3e add.w $s7, $a5, $t7 234: 001048ff add.w $s8, $a3, $t6 238: 0015c3ec xor $t0, $s8, $t4 23c: 0015bb8e xor $t2, $s5, $t2 240: 0015bfaf xor $t3, $s6, $t3 244: 0015b7cd xor $t1, $s7, $t1 248: 004cd1ad rotri.w $t1, $t1, 0x14 24c: 004cd18c rotri.w $t0, $t0, 0x14 250: 004cd1ce rotri.w $t2, $t2, 0x14 254: 004cd1ef rotri.w $t3, $t3, 0x14 258: 00103ad7 add.w $s0, $fp, $t2 25c: 00103f0a add.w $a6, $s1, $t3 260: 0010372b add.w $a7, $s2, $t1 264: 00103341 add.w $ra, $s3, $t0 268: 001592e4 xor $a0, $s0, $a0 26c: 0015d154 xor $t8, $a6, $t8 270: 0015cd73 xor $t7, $a7, $t7 274: 0015c832 xor $t6, $ra, $t6 278: 004ce084 rotri.w $a0, $a0, 0x18 27c: 004ce294 rotri.w $t8, $t8, 0x18 280: 004ce273 rotri.w $t7, $t7, 0x18 284: 004ce252 rotri.w $t6, $t6, 0x18 288: 0010139c add.w $s5, $s5, $a0 28c: 001053bd add.w $s6, $s6, $t8 290: 00104fde add.w $s7, $s7, $t7 294: 00104bff add.w $s8, $s8, $t6 298: 0015b7d1 xor $t5, $s7, $t1 29c: 0015bb8e xor $t2, $s5, $t2 2a0: 0015b3ed xor $t1, $s8, $t0 2a4: 0015bfaf xor $t3, $s6, $t3 2a8: 0040808c slli.w $t0, $a0, 0x0 2ac: 004ce631 rotri.w $t5, $t5, 0x19 2b0: 004ce5ce rotri.w $t2, $t2, 0x19 2b4: 004ce5ef rotri.w $t3, $t3, 0x19 2b8: 004ce5ad rotri.w $t1, $t1, 0x19 2bc: 2700006c stptr.d $t0, $sp, 0 2c0: 02bffca5 addi.w $a1, $a1, -1(0xfff) 2c4: 0040822c slli.w $t0, $t5, 0x0 2c8: 004082f6 slli.w $fp, $s0, 0x0 2cc: 0040839b slli.w $s4, $s5, 0x0 2d0: 004081ce slli.w $t2, $t2, 0x0 2d4: 00408158 slli.w $s1, $a6, 0x0 2d8: 00408286 slli.w $a2, $t8, 0x0 2dc: 004083a8 slli.w $a4, $s6, 0x0 2e0: 004081ef slli.w $t3, $t3, 0x0 2e4: 00408179 slli.w $s2, $a7, 0x0 2e8: 00408270 slli.w $t4, $t7, 0x0 2ec: 004083c9 slli.w $a5, $s7, 0x0 2f0: 0040803a slli.w $s3, $ra, 0x0 2f4: 00408251 slli.w 
$t5, $t6, 0x0 2f8: 004083e7 slli.w $a3, $s8, 0x0 2fc: 004081ad slli.w $t1, $t1, 0x0 300: 47fe34bf bnez $a1, -460(0x7ffe34) # 134 <.L3> Christophe
On Thu, 2024-09-19 at 09:08 +0200, Christophe Leroy wrote: > I know nothing about Loongarch assembly and execution performance, but I > see that GCC groups operations by 4 when building > reference_chacha20_blocks() from vdso_test_chacha, see below. > > Shouldn't you do the same and group ROUNDs by 4 just like I did on > powerpc ? > (https://github.com/torvalds/linux/blob/master/arch/powerpc/kernel/vdso/vgetrandom-chacha.S) Maybe. In theory the scheduling would improve the performance. I'll measure if the scheduling will make an observable performance improvement. -- Xi Ruoyao <xry111@xry111.site> School of Aerospace Science and Technology, Xidian University
On Sun, Sep 01, 2024 at 02:13:11PM +0800, Xi Ruoyao wrote: > Hook up the generic vDSO implementation to the LoongArch vDSO data page > by providing the required __arch_chacha20_blocks_nostack, > __arch_get_k_vdso_rng_data, and getrandom_syscall implementations. Applied, thanks! Congrats on being the first implementation after x86 to do this. Jason
© 2016 - 2026 Red Hat, Inc.