The assembly is based on the ARM NEON implementation and int.uc, but uses
RISC-V vector instructions to implement the RAID6 syndrome and
recovery calculations.

The functions have been tested on QEMU.
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
---
include/linux/raid/pq.h | 4 +
lib/raid6/Makefile | 3 +
lib/raid6/algos.c | 8 +
lib/raid6/recov_rvv.c | 229 +++++++++++++
lib/raid6/rvv.c | 715 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 959 insertions(+)
create mode 100644 lib/raid6/recov_rvv.c
create mode 100644 lib/raid6/rvv.c
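
As background for the asm below: every vsra/vsll/vand/vxor group implements
a per-byte multiply-by-2 in GF(2^8) with the 0x1d reduction constant, exactly
as in the int.uc pseudocode quoted in the code comments. A minimal scalar
sketch of that step (illustrative only, not part of the patch; assumes the
kernel's u8/s8 types from linux/types.h):

	/* Multiply one byte by 2 in GF(2^8) (polynomial 0x11d). */
	static inline u8 gf256_mul2(u8 wq)
	{
		u8 w2 = (u8)((s8)wq >> 7);	/* MASK(): 0xff if MSB set, else 0x00 */
		u8 w1 = wq << 1;		/* SHLBYTE(): shift the byte left by one */

		w2 &= 0x1d;			/* keep the reduction bits */
		return w1 ^ w2;			/* fold them in if the byte overflowed */
	}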
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 98030accf641..4c21f06c662a 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -108,6 +108,9 @@ extern const struct raid6_calls raid6_vpermxor4;
extern const struct raid6_calls raid6_vpermxor8;
extern const struct raid6_calls raid6_lsx;
extern const struct raid6_calls raid6_lasx;
+extern const struct raid6_calls raid6_rvvx1;
+extern const struct raid6_calls raid6_rvvx2;
+extern const struct raid6_calls raid6_rvvx4;
struct raid6_recov_calls {
void (*data2)(int, size_t, int, int, void **);
@@ -125,6 +128,7 @@ extern const struct raid6_recov_calls raid6_recov_s390xc;
extern const struct raid6_recov_calls raid6_recov_neon;
extern const struct raid6_recov_calls raid6_recov_lsx;
extern const struct raid6_recov_calls raid6_recov_lasx;
+extern const struct raid6_recov_calls raid6_recov_rvv;
extern const struct raid6_calls raid6_neonx1;
extern const struct raid6_calls raid6_neonx2;
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 29127dd05d63..e62fb7cd773e 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -10,6 +10,9 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
+raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
+CFLAGS_rvv.o += -march=rv64gcv
+CFLAGS_recov_rvv.o += -march=rv64gcv
hostprogs += mktables
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index cd2e88ee1f14..0a388a605131 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -80,6 +80,11 @@ const struct raid6_calls * const raid6_algos[] = {
#ifdef CONFIG_CPU_HAS_LSX
&raid6_lsx,
#endif
+#endif
+#ifdef CONFIG_RISCV_ISA_V
+ &raid6_rvvx1,
+ &raid6_rvvx2,
+ &raid6_rvvx4,
#endif
&raid6_intx8,
&raid6_intx4,
@@ -115,6 +120,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
#ifdef CONFIG_CPU_HAS_LSX
&raid6_recov_lsx,
#endif
+#endif
+#ifdef CONFIG_RISCV_ISA_V
+ &raid6_recov_rvv,
#endif
&raid6_recov_intx1,
NULL
diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
new file mode 100644
index 000000000000..8ae74803ea7f
--- /dev/null
+++ b/lib/raid6/recov_rvv.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Institute of Software, CAS.
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/simd.h>
+#include <linux/raid/pq.h>
+
+static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
+ u8 *dq, const u8 *pbmul,
+ const u8 *qmul)
+{
+ asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
+ "vsetvli x0, %[avl], e8, m1, ta, ma\n"
+ : :
+ [avl]"r"(16)
+ );
+
+ /*
+ * while ( bytes-- ) {
+ * uint8_t px, qx, db;
+ *
+ * px = *p ^ *dp;
+ * qx = qmul[*q ^ *dq];
+ * *dq++ = db = pbmul[px] ^ qx;
+ * *dp++ = db ^ px;
+ * p++; q++;
+ * }
+ */
+ while (bytes) {
+ /*
+ * v0:px, v1:dp,
+ * v2:qx, v3:dq,
+ * v4:vx, v5:vy,
+ * v6:qm0, v7:qm1,
+ * v8:pm0, v9:pm1,
+ * v14:p/qm[vx], v15:p/qm[vy]
+ */
+ asm volatile (
+ "vle8.v v0, (%[px])\n"
+ "vle8.v v1, (%[dp])\n"
+ "vxor.vv v0, v0, v1\n"
+ "vle8.v v2, (%[qx])\n"
+ "vle8.v v3, (%[dq])\n"
+ "vxor.vv v4, v2, v3\n"
+ "vsrl.vi v5, v4, 4\n"
+ "vand.vi v4, v4, 0xf\n"
+ "vle8.v v6, (%[qm0])\n"
+ "vle8.v v7, (%[qm1])\n"
+ "vrgather.vv v14, v6, v4\n" /* v14 = qm[vx] */
+ "vrgather.vv v15, v7, v5\n" /* v15 = qm[vy] */
+ "vxor.vv v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */
+
+ "vsrl.vi v5, v0, 4\n"
+ "vand.vi v4, v0, 0xf\n"
+ "vle8.v v8, (%[pm0])\n"
+ "vle8.v v9, (%[pm1])\n"
+ "vrgather.vv v14, v8, v4\n" /* v14 = pm[vx] */
+ "vrgather.vv v15, v9, v5\n" /* v15 = pm[vy] */
+ "vxor.vv v4, v14, v15\n" /* v4 = pbmul[px] */
+ "vxor.vv v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */
+ "vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */
+ "vse8.v v3, (%[dq])\n"
+ "vse8.v v1, (%[dp])\n"
+ : :
+ [px]"r"(p),
+ [dp]"r"(dp),
+ [qx]"r"(q),
+ [dq]"r"(dq),
+ [qm0]"r"(qmul),
+ [qm1]"r"(qmul + 16),
+ [pm0]"r"(pbmul),
+ [pm1]"r"(pbmul + 16)
+ :);
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dp += 16;
+ dq += 16;
+ }
+
+ asm volatile (".option pop\n");
+}
+
+static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+ const uint8_t *qmul)
+{
+ asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
+ "vsetvli x0, %[avl], e8, m1, ta, ma\n"
+ : :
+ [avl]"r"(16)
+ );
+
+ /*
+ * while (bytes--) {
+ * *p++ ^= *dq = qmul[*q ^ *dq];
+ * q++; dq++;
+ * }
+ */
+ while (bytes) {
+ /*
+ * v0:vx, v1:vy,
+ * v2:dq, v3:p,
+ * v4:qm0, v5:qm1,
+ * v10:m[vx], v11:m[vy]
+ */
+ asm volatile (
+ "vle8.v v0, (%[vx])\n"
+ "vle8.v v2, (%[dq])\n"
+ "vxor.vv v0, v0, v2\n"
+ "vsrl.vi v1, v0, 4\n"
+ "vand.vi v0, v0, 0xf\n"
+ "vle8.v v4, (%[qm0])\n"
+ "vle8.v v5, (%[qm1])\n"
+ "vrgather.vv v10, v4, v0\n"
+ "vrgather.vv v11, v5, v1\n"
+ "vxor.vv v0, v10, v11\n"
+ "vle8.v v1, (%[vy])\n"
+ "vxor.vv v1, v0, v1\n"
+ "vse8.v v0, (%[dq])\n"
+ "vse8.v v1, (%[vy])\n"
+ : :
+ [vx]"r"(q),
+ [vy]"r"(p),
+ [dq]"r"(dq),
+ [qm0]"r"(qmul),
+ [qm1]"r"(qmul + 16)
+ :);
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dq += 16;
+ }
+
+ asm volatile (".option pop\n");
+}
+
+
+static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks - 2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = (void *)raid6_empty_zero_page;
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks - 2] = p;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ if (crypto_simd_usable()) {
+ kernel_vector_begin();
+ __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
+ kernel_vector_end();
+ }
+}
+
+static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = (void *)raid6_empty_zero_page;
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ if (crypto_simd_usable()) {
+ kernel_vector_begin();
+ __raid6_datap_recov_rvv(bytes, p, q, dq, qmul);
+ kernel_vector_end();
+ }
+}
+
+const struct raid6_recov_calls raid6_recov_rvv = {
+ .data2 = raid6_2data_recov_rvv,
+ .datap = raid6_datap_recov_rvv,
+ .valid = NULL,
+ .name = "rvv",
+ .priority = 1,
+};
diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
new file mode 100644
index 000000000000..21f5432506da
--- /dev/null
+++ b/lib/raid6/rvv.c
@@ -0,0 +1,715 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID-6 syndrome calculation using RISCV vector instructions
+ *
+ * Copyright 2024 Institute of Software, CAS.
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ *
+ * Based on neon.uc:
+ * Copyright 2002-2004 H. Peter Anvin
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/simd.h>
+#include <linux/raid/pq.h>
+#include <linux/types.h>
+
+#define NSIZE (riscv_v_vsize / 32) /* NSIZE = vlenb */
+
+static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ int d, z, z0;
+ u8 *p, *q;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
+ "vsetvli t0, x0, e8, m1, ta, ma\n"
+ );
+
+ /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
+ for (d = 0 ; d < bytes ; d += NSIZE*1) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (
+ "vle8.v v0, (%[wp0])\n"
+ "vle8.v v1, (%[wp0])\n"
+ : :
+ [wp0]"r"(&dptr[z0][d+0*NSIZE])
+ );
+
+ for (z = z0-1 ; z >= 0 ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+ : :
+ [wd0]"r"(&dptr[z][d+0*NSIZE]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ */
+ asm volatile (
+ "vse8.v v0, (%[wp0])\n"
+ "vse8.v v1, (%[wq0])\n"
+ : :
+ [wp0]"r"(&p[d+NSIZE*0]),
+ [wq0]"r"(&q[d+NSIZE*0])
+ );
+ }
+
+ asm volatile (".option pop\n");
+}
+
+static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
+ "vsetvli t0, x0, e8, m1, ta, ma\n"
+ );
+
+ /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
+ for (d = 0 ; d < bytes ; d += NSIZE*1) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (
+ "vle8.v v0, (%[wp0])\n"
+ "vle8.v v1, (%[wp0])\n"
+ : :
+ [wp0]"r"(&dptr[z0][d+0*NSIZE])
+ );
+
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+ : :
+ [wd0]"r"(&dptr[z][d+0*NSIZE]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ */
+ asm volatile (
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+ : :
+ [wp0]"r"(&p[d+NSIZE*0]),
+ [wq0]"r"(&q[d+NSIZE*0])
+ );
+ }
+
+ asm volatile (".option pop\n");
+}
+
+static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ int d, z, z0;
+ u8 *p, *q;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
+ "vsetvli t0, x0, e8, m1, ta, ma\n"
+ );
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ */
+ for (d = 0 ; d < bytes ; d += NSIZE*2) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (
+ "vle8.v v0, (%[wp0])\n"
+ "vle8.v v1, (%[wp0])\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vle8.v v5, (%[wp1])\n"
+ : :
+ [wp0]"r"(&dptr[z0][d+0*NSIZE]),
+ [wp1]"r"(&dptr[z0][d+1*NSIZE])
+ );
+
+ for (z = z0-1 ; z >= 0 ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+ : :
+ [wd0]"r"(&dptr[z][d+0*NSIZE]),
+ [wd1]"r"(&dptr[z][d+1*NSIZE]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ */
+ asm volatile (
+ "vse8.v v0, (%[wp0])\n"
+ "vse8.v v1, (%[wq0])\n"
+ "vse8.v v4, (%[wp1])\n"
+ "vse8.v v5, (%[wq1])\n"
+ : :
+ [wp0]"r"(&p[d+NSIZE*0]),
+ [wq0]"r"(&q[d+NSIZE*0]),
+ [wp1]"r"(&p[d+NSIZE*1]),
+ [wq1]"r"(&q[d+NSIZE*1])
+ );
+ }
+
+ asm volatile (".option pop\n");
+}
+
+static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
+ "vsetvli t0, x0, e8, m1, ta, ma\n"
+ );
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ */
+ for (d = 0 ; d < bytes ; d += NSIZE*2) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (
+ "vle8.v v0, (%[wp0])\n"
+ "vle8.v v1, (%[wp0])\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vle8.v v5, (%[wp1])\n"
+ : :
+ [wp0]"r"(&dptr[z0][d+0*NSIZE]),
+ [wp1]"r"(&dptr[z0][d+1*NSIZE])
+ );
+
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+ : :
+ [wd0]"r"(&dptr[z][d+0*NSIZE]),
+ [wd1]"r"(&dptr[z][d+1*NSIZE]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v5, v7, v6\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ * v4:wp1, v5:wq1, v6:p1, v7:q1
+ */
+ asm volatile (
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+
+ "vle8.v v6, (%[wp1])\n"
+ "vle8.v v7, (%[wq1])\n"
+ "vxor.vv v6, v6, v4\n"
+ "vxor.vv v7, v7, v5\n"
+ "vse8.v v6, (%[wp1])\n"
+ "vse8.v v7, (%[wq1])\n"
+ : :
+ [wp0]"r"(&p[d+NSIZE*0]),
+ [wq0]"r"(&q[d+NSIZE*0]),
+ [wp1]"r"(&p[d+NSIZE*1]),
+ [wq1]"r"(&q[d+NSIZE*1])
+ );
+ }
+
+ asm volatile (".option pop\n");
+}
+
+static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ int d, z, z0;
+ u8 *p, *q;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
+ "vsetvli t0, x0, e8, m1, ta, ma\n"
+ );
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ */
+ for (d = 0 ; d < bytes ; d += NSIZE*4) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (
+ "vle8.v v0, (%[wp0])\n"
+ "vle8.v v1, (%[wp0])\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vle8.v v5, (%[wp1])\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vle8.v v9, (%[wp2])\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vle8.v v13, (%[wp3])\n"
+ : :
+ [wp0]"r"(&dptr[z0][d+0*NSIZE]),
+ [wp1]"r"(&dptr[z0][d+1*NSIZE]),
+ [wp2]"r"(&dptr[z0][d+2*NSIZE]),
+ [wp3]"r"(&dptr[z0][d+3*NSIZE])
+ );
+
+ for (z = z0-1 ; z >= 0 ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+ : :
+ [wd0]"r"(&dptr[z][d+0*NSIZE]),
+ [wd1]"r"(&dptr[z][d+1*NSIZE]),
+ [wd2]"r"(&dptr[z][d+2*NSIZE]),
+ [wd3]"r"(&dptr[z][d+3*NSIZE]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ */
+ asm volatile (
+ "vse8.v v0, (%[wp0])\n"
+ "vse8.v v1, (%[wq0])\n"
+ "vse8.v v4, (%[wp1])\n"
+ "vse8.v v5, (%[wq1])\n"
+ "vse8.v v8, (%[wp2])\n"
+ "vse8.v v9, (%[wq2])\n"
+ "vse8.v v12, (%[wp3])\n"
+ "vse8.v v13, (%[wq3])\n"
+ : :
+ [wp0]"r"(&p[d+NSIZE*0]),
+ [wq0]"r"(&q[d+NSIZE*0]),
+ [wp1]"r"(&p[d+NSIZE*1]),
+ [wq1]"r"(&q[d+NSIZE*1]),
+ [wp2]"r"(&p[d+NSIZE*2]),
+ [wq2]"r"(&q[d+NSIZE*2]),
+ [wp3]"r"(&p[d+NSIZE*3]),
+ [wq3]"r"(&q[d+NSIZE*3])
+ );
+ }
+
+ asm volatile (".option pop\n");
+}
+
+static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
+ "vsetvli t0, x0, e8, m1, ta, ma\n"
+ );
+
+ /*
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ */
+ for (d = 0 ; d < bytes ; d += NSIZE*4) {
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (
+ "vle8.v v0, (%[wp0])\n"
+ "vle8.v v1, (%[wp0])\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vle8.v v5, (%[wp1])\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vle8.v v9, (%[wp2])\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vle8.v v13, (%[wp3])\n"
+ : :
+ [wp0]"r"(&dptr[z0][d+0*NSIZE]),
+ [wp1]"r"(&dptr[z0][d+1*NSIZE]),
+ [wp2]"r"(&dptr[z0][d+2*NSIZE]),
+ [wp3]"r"(&dptr[z0][d+3*NSIZE])
+ );
+
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+ : :
+ [wd0]"r"(&dptr[z][d+0*NSIZE]),
+ [wd1]"r"(&dptr[z][d+1*NSIZE]),
+ [wd2]"r"(&dptr[z][d+2*NSIZE]),
+ [wd3]"r"(&dptr[z][d+3*NSIZE]),
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v5, v7, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v9, v11, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v13, v15, v14\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ * v4:wp1, v5:wq1, v6:p1, v7:q1
+ * v8:wp2, v9:wq2, v10:p2, v11:q2
+ * v12:wp3, v13:wq3, v14:p3, v15:q3
+ */
+ asm volatile (
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+
+ "vle8.v v6, (%[wp1])\n"
+ "vle8.v v7, (%[wq1])\n"
+ "vxor.vv v6, v6, v4\n"
+ "vxor.vv v7, v7, v5\n"
+ "vse8.v v6, (%[wp1])\n"
+ "vse8.v v7, (%[wq1])\n"
+
+ "vle8.v v10, (%[wp2])\n"
+ "vle8.v v11, (%[wq2])\n"
+ "vxor.vv v10, v10, v8\n"
+ "vxor.vv v11, v11, v9\n"
+ "vse8.v v10, (%[wp2])\n"
+ "vse8.v v11, (%[wq2])\n"
+
+ "vle8.v v14, (%[wp3])\n"
+ "vle8.v v15, (%[wq3])\n"
+ "vxor.vv v14, v14, v12\n"
+ "vxor.vv v15, v15, v13\n"
+ "vse8.v v14, (%[wp3])\n"
+ "vse8.v v15, (%[wq3])\n"
+ : :
+ [wp0]"r"(&p[d+NSIZE*0]),
+ [wq0]"r"(&q[d+NSIZE*0]),
+ [wp1]"r"(&p[d+NSIZE*1]),
+ [wq1]"r"(&q[d+NSIZE*1]),
+ [wp2]"r"(&p[d+NSIZE*2]),
+ [wq2]"r"(&q[d+NSIZE*2]),
+ [wp3]"r"(&p[d+NSIZE*3]),
+ [wq3]"r"(&q[d+NSIZE*3])
+ );
+ }
+
+ asm volatile (".option pop\n");
+}
+
+#define RAID6_RVV_WRAPPER(_n) \
+ static void raid6_rvv ## _n ## _gen_syndrome(int disks, \
+ size_t bytes, void **ptrs) \
+ { \
+ void raid6_rvv ## _n ## _gen_syndrome_real(int, \
+ unsigned long, void**); \
+ if (crypto_simd_usable()) { \
+ kernel_vector_begin(); \
+ raid6_rvv ## _n ## _gen_syndrome_real(disks, \
+ (unsigned long)bytes, ptrs); \
+ kernel_vector_end(); \
+ } \
+ } \
+ static void raid6_rvv ## _n ## _xor_syndrome(int disks, \
+ int start, int stop, \
+ size_t bytes, void **ptrs) \
+ { \
+ void raid6_rvv ## _n ## _xor_syndrome_real(int, \
+ int, int, unsigned long, void**); \
+ if (crypto_simd_usable()) { \
+ kernel_vector_begin(); \
+ raid6_rvv ## _n ## _xor_syndrome_real(disks, \
+ start, stop, (unsigned long)bytes, ptrs); \
+ kernel_vector_end(); \
+ } \
+ } \
+ struct raid6_calls const raid6_rvvx ## _n = { \
+ raid6_rvv ## _n ## _gen_syndrome, \
+ raid6_rvv ## _n ## _xor_syndrome, \
+ NULL, \
+ "rvvx" #_n, \
+ 0 \
+ }
+
+RAID6_RVV_WRAPPER(1);
+RAID6_RVV_WRAPPER(2);
+RAID6_RVV_WRAPPER(4);
--
2.34.1
On Fri, Dec 20, 2024 at 07:40:23PM +0800, Chunyan Zhang wrote:
> The assembly is originally based on the ARM NEON and int.uc, but uses
> RISC-V vector instructions to implement the RAID6 syndrome and
> recovery calculations.
>
> The functions are tested on QEMU.
>
> Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
[...]
> +static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
> + int failb, void **ptrs)
> +{
[...]
> + /* Now, pick the proper data tables */
> + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
> + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
> + raid6_gfexp[failb]]];
> +
> + if (crypto_simd_usable()) {
Shouldn't there be an alternate recovery mechanism for when vector is not
currently usable? I don't know in what case this function could be called
while crypto_simd_usable() returns false, but as written the recovery would
be silently skipped in that case.
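
For example, something like this (a sketch only; it assumes falling back to
the generic table raid6_recov_intx1 declared in pq.h is acceptable):

	if (crypto_simd_usable()) {
		kernel_vector_begin();
		__raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
		kernel_vector_end();
	} else {
		/*
		 * Hypothetical fallback: redo the whole recovery with the
		 * generic integer implementation. ptrs has already been
		 * restored above, so this is correct, though it repeats the
		 * gen_syndrome work (and gen_syndrome itself has the same
		 * silent-skip problem in this patch).
		 */
		raid6_recov_intx1.data2(disks, bytes, faila, failb, ptrs);
	}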
> + kernel_vector_begin();
> + __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
> + kernel_vector_end();
> + }
> +}
> +
> +static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
> + void **ptrs)
> +{
[...]
> + /* Now, pick the proper data tables */
> + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
> +
> + if (crypto_simd_usable()) {
Same here
> + kernel_vector_begin();
> + __raid6_datap_recov_rvv(bytes, p, q, dq, qmul);
> + kernel_vector_end();
> + }
> +}
> +
> +const struct raid6_recov_calls raid6_recov_rvv = {
> + .data2 = raid6_2data_recov_rvv,
> + .datap = raid6_datap_recov_rvv,
> + .valid = NULL,
These functions should only be called if vector is enabled, so this
.valid callback should call has_vector(). has_vector() returns a bool and
.valid expects an int, so you can wrap it in something like:

static int check_vector(void)
{
	return has_vector();
}

Just casting has_vector to int (*)(void) doesn't work; I get:

warning: cast between incompatible function types from ‘bool (*)(void)’ {aka ‘_Bool (*)(void)’} to ‘int (*)(void)’ [-Wcast-function-type]
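
Wiring it up would then look something like this (sketch, using the
check_vector() wrapper above on the struct already defined in this patch):

const struct raid6_recov_calls raid6_recov_rvv = {
	.data2 = raid6_2data_recov_rvv,
	.datap = raid6_datap_recov_rvv,
	.valid = check_vector,
	.name = "rvv",
	.priority = 1,
};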
> + .name = "rvv",
> + .priority = 1,
> +};
> diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
> new file mode 100644
> index 000000000000..21f5432506da
> --- /dev/null
> +++ b/lib/raid6/rvv.c
> @@ -0,0 +1,715 @@
[...]
> +#define RAID6_RVV_WRAPPER(_n) \
> + static void raid6_rvv ## _n ## _gen_syndrome(int disks, \
> + size_t bytes, void **ptrs) \
> + { \
> + void raid6_rvv ## _n ## _gen_syndrome_real(int, \
> + unsigned long, void**); \
> + if (crypto_simd_usable()) { \
Same note about crypto_simd_usable as above
> + kernel_vector_begin(); \
> + raid6_rvv ## _n ## _gen_syndrome_real(disks, \
> + (unsigned long)bytes, ptrs); \
> + kernel_vector_end(); \
> + } \
> + } \
> + static void raid6_rvv ## _n ## _xor_syndrome(int disks, \
> + int start, int stop, \
> + size_t bytes, void **ptrs) \
> + { \
> + void raid6_rvv ## _n ## _xor_syndrome_real(int, \
> + int, int, unsigned long, void**); \
> + if (crypto_simd_usable()) { \
... and here
> + kernel_vector_begin(); \
> + raid6_rvv ## _n ## _xor_syndrome_real(disks, \
> + start, stop, (unsigned long)bytes, ptrs); \
> + kernel_vector_end(); \
> + } \
> + } \
> + struct raid6_calls const raid6_rvvx ## _n = { \
> + raid6_rvv ## _n ## _gen_syndrome, \
> + raid6_rvv ## _n ## _xor_syndrome, \
> + NULL, \
Same note about calling has_vector here.
> + "rvvx" #_n, \
> + 0 \
> + }
> +
> +RAID6_RVV_WRAPPER(1);
> +RAID6_RVV_WRAPPER(2);
> +RAID6_RVV_WRAPPER(4);
> --
> 2.34.1
Some interesting results: on QEMU (vlen=256), these vectorized versions are
around 6x faster than the generic int versions on my machine (3366 vs. 600
MB/s for gen()). Vector in QEMU is not optimized, so I am surprised there is
this much speedup.
# modprobe raid6_pq
[ 36.238377] raid6: rvvx1 gen() 2668 MB/s
[ 36.306381] raid6: rvvx2 gen() 3097 MB/s
[ 36.374376] raid6: rvvx4 gen() 3366 MB/s
[ 36.442385] raid6: int64x8 gen() 548 MB/s
[ 36.510397] raid6: int64x4 gen() 600 MB/s
[ 36.578388] raid6: int64x2 gen() 585 MB/s
[ 36.646384] raid6: int64x1 gen() 518 MB/s
[ 36.646395] raid6: using algorithm rvvx4 gen() 3366 MB/s
[ 36.714377] raid6: .... xor() 1942 MB/s, rmw enabled
[ 36.714387] raid6: using rvv recovery algorithm
I also ran the raid6 test suite:
raid6test: complete (2429 tests, 0 failures)
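
(raid6test is the userspace harness under lib/raid6/test in the kernel
tree; if I remember right, building and running it is just the following,
assuming a native toolchain:

	cd lib/raid6/test && make
	./raid6test
)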
I am not familiar with this algorithm, but since it passed all of the
test cases and shows a remarkable speedup, this patch seems like a great
improvement.
As Jessica pointed out, please put the .option push/pop directives in the
same asm block as your vector instructions. While testing this code, I threw
together a patch for that which you can squash:
From 32117c0a5b2bbba7439af37e55631e0e38b63a7c Mon Sep 17 00:00:00 2001
From: Charlie Jenkins <charlie@rivosinc.com>
Date: Wed, 8 Jan 2025 14:32:26 -0800
Subject: [PATCH] Fixup vector options
Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
---
lib/raid6/Makefile | 2 --
lib/raid6/recov_rvv.c | 12 ++++---
lib/raid6/rvv.c | 81 ++++++++++++++++++++++++++++++++++++-------
3 files changed, 77 insertions(+), 18 deletions(-)
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index e62fb7cd773e..5be0a4e60ab1 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -11,8 +11,6 @@ raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o re
raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
-CFLAGS_rvv.o += -march=rv64gcv
-CFLAGS_recov_rvv.o += -march=rv64gcv
hostprogs += mktables
diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
index 8ae74803ea7f..02b97d885510 100644
--- a/lib/raid6/recov_rvv.c
+++ b/lib/raid6/recov_rvv.c
@@ -17,6 +17,7 @@ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
".option push\n"
".option arch,+v\n"
"vsetvli x0, %[avl], e8, m1, ta, ma\n"
+ ".option pop\n"
: :
[avl]"r"(16)
);
@@ -42,6 +43,8 @@ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
* v14:p/qm[vx], v15:p/qm[vy]
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v0, (%[px])\n"
"vle8.v v1, (%[dp])\n"
"vxor.vv v0, v0, v1\n"
@@ -67,6 +70,7 @@ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
"vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */
"vse8.v v3, (%[dq])\n"
"vse8.v v1, (%[dp])\n"
+ ".option pop\n"
: :
[px]"r"(p),
[dp]"r"(dp),
@@ -84,8 +88,6 @@ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
dp += 16;
dq += 16;
}
-
- asm volatile (".option pop\n");
}
static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
@@ -95,6 +97,7 @@ static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *
".option push\n"
".option arch,+v\n"
"vsetvli x0, %[avl], e8, m1, ta, ma\n"
+ ".option pop\n"
: :
[avl]"r"(16)
);
@@ -113,6 +116,8 @@ static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *
* v10:m[vx], v11:m[vy]
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v0, (%[vx])\n"
"vle8.v v2, (%[dq])\n"
"vxor.vv v0, v0, v2\n"
@@ -127,6 +132,7 @@ static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *
"vxor.vv v1, v0, v1\n"
"vse8.v v0, (%[dq])\n"
"vse8.v v1, (%[vy])\n"
+ ".option pop\n"
: :
[vx]"r"(q),
[vy]"r"(p),
@@ -140,8 +146,6 @@ static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *
q += 16;
dq += 16;
}
-
- asm volatile (".option pop\n");
}
diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
index 21f5432506da..81b38dcafeb6 100644
--- a/lib/raid6/rvv.c
+++ b/lib/raid6/rvv.c
@@ -31,14 +31,18 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
".option push\n"
".option arch,+v\n"
"vsetvli t0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
);
/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
for (d = 0 ; d < bytes ; d += NSIZE*1) {
/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
"vle8.v v1, (%[wp0])\n"
+ ".option pop\n"
: :
[wp0]"r"(&dptr[z0][d+0*NSIZE])
);
@@ -54,6 +58,8 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
* wp$$ ^= wd$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
@@ -61,6 +67,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
"vle8.v v2, (%[wd0])\n"
"vxor.vv v1, v3, v2\n"
"vxor.vv v0, v0, v2\n"
+ ".option pop\n"
: :
[wd0]"r"(&dptr[z][d+0*NSIZE]),
[x1d]"r"(0x1d)
@@ -72,15 +79,16 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
* *(unative_t *)&q[d+NSIZE*$$] = wq$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vse8.v v0, (%[wp0])\n"
"vse8.v v1, (%[wq0])\n"
+ ".option pop\n"
: :
[wp0]"r"(&p[d+NSIZE*0]),
[wq0]"r"(&q[d+NSIZE*0])
);
}
-
- asm volatile (".option pop\n");
}
static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
@@ -98,14 +106,18 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
".option push\n"
".option arch,+v\n"
"vsetvli t0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
);
/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
for (d = 0 ; d < bytes ; d += NSIZE*1) {
/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
"vle8.v v1, (%[wp0])\n"
+ ".option pop\n"
: :
[wp0]"r"(&dptr[z0][d+0*NSIZE])
);
@@ -122,6 +134,8 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
* wp$$ ^= wd$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
@@ -129,6 +143,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
"vle8.v v2, (%[wd0])\n"
"vxor.vv v1, v3, v2\n"
"vxor.vv v0, v0, v2\n"
+ ".option pop\n"
: :
[wd0]"r"(&dptr[z][d+0*NSIZE]),
[x1d]"r"(0x1d)
@@ -144,10 +159,13 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
* wq$$ = w1$$ ^ w2$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
"vxor.vv v1, v3, v2\n"
+ ".option pop\n"
: :
[x1d]"r"(0x1d)
);
@@ -159,19 +177,20 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
* v0:wp0, v1:wq0, v2:p0, v3:q0
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v2, (%[wp0])\n"
"vle8.v v3, (%[wq0])\n"
"vxor.vv v2, v2, v0\n"
"vxor.vv v3, v3, v1\n"
"vse8.v v2, (%[wp0])\n"
"vse8.v v3, (%[wq0])\n"
+ ".option pop\n"
: :
[wp0]"r"(&p[d+NSIZE*0]),
[wq0]"r"(&q[d+NSIZE*0])
);
}
-
- asm volatile (".option pop\n");
}
static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
@@ -188,6 +207,7 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
".option push\n"
".option arch,+v\n"
"vsetvli t0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
);
/*
@@ -197,10 +217,13 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
for (d = 0 ; d < bytes ; d += NSIZE*2) {
/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
"vle8.v v1, (%[wp0])\n"
"vle8.v v4, (%[wp1])\n"
"vle8.v v5, (%[wp1])\n"
+ ".option pop\n"
: :
[wp0]"r"(&dptr[z0][d+0*NSIZE]),
[wp1]"r"(&dptr[z0][d+1*NSIZE])
@@ -217,6 +240,8 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
* wp$$ ^= wd$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
@@ -232,6 +257,7 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
"vle8.v v6, (%[wd1])\n"
"vxor.vv v5, v7, v6\n"
"vxor.vv v4, v4, v6\n"
+ ".option pop\n"
: :
[wd0]"r"(&dptr[z][d+0*NSIZE]),
[wd1]"r"(&dptr[z][d+1*NSIZE]),
@@ -244,10 +270,13 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
* *(unative_t *)&q[d+NSIZE*$$] = wq$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vse8.v v0, (%[wp0])\n"
"vse8.v v1, (%[wq0])\n"
"vse8.v v4, (%[wp1])\n"
"vse8.v v5, (%[wq1])\n"
+ ".option pop\n"
: :
[wp0]"r"(&p[d+NSIZE*0]),
[wq0]"r"(&q[d+NSIZE*0]),
@@ -255,8 +284,6 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
[wq1]"r"(&q[d+NSIZE*1])
);
}
-
- asm volatile (".option pop\n");
}
static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
@@ -274,6 +301,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
".option push\n"
".option arch,+v\n"
"vsetvli t0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
);
/*
@@ -283,10 +311,13 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
for (d = 0 ; d < bytes ; d += NSIZE*2) {
/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
"vle8.v v1, (%[wp0])\n"
"vle8.v v4, (%[wp1])\n"
"vle8.v v5, (%[wp1])\n"
+ ".option pop\n"
: :
[wp0]"r"(&dptr[z0][d+0*NSIZE]),
[wp1]"r"(&dptr[z0][d+1*NSIZE])
@@ -304,6 +335,8 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
* wp$$ ^= wd$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
@@ -319,6 +352,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
"vle8.v v6, (%[wd1])\n"
"vxor.vv v5, v7, v6\n"
"vxor.vv v4, v4, v6\n"
+ ".option pop\n"
: :
[wd0]"r"(&dptr[z][d+0*NSIZE]),
[wd1]"r"(&dptr[z][d+1*NSIZE]),
@@ -335,6 +369,8 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
* wq$$ = w1$$ ^ w2$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
@@ -344,6 +380,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
"vsll.vi v7, v5, 1\n"
"vand.vx v6, v6, %[x1d]\n"
"vxor.vv v5, v7, v6\n"
+ ".option pop\n"
: :
[x1d]"r"(0x1d)
);
@@ -356,6 +393,8 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
* v4:wp1, v5:wq1, v6:p1, v7:q1
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v2, (%[wp0])\n"
"vle8.v v3, (%[wq0])\n"
"vxor.vv v2, v2, v0\n"
@@ -369,6 +408,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
"vxor.vv v7, v7, v5\n"
"vse8.v v6, (%[wp1])\n"
"vse8.v v7, (%[wq1])\n"
+ ".option pop\n"
: :
[wp0]"r"(&p[d+NSIZE*0]),
[wq0]"r"(&q[d+NSIZE*0]),
@@ -376,8 +416,6 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
[wq1]"r"(&q[d+NSIZE*1])
);
}
-
- asm volatile (".option pop\n");
}
static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
@@ -394,6 +432,7 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
".option push\n"
".option arch,+v\n"
"vsetvli t0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
);
/*
@@ -405,6 +444,8 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
for (d = 0 ; d < bytes ; d += NSIZE*4) {
/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
"vle8.v v1, (%[wp0])\n"
"vle8.v v4, (%[wp1])\n"
@@ -413,6 +454,7 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
"vle8.v v9, (%[wp2])\n"
"vle8.v v12, (%[wp3])\n"
"vle8.v v13, (%[wp3])\n"
+ ".option pop\n"
: :
[wp0]"r"(&dptr[z0][d+0*NSIZE]),
[wp1]"r"(&dptr[z0][d+1*NSIZE]),
@@ -431,6 +473,8 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
* wp$$ ^= wd$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
@@ -462,6 +506,7 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
"vle8.v v14, (%[wd3])\n"
"vxor.vv v13, v15, v14\n"
"vxor.vv v12, v12, v14\n"
+ ".option pop\n"
: :
[wd0]"r"(&dptr[z][d+0*NSIZE]),
[wd1]"r"(&dptr[z][d+1*NSIZE]),
@@ -476,6 +521,8 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
* *(unative_t *)&q[d+NSIZE*$$] = wq$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vse8.v v0, (%[wp0])\n"
"vse8.v v1, (%[wq0])\n"
"vse8.v v4, (%[wp1])\n"
@@ -484,6 +531,7 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
"vse8.v v9, (%[wq2])\n"
"vse8.v v12, (%[wp3])\n"
"vse8.v v13, (%[wq3])\n"
+ ".option pop\n"
: :
[wp0]"r"(&p[d+NSIZE*0]),
[wq0]"r"(&q[d+NSIZE*0]),
@@ -495,8 +543,6 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
[wq3]"r"(&q[d+NSIZE*3])
);
}
-
- asm volatile (".option pop\n");
}
static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
@@ -514,6 +560,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
".option push\n"
".option arch,+v\n"
"vsetvli t0, x0, e8, m1, ta, ma\n"
+ ".option pop\n"
);
/*
@@ -525,6 +572,8 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
for (d = 0 ; d < bytes ; d += NSIZE*4) {
/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v0, (%[wp0])\n"
"vle8.v v1, (%[wp0])\n"
"vle8.v v4, (%[wp1])\n"
@@ -533,6 +582,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
"vle8.v v9, (%[wp2])\n"
"vle8.v v12, (%[wp3])\n"
"vle8.v v13, (%[wp3])\n"
+ ".option pop\n"
: :
[wp0]"r"(&dptr[z0][d+0*NSIZE]),
[wp1]"r"(&dptr[z0][d+1*NSIZE]),
@@ -552,6 +602,8 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
* wp$$ ^= wd$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
@@ -583,6 +635,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
"vle8.v v14, (%[wd3])\n"
"vxor.vv v13, v15, v14\n"
"vxor.vv v12, v12, v14\n"
+ ".option pop\n"
: :
[wd0]"r"(&dptr[z][d+0*NSIZE]),
[wd1]"r"(&dptr[z][d+1*NSIZE]),
@@ -601,6 +654,8 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
* wq$$ = w1$$ ^ w2$$;
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vsra.vi v2, v1, 7\n"
"vsll.vi v3, v1, 1\n"
"vand.vx v2, v2, %[x1d]\n"
@@ -620,6 +675,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
"vsll.vi v15, v13, 1\n"
"vand.vx v14, v14, %[x1d]\n"
"vxor.vv v13, v15, v14\n"
+ ".option pop\n"
: :
[x1d]"r"(0x1d)
);
@@ -634,6 +690,8 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
* v12:wp3, v13:wq3, v14:p3, v15:q3
*/
asm volatile (
+ ".option push\n"
+ ".option arch,+v\n"
"vle8.v v2, (%[wp0])\n"
"vle8.v v3, (%[wq0])\n"
"vxor.vv v2, v2, v0\n"
@@ -661,6 +719,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
"vxor.vv v15, v15, v13\n"
"vse8.v v14, (%[wp3])\n"
"vse8.v v15, (%[wq3])\n"
+ ".option pop\n"
: :
[wp0]"r"(&p[d+NSIZE*0]),
[wq0]"r"(&q[d+NSIZE*0]),
@@ -672,8 +731,6 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
[wq3]"r"(&q[d+NSIZE*3])
);
}
-
- asm volatile (".option pop\n");
}
#define RAID6_RVV_WRAPPER(_n) \
--
2.34.1
- Charlie
Hi Charlie,
On Thu, 9 Jan 2025 at 07:45, Charlie Jenkins <charlie@rivosinc.com> wrote:
>
> On Fri, Dec 20, 2024 at 07:40:23PM +0800, Chunyan Zhang wrote:
[snip]
> > +static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
> > + int failb, void **ptrs)
> > +{
> > + u8 *p, *q, *dp, *dq;
> > + const u8 *pbmul; /* P multiplier table for B data */
> > + const u8 *qmul; /* Q multiplier table (for both) */
> > +
> > + p = (u8 *)ptrs[disks - 2];
> > + q = (u8 *)ptrs[disks - 1];
> > +
> > + /*
> > + * Compute syndrome with zero for the missing data pages
> > + * Use the dead data pages as temporary storage for
> > + * delta p and delta q
> > + */
> > + dp = (u8 *)ptrs[faila];
> > + ptrs[faila] = (void *)raid6_empty_zero_page;
> > + ptrs[disks - 2] = dp;
> > + dq = (u8 *)ptrs[failb];
> > + ptrs[failb] = (void *)raid6_empty_zero_page;
> > + ptrs[disks - 1] = dq;
> > +
> > + raid6_call.gen_syndrome(disks, bytes, ptrs);
> > +
> > + /* Restore pointer table */
> > + ptrs[faila] = dp;
> > + ptrs[failb] = dq;
> > + ptrs[disks - 2] = p;
> > + ptrs[disks - 1] = q;
> > +
> > + /* Now, pick the proper data tables */
> > + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
> > + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
> > + raid6_gfexp[failb]]];
> > +
> > + if (crypto_simd_usable()) {
>
> There should be an alternate recovery mechanism if it's not currently
> usable right? I don't know what case could happen when this function is
> called but crypto_simd_usable() returns false.
crypto_simd_usable() looks like it's not needed here.
The callers would call preempt_disable() before calling this function.
And I will add a .valid callback as you suggested below.
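With that check dropped, each call site reduces to the unconditional
sequence below (a sketch of the v2 shape, assuming the caller has
already disabled preemption as described above):

        kernel_vector_begin();
        __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
        kernel_vector_end();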
>
> > + kernel_vector_begin();
> > + __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
> > + kernel_vector_end();
> > + }
> > +}
> > +
> > +static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
> > + void **ptrs)
> > +{
> > + u8 *p, *q, *dq;
> > + const u8 *qmul; /* Q multiplier table */
> > +
> > + p = (u8 *)ptrs[disks - 2];
> > + q = (u8 *)ptrs[disks - 1];
> > +
> > + /*
> > + * Compute syndrome with zero for the missing data page
> > + * Use the dead data page as temporary storage for delta q
> > + */
> > + dq = (u8 *)ptrs[faila];
> > + ptrs[faila] = (void *)raid6_empty_zero_page;
> > + ptrs[disks - 1] = dq;
> > +
> > + raid6_call.gen_syndrome(disks, bytes, ptrs);
> > +
> > + /* Restore pointer table */
> > + ptrs[faila] = dq;
> > + ptrs[disks - 1] = q;
> > +
> > + /* Now, pick the proper data tables */
> > + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
> > +
> > + if (crypto_simd_usable()) {
>
> Same here
>
> > + kernel_vector_begin();
> > + __raid6_datap_recov_rvv(bytes, p, q, dq, qmul);
> > + kernel_vector_end();
> > + }
> > +}
> > +
> > +const struct raid6_recov_calls raid6_recov_rvv = {
> > + .data2 = raid6_2data_recov_rvv,
> > + .datap = raid6_datap_recov_rvv,
> > + .valid = NULL,
>
> These functions should only be called if vector is enabled, so this
> valid bit should call has_vector(). has_vector() returns a bool and
> valid expects an int so you can wrap it in something like:
Ok, will add.
>
> static int check_vector(void)
> {
> return has_vector();
> }
>
> Just casting has_vector to int (*)(void) doesn't work, I get:
>
> warning: cast between incompatible function types from ‘bool (*)(void)’ {aka ‘_Bool (*)(void)’} to ‘int (*)(void)’ [-Wcast-function-type]
>
>
[snip]
> > +#define RAID6_RVV_WRAPPER(_n) \
> > + static void raid6_rvv ## _n ## _gen_syndrome(int disks, \
> > + size_t bytes, void **ptrs) \
> > + { \
> > + void raid6_rvv ## _n ## _gen_syndrome_real(int, \
> > + unsigned long, void**); \
> > + if (crypto_simd_usable()) { \
>
> Same note about crypto_simd_usable as above
Ok.
>
> > + kernel_vector_begin(); \
> > + raid6_rvv ## _n ## _gen_syndrome_real(disks, \
> > + (unsigned long)bytes, ptrs); \
> > + kernel_vector_end(); \
> > + } \
> > + } \
> > + static void raid6_rvv ## _n ## _xor_syndrome(int disks, \
> > + int start, int stop, \
> > + size_t bytes, void **ptrs) \
> > + { \
> > + void raid6_rvv ## _n ## _xor_syndrome_real(int, \
> > + int, int, unsigned long, void**); \
> > + if (crypto_simd_usable()) { \
>
> ... and here
Ok.
>
> > + kernel_vector_begin(); \
> > + raid6_rvv ## _n ## _xor_syndrome_real(disks, \
> > + start, stop, (unsigned long)bytes, ptrs); \
> > + kernel_vector_end(); \
> > + } \
> > + } \
> > + struct raid6_calls const raid6_rvvx ## _n = { \
> > + raid6_rvv ## _n ## _gen_syndrome, \
> > + raid6_rvv ## _n ## _xor_syndrome, \
> > + NULL, \
>
> Same note about calling has_vector here.
Yes.
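Putting the two notes together, a sketch of the wiring (the wrapper
name is illustrative, and the same wrapper can fill the .valid slot of
raid6_recov_rvv as well), shown for the _n=1 expansion of the macro:

        static int rvv_has_vector(void)
        {
                /* has_vector() returns bool; the valid slot wants int (*)(void) */
                return has_vector();
        }

        struct raid6_calls const raid6_rvvx1 = {
                raid6_rvv1_gen_syndrome,
                raid6_rvv1_xor_syndrome,
                rvv_has_vector,         /* was NULL */
                "rvvx1",
                0
        };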
>
> > + "rvvx" #_n, \
> > + 0 \
> > + }
> > +
> > +RAID6_RVV_WRAPPER(1);
> > +RAID6_RVV_WRAPPER(2);
> > +RAID6_RVV_WRAPPER(4);
> > --
> > 2.34.1
> >
> >
> > _______________________________________________
> > linux-riscv mailing list
> > linux-riscv@lists.infradead.org
> > http://lists.infradead.org/mailman/listinfo/linux-riscv
>
> Some interesting results: on QEMU (vlen=256) these vectorized versions
> are around 6x faster on my CPU. Vector in QEMU is not optimized so I am
> surprised that there is this much speedup.
Which version of QEMU did you use? What options were used when running QEMU?
I want to try on my side, since this test didn't run as fast as your
result on my QEMU (I'm using v9.0.0, vlen=128).
>
> # modprobe raid6_pq
> [ 36.238377] raid6: rvvx1 gen() 2668 MB/s
> [ 36.306381] raid6: rvvx2 gen() 3097 MB/s
> [ 36.374376] raid6: rvvx4 gen() 3366 MB/s
> [ 36.442385] raid6: int64x8 gen() 548 MB/s
> [ 36.510397] raid6: int64x4 gen() 600 MB/s
> [ 36.578388] raid6: int64x2 gen() 585 MB/s
> [ 36.646384] raid6: int64x1 gen() 518 MB/s
> [ 36.646395] raid6: using algorithm rvvx4 gen() 3366 MB/s
> [ 36.714377] raid6: .... xor() 1942 MB/s, rmw enabled
> [ 36.714387] raid6: using rvv recovery algorithm
>
> I also ran the raid6tests:
>
> raid6test: complete (2429 tests, 0 failures)
>
> I am not familiar with this algorithm, but since it passed all of the
> test cases and shows a remarkable speedup, this patch seems like a great
> improvement.
>
> As Jessica pointed out, please put the vector pop/push in the same block
> as your vector instructions. While testing this code, I threw together a
> patch for this that you can squash:
Ok, will do.
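(One reason this matters: ".option" directives are assembler state, so
with the push in one asm statement and the pop in a later one, anything
the compiler emits in between is also assembled with +v in effect.
Keeping the pair inside the block makes each asm statement
self-contained, roughly:

        asm volatile (".option push\n"
                      ".option arch,+v\n"
                      "vle8.v v0, (%[wp0])\n"   /* the insn that needs +v */
                      ".option pop\n"
                      : : [wp0]"r"(ptr));

with the operand and register purely illustrative.)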
>
> From 32117c0a5b2bbba7439af37e55631e0e38b63a7c Mon Sep 17 00:00:00 2001
> From: Charlie Jenkins <charlie@rivosinc.com>
> Date: Wed, 8 Jan 2025 14:32:26 -0800
> Subject: [PATCH] Fixup vector options
>
> Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Thanks for the test, review and the patch.
Chunyan
[snip]
On Mon, Jan 20, 2025 at 11:33:52AM +0800, Chunyan Zhang wrote:
> Hi Charlie,
>
> On Thu, 9 Jan 2025 at 07:45, Charlie Jenkins <charlie@rivosinc.com> wrote:
> >
> > On Fri, Dec 20, 2024 at 07:40:23PM +0800, Chunyan Zhang wrote:
> [snip]
> > Some interesting results: on QEMU (vlen=256) these vectorized versions
> > are around 6x faster on my CPU. Vector in QEMU is not optimized so I am
> > surprised that there is this much speedup.
>
> Which version of QEMU did you use? What options were used when running QEMU?
>
> I want to try on my side, since this test didn't run as fast as your
> result on my QEMU (I'm using v9.0.0, vlen=128).
Oh interesting, I had the "icount" plugin enabled and that apparently
really messes up this performance counting.
My original test was on QEMU 9.1.0.
I upgraded my QEMU to v9.2.0 and tested again.
With the option "-icount shift=0" I get:
[ 0.128304] raid6: rvvx1 gen() 2671 MB/s
[ 0.196358] raid6: rvvx2 gen() 3101 MB/s
[ 0.264410] raid6: rvvx4 gen() 3370 MB/s
[ 0.332488] raid6: int64x8 gen() 548 MB/s
[ 0.400552] raid6: int64x4 gen() 601 MB/s
[ 0.468600] raid6: int64x2 gen() 585 MB/s
[ 0.536661] raid6: int64x1 gen() 519 MB/s
[ 0.536673] raid6: using algorithm rvvx4 gen() 3370 MB/s
[ 0.604688] raid6: .... xor() 1944 MB/s, rmw enabled
Without it I get:
[ 0.366142] raid6: rvvx1 gen() 712 MB/s
[ 0.440205] raid6: rvvx2 gen() 733 MB/s
[ 0.508751] raid6: rvvx4 gen() 739 MB/s
[ 0.577269] raid6: int64x8 gen() 1475 MB/s
[ 0.645781] raid6: int64x4 gen() 2164 MB/s
[ 0.714363] raid6: int64x2 gen() 1149 MB/s
[ 0.782837] raid6: int64x1 gen() 1709 MB/s
[ 0.782986] raid6: using algorithm int64x4 gen() 2164 MB/s
[ 0.851910] raid6: .... xor() 1131 MB/s, rmw enabled
I will need to keep that in mind when comparing vector performance!
Seems like this option does something weird.
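(For reference, an invocation exercising this path would look roughly
like the following, with the machine and memory options illustrative;
vlen and -icount are the knobs that changed between the runs above:

        qemu-system-riscv64 -machine virt -cpu rv64,v=true,vlen=256 \
                -smp 2 -m 2G -nographic -kernel Image -icount shift=0

Omitting "-icount shift=0" corresponds to the second set of numbers.)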
- Charlie
>
> >
> > # modprobe raid6_pq
> > [ 36.238377] raid6: rvvx1 gen() 2668 MB/s
> > [ 36.306381] raid6: rvvx2 gen() 3097 MB/s
> > [ 36.374376] raid6: rvvx4 gen() 3366 MB/s
> > [ 36.442385] raid6: int64x8 gen() 548 MB/s
> > [ 36.510397] raid6: int64x4 gen() 600 MB/s
> > [ 36.578388] raid6: int64x2 gen() 585 MB/s
> > [ 36.646384] raid6: int64x1 gen() 518 MB/s
> > [ 36.646395] raid6: using algorithm rvvx4 gen() 3366 MB/s
> > [ 36.714377] raid6: .... xor() 1942 MB/s, rmw enabled
> > [ 36.714387] raid6: using rvv recovery algorithm
> >
> > I also ran the raid6tests:
> >
> > raid6test: complete (2429 tests, 0 failures)
> >
> > I am not familiar with this algorithm, but since it passed all of the
> > test cases and shows a remarkable speedup, this patch seems like a great
> > improvement.
> >
> > As Jessica pointed out, please put the vector pop/push in the same block
> > as your vector instructions. While testing this code, I threw together a
> > patch for this that you can squash:
>
> Ok, will do.
>
> >
> > From 32117c0a5b2bbba7439af37e55631e0e38b63a7c Mon Sep 17 00:00:00 2001
> > From: Charlie Jenkins <charlie@rivosinc.com>
> > Date: Wed, 8 Jan 2025 14:32:26 -0800
> > Subject: [PATCH] Fixup vector options
> >
> > Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
>
> Thanks for the test, review and the patch.
> Chunyan
>
> > ---
> > lib/raid6/Makefile | 2 --
> > lib/raid6/recov_rvv.c | 12 ++++---
> > lib/raid6/rvv.c | 81 ++++++++++++++++++++++++++++++++++++-------
> > 3 files changed, 77 insertions(+), 18 deletions(-)
> >
> > diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
> > index e62fb7cd773e..5be0a4e60ab1 100644
> > --- a/lib/raid6/Makefile
> > +++ b/lib/raid6/Makefile
> > @@ -11,8 +11,6 @@ raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o re
> > raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
> > raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
> > raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
> > -CFLAGS_rvv.o += -march=rv64gcv
> > -CFLAGS_recov_rvv.o += -march=rv64gcv
> >
> > hostprogs += mktables
> >
> > diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c
> > index 8ae74803ea7f..02b97d885510 100644
> > --- a/lib/raid6/recov_rvv.c
> > +++ b/lib/raid6/recov_rvv.c
> > @@ -17,6 +17,7 @@ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
> > ".option push\n"
> > ".option arch,+v\n"
> > "vsetvli x0, %[avl], e8, m1, ta, ma\n"
> > + ".option pop\n"
> > : :
> > [avl]"r"(16)
> > );
> > @@ -42,6 +43,8 @@ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
> > * v14:p/qm[vx], v15:p/qm[vy]
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v0, (%[px])\n"
> > "vle8.v v1, (%[dp])\n"
> > "vxor.vv v0, v0, v1\n"
> > @@ -67,6 +70,7 @@ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
> > "vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */
> > "vse8.v v3, (%[dq])\n"
> > "vse8.v v1, (%[dp])\n"
> > + ".option pop\n"
> > : :
> > [px]"r"(p),
> > [dp]"r"(dp),
> > @@ -84,8 +88,6 @@ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
> > dp += 16;
> > dq += 16;
> > }
> > -
> > - asm volatile (".option pop\n");
> > }
> >
> > static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
> > @@ -95,6 +97,7 @@ static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *
> > ".option push\n"
> > ".option arch,+v\n"
> > "vsetvli x0, %[avl], e8, m1, ta, ma\n"
> > + ".option pop\n"
> > : :
> > [avl]"r"(16)
> > );
> > @@ -113,6 +116,8 @@ static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *
> > * v10:m[vx], v11:m[vy]
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v0, (%[vx])\n"
> > "vle8.v v2, (%[dq])\n"
> > "vxor.vv v0, v0, v2\n"
> > @@ -127,6 +132,7 @@ static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *
> > "vxor.vv v1, v0, v1\n"
> > "vse8.v v0, (%[dq])\n"
> > "vse8.v v1, (%[vy])\n"
> > + ".option pop\n"
> > : :
> > [vx]"r"(q),
> > [vy]"r"(p),
> > @@ -140,8 +146,6 @@ static void __raid6_datap_recov_rvv(int bytes, uint8_t *p, uint8_t *q, uint8_t *
> > q += 16;
> > dq += 16;
> > }
> > -
> > - asm volatile (".option pop\n");
> > }
> >
> >
> > diff --git a/lib/raid6/rvv.c b/lib/raid6/rvv.c
> > index 21f5432506da..81b38dcafeb6 100644
> > --- a/lib/raid6/rvv.c
> > +++ b/lib/raid6/rvv.c
> > @@ -31,14 +31,18 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
> > ".option push\n"
> > ".option arch,+v\n"
> > "vsetvli t0, x0, e8, m1, ta, ma\n"
> > + ".option pop\n"
> > );
> >
> > /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
> > for (d = 0 ; d < bytes ; d += NSIZE*1) {
> > /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v0, (%[wp0])\n"
> > "vle8.v v1, (%[wp0])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&dptr[z0][d+0*NSIZE])
> > );
> > @@ -54,6 +58,8 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
> > * wp$$ ^= wd$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > @@ -61,6 +67,7 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
> > "vle8.v v2, (%[wd0])\n"
> > "vxor.vv v1, v3, v2\n"
> > "vxor.vv v0, v0, v2\n"
> > + ".option pop\n"
> > : :
> > [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > [x1d]"r"(0x1d)
> > @@ -72,15 +79,16 @@ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **
> > * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vse8.v v0, (%[wp0])\n"
> > "vse8.v v1, (%[wq0])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&p[d+NSIZE*0]),
> > [wq0]"r"(&q[d+NSIZE*0])
> > );
> > }
> > -
> > - asm volatile (".option pop\n");
> > }
> >
> > static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
> > @@ -98,14 +106,18 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
> > ".option push\n"
> > ".option arch,+v\n"
> > "vsetvli t0, x0, e8, m1, ta, ma\n"
> > + ".option pop\n"
> > );
> >
> > /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
> > for (d = 0 ; d < bytes ; d += NSIZE*1) {
> > /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v0, (%[wp0])\n"
> > "vle8.v v1, (%[wp0])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&dptr[z0][d+0*NSIZE])
> > );
> > @@ -122,6 +134,8 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
> > * wp$$ ^= wd$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > @@ -129,6 +143,7 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
> > "vle8.v v2, (%[wd0])\n"
> > "vxor.vv v1, v3, v2\n"
> > "vxor.vv v0, v0, v2\n"
> > + ".option pop\n"
> > : :
> > [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > [x1d]"r"(0x1d)
> > @@ -144,10 +159,13 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
> > * wq$$ = w1$$ ^ w2$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > "vxor.vv v1, v3, v2\n"
> > + ".option pop\n"
> > : :
> > [x1d]"r"(0x1d)
> > );
> > @@ -159,19 +177,20 @@ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
> > * v0:wp0, v1:wq0, v2:p0, v3:q0
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v2, (%[wp0])\n"
> > "vle8.v v3, (%[wq0])\n"
> > "vxor.vv v2, v2, v0\n"
> > "vxor.vv v3, v3, v1\n"
> > "vse8.v v2, (%[wp0])\n"
> > "vse8.v v3, (%[wq0])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&p[d+NSIZE*0]),
> > [wq0]"r"(&q[d+NSIZE*0])
> > );
> > }
> > -
> > - asm volatile (".option pop\n");
> > }
> >
> > static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> > @@ -188,6 +207,7 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
> > ".option push\n"
> > ".option arch,+v\n"
> > "vsetvli t0, x0, e8, m1, ta, ma\n"
> > + ".option pop\n"
> > );
> >
> > /*
> > @@ -197,10 +217,13 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
> > for (d = 0 ; d < bytes ; d += NSIZE*2) {
> > /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v0, (%[wp0])\n"
> > "vle8.v v1, (%[wp0])\n"
> > "vle8.v v4, (%[wp1])\n"
> > "vle8.v v5, (%[wp1])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > [wp1]"r"(&dptr[z0][d+1*NSIZE])
> > @@ -217,6 +240,8 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
> > * wp$$ ^= wd$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > @@ -232,6 +257,7 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
> > "vle8.v v6, (%[wd1])\n"
> > "vxor.vv v5, v7, v6\n"
> > "vxor.vv v4, v4, v6\n"
> > + ".option pop\n"
> > : :
> > [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > @@ -244,10 +270,13 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
> > * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vse8.v v0, (%[wp0])\n"
> > "vse8.v v1, (%[wq0])\n"
> > "vse8.v v4, (%[wp1])\n"
> > "vse8.v v5, (%[wq1])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&p[d+NSIZE*0]),
> > [wq0]"r"(&q[d+NSIZE*0]),
> > @@ -255,8 +284,6 @@ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **
> > [wq1]"r"(&q[d+NSIZE*1])
> > );
> > }
> > -
> > - asm volatile (".option pop\n");
> > }
> >
> > static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > @@ -274,6 +301,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > ".option push\n"
> > ".option arch,+v\n"
> > "vsetvli t0, x0, e8, m1, ta, ma\n"
> > + ".option pop\n"
> > );
> >
> > /*
> > @@ -283,10 +311,13 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > for (d = 0 ; d < bytes ; d += NSIZE*2) {
> > /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v0, (%[wp0])\n"
> > "vle8.v v1, (%[wp0])\n"
> > "vle8.v v4, (%[wp1])\n"
> > "vle8.v v5, (%[wp1])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > [wp1]"r"(&dptr[z0][d+1*NSIZE])
> > @@ -304,6 +335,8 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > * wp$$ ^= wd$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > @@ -319,6 +352,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > "vle8.v v6, (%[wd1])\n"
> > "vxor.vv v5, v7, v6\n"
> > "vxor.vv v4, v4, v6\n"
> > + ".option pop\n"
> > : :
> > [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > @@ -335,6 +369,8 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > * wq$$ = w1$$ ^ w2$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > @@ -344,6 +380,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > "vsll.vi v7, v5, 1\n"
> > "vand.vx v6, v6, %[x1d]\n"
> > "vxor.vv v5, v7, v6\n"
> > + ".option pop\n"
> > : :
> > [x1d]"r"(0x1d)
> > );
> > @@ -356,6 +393,8 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > * v4:wp1, v5:wq1, v6:p1, v7:q1
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v2, (%[wp0])\n"
> > "vle8.v v3, (%[wq0])\n"
> > "vxor.vv v2, v2, v0\n"
> > @@ -369,6 +408,7 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > "vxor.vv v7, v7, v5\n"
> > "vse8.v v6, (%[wp1])\n"
> > "vse8.v v7, (%[wq1])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&p[d+NSIZE*0]),
> > [wq0]"r"(&q[d+NSIZE*0]),
> > @@ -376,8 +416,6 @@ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
> > [wq1]"r"(&q[d+NSIZE*1])
> > );
> > }
> > -
> > - asm volatile (".option pop\n");
> > }
> >
> > static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
> > @@ -394,6 +432,7 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
> > ".option push\n"
> > ".option arch,+v\n"
> > "vsetvli t0, x0, e8, m1, ta, ma\n"
> > + ".option pop\n"
> > );
> >
> > /*
> > @@ -405,6 +444,8 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
> > for (d = 0 ; d < bytes ; d += NSIZE*4) {
> > /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v0, (%[wp0])\n"
> > "vle8.v v1, (%[wp0])\n"
> > "vle8.v v4, (%[wp1])\n"
> > @@ -413,6 +454,7 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
> > "vle8.v v9, (%[wp2])\n"
> > "vle8.v v12, (%[wp3])\n"
> > "vle8.v v13, (%[wp3])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > [wp1]"r"(&dptr[z0][d+1*NSIZE]),
> > @@ -431,6 +473,8 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
> > * wp$$ ^= wd$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > @@ -462,6 +506,7 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
> > "vle8.v v14, (%[wd3])\n"
> > "vxor.vv v13, v15, v14\n"
> > "vxor.vv v12, v12, v14\n"
> > + ".option pop\n"
> > : :
> > [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > @@ -476,6 +521,8 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
> > * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vse8.v v0, (%[wp0])\n"
> > "vse8.v v1, (%[wq0])\n"
> > "vse8.v v4, (%[wp1])\n"
> > @@ -484,6 +531,7 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
> > "vse8.v v9, (%[wq2])\n"
> > "vse8.v v12, (%[wp3])\n"
> > "vse8.v v13, (%[wq3])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&p[d+NSIZE*0]),
> > [wq0]"r"(&q[d+NSIZE*0]),
> > @@ -495,8 +543,6 @@ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **
> > [wq3]"r"(&q[d+NSIZE*3])
> > );
> > }
> > -
> > - asm volatile (".option pop\n");
> > }
> >
> > static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > @@ -514,6 +560,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > ".option push\n"
> > ".option arch,+v\n"
> > "vsetvli t0, x0, e8, m1, ta, ma\n"
> > + ".option pop\n"
> > );
> >
> > /*
> > @@ -525,6 +572,8 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > for (d = 0 ; d < bytes ; d += NSIZE*4) {
> > /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v0, (%[wp0])\n"
> > "vle8.v v1, (%[wp0])\n"
> > "vle8.v v4, (%[wp1])\n"
> > @@ -533,6 +582,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > "vle8.v v9, (%[wp2])\n"
> > "vle8.v v12, (%[wp3])\n"
> > "vle8.v v13, (%[wp3])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&dptr[z0][d+0*NSIZE]),
> > [wp1]"r"(&dptr[z0][d+1*NSIZE]),
> > @@ -552,6 +602,8 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > * wp$$ ^= wd$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > @@ -583,6 +635,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > "vle8.v v14, (%[wd3])\n"
> > "vxor.vv v13, v15, v14\n"
> > "vxor.vv v12, v12, v14\n"
> > + ".option pop\n"
> > : :
> > [wd0]"r"(&dptr[z][d+0*NSIZE]),
> > [wd1]"r"(&dptr[z][d+1*NSIZE]),
> > @@ -601,6 +654,8 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > * wq$$ = w1$$ ^ w2$$;
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vsra.vi v2, v1, 7\n"
> > "vsll.vi v3, v1, 1\n"
> > "vand.vx v2, v2, %[x1d]\n"
> > @@ -620,6 +675,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > "vsll.vi v15, v13, 1\n"
> > "vand.vx v14, v14, %[x1d]\n"
> > "vxor.vv v13, v15, v14\n"
> > + ".option pop\n"
> > : :
> > [x1d]"r"(0x1d)
> > );
> > @@ -634,6 +690,8 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > * v12:wp3, v13:wq3, v14:p3, v15:q3
> > */
> > asm volatile (
> > + ".option push\n"
> > + ".option arch,+v\n"
> > "vle8.v v2, (%[wp0])\n"
> > "vle8.v v3, (%[wq0])\n"
> > "vxor.vv v2, v2, v0\n"
> > @@ -661,6 +719,7 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > "vxor.vv v15, v15, v13\n"
> > "vse8.v v14, (%[wp3])\n"
> > "vse8.v v15, (%[wq3])\n"
> > + ".option pop\n"
> > : :
> > [wp0]"r"(&p[d+NSIZE*0]),
> > [wq0]"r"(&q[d+NSIZE*0]),
> > @@ -672,8 +731,6 @@ static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
> > [wq3]"r"(&q[d+NSIZE*3])
> > );
> > }
> > -
> > - asm volatile (".option pop\n");
> > }
> >
> > #define RAID6_RVV_WRAPPER(_n) \
> > --
> > 2.34.1
> >
> >
> > - Charlie
> >
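(Aside: reduced to a single statement, the shape Charlie's diff asks for looks like the sketch below. The function and operand names are made up for illustration; this is not code from the patch.)

static inline void rvv_copy_one(unsigned char *dst, const unsigned char *src)
{
	/*
	 * Everything the assembler needs - the arch bump, the vsetvli
	 * and the vector ops - sits between one push/pop pair, so no
	 * .option state can leak out of this asm statement.  v0 is
	 * still clobbered behind the compiler's back, as in the patch.
	 */
	asm volatile (".option push\n"
		      ".option arch,+v\n"
		      "vsetvli t0, x0, e8, m1, ta, ma\n"
		      "vle8.v v0, (%[src])\n"
		      "vse8.v v0, (%[dst])\n"
		      ".option pop\n"
		      : :
		      [src]"r"(src), [dst]"r"(dst)
		      : "t0", "memory");
}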
On Fri, Dec 20, 2024 at 07:40:23PM +0800, Chunyan Zhang wrote:
> [...]
> diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
> index 29127dd05d63..e62fb7cd773e 100644
> --- a/lib/raid6/Makefile
> +++ b/lib/raid6/Makefile
> @@ -10,6 +10,9 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
> raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
> raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
> raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
> +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
> +CFLAGS_rvv.o += -march=rv64gcv
> +CFLAGS_recov_rvv.o += -march=rv64gcv
I'm curious - why do you need this when you're using .option arch,+v
below?
> [...]
> +static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
> + u8 *dq, const u8 *pbmul,
> + const u8 *qmul)
> +{
> + asm volatile (
> + ".option push\n"
> + ".option arch,+v\n"
Hi Conor,
On Sat, 21 Dec 2024 at 06:52, Conor Dooley <conor@kernel.org> wrote:
>
> On Fri, Dec 20, 2024 at 07:40:23PM +0800, Chunyan Zhang wrote:
> > [...]
> > diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
> > index 29127dd05d63..e62fb7cd773e 100644
> > --- a/lib/raid6/Makefile
> > +++ b/lib/raid6/Makefile
> > @@ -10,6 +10,9 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
> > raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
> > raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
> > raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
> > +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
> > +CFLAGS_rvv.o += -march=rv64gcv
> > +CFLAGS_recov_rvv.o += -march=rv64gcv
>
> I'm curious - why do you need this when you're using .option arch,+v
> below?
The compiler would complain with errors like the one below without this flag:

Error: unrecognized opcode `vle8.v v0,(a3)', extension `v' or `zve64x'
or `zve32x' required
On Mon, Dec 23, 2024 at 09:16:38AM +0800, Chunyan Zhang wrote:
> Hi Conor,
>
> On Sat, 21 Dec 2024 at 06:52, Conor Dooley <conor@kernel.org> wrote:
> >
> > On Fri, Dec 20, 2024 at 07:40:23PM +0800, Chunyan Zhang wrote:
> > > [...]
> > > +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
> > > +CFLAGS_rvv.o += -march=rv64gcv
> > > +CFLAGS_recov_rvv.o += -march=rv64gcv
> >
> > I'm curious - why do you need this when you're using .option arch,+v
> > below?
>
> The compiler would complain with errors like the one below without this flag:
>
> Error: unrecognized opcode `vle8.v v0,(a3)', extension `v' or `zve64x'
> or `zve32x' required
Right, but the reason for using .option arch,+v elsewhere in the kernel
is because we don't want the compiler to generate vector code at all,
and the directive lets the assembler handle the vector instructions. If
I recall correctly, the error you pasted above is from the assembler,
not the compiler. You should be able to just set AFLAGS, given that all
of the vector code you're adding is hand written as far as I can see.
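(Aside: the distinction Conor draws is that -march=rv64gcv licenses the compiler to emit vector instructions of its own anywhere in the file, while the .option directive only widens what the assembler accepts inside an asm statement. A sketch of the risk, with a hypothetical function; whether it really vectorizes depends on the compiler and flags:)

/*
 * With -march=rv64gcv and optimization enabled, the compiler may
 * auto-vectorize this plain C loop into vsetvli/vle8.v/vse8.v,
 * touching vector state the kernel has not saved for this context.
 */
void copy_bytes(unsigned char *d, const unsigned char *s, int n)
{
	int i;

	for (i = 0; i < n; i++)
		d[i] = s[i];
}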
On Mon, 23 Dec 2024 at 09:35, Conor Dooley <conor@kernel.org> wrote:
>
> On Mon, Dec 23, 2024 at 09:16:38AM +0800, Chunyan Zhang wrote:
> > Hi Conor,
> >
> > On Sat, 21 Dec 2024 at 06:52, Conor Dooley <conor@kernel.org> wrote:
> > >
> > > On Fri, Dec 20, 2024 at 07:40:23PM +0800, Chunyan Zhang wrote:
> > > > [...]
> > > > +raid6_pq-$(CONFIG_RISCV_ISA_V) += rvv.o recov_rvv.o
> > > > +CFLAGS_rvv.o += -march=rv64gcv
> > > > +CFLAGS_recov_rvv.o += -march=rv64gcv
> > >
> > > I'm curious - why do you need this when you're using .option arch,+v
> > > below?
> >
> > The compiler would complain with errors like the one below without this flag:
> >
> > Error: unrecognized opcode `vle8.v v0,(a3)', extension `v' or `zve64x'
> > or `zve32x' required
>
> Right, but the reason for using .option arch,+v elsewhere in the kernel
> is because we don't want the compiler to generate vector code at all,
> and the directive lets the assembler handle the vector instructions. If
> I recall correctly, the error you pasted above is from the assembler,
Yes, it is from the assembler.
> not the compiler. You should be able to just set AFLAGS, given that all
> of the vector code you're adding is hand written as far as I can see.

It complains the same errors after simply replacing CFLAGS with AFLAGS
here. What am I missing?

Thanks,
Chunyan
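(Aside: one plausible explanation, reading kbuild's scripts/Makefile.lib, so treat it as an assumption: AFLAGS_<object>.o is only consulted when the object is built from an assembly source, while a .c file is compiled as C and the internally invoked assembler inherits its ISA string from the C flags. Roughly:)

# kbuild per-object flags (sketch):
CFLAGS_rvv.o += -march=rv64gcv  # applied: rvv.c is compiled as C, so the
                                # assembler behind the compiler sees this
AFLAGS_rvv.o += -march=rv64gcv  # never applied: only used if the source
                                # were rvv.S

Under that reading, the inline asm in rvv.c can only get its extensions from CFLAGS or from .option directives, never from AFLAGS.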
On Mon, Dec 23, 2024 at 10:16:46AM +0800, Chunyan Zhang wrote:
> [...]
> > Right, but the reason for using .option arch,+v elsewhere in the kernel
> > is because we don't want the compiler to generate vector code at all,
> > and the directive lets the assembler handle the vector instructions. If
> > I recall correctly, the error you pasted above is from the assembler,
>
> Yes, it is from the assembler.
>
> > not the compiler. You should be able to just set AFLAGS, given that all
> > of the vector code you're adding is hand written as far as I can see.
>
> It complains the same errors after simply replacing CFLAGS with AFLAGS
> here. What am I missing?
>
I don't know what you're missing unfortunately, sorry.
On 8 Jan 2025, at 18:57, Conor Dooley <conor@kernel.org> wrote:
> [...]
>>> Right, but the reason for using .option arch,+v elsewhere in the kernel
>>> is because we don't want the compiler to generate vector code at all,
>>> and the directive lets the assembler handle the vector instructions. If
>>> I recall correctly, the error you pasted above is from the assembler,
>>
>> Yes, it is from the assembler.
>>
>>> not the compiler. You should be able to just set AFLAGS, given that all
>>> of the vector code you're adding is hand written as far as I can see.
>>
>> It complains the same errors after simply replacing CFLAGS with AFLAGS
>> here. What am I missing?
>>
>
> I don't know what you're missing unfortunately, sorry.
.option push should be paired with .option pop within the same inline
asm statement. You cannot generally split them up, as the compiler will
not guarantee that the inline asm statements appear in that order
textually, since it may reorder blocks, and may even clone or delete
them. Plus in some cases you could end up affecting the compiler’s own
generated code, although that shouldn’t matter here.
Jess
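(Aside: a sketch of the two shapes Jess contrasts, with hypothetical function bodies:)

/*
 * Fragile: push and pop live in different asm statements.  Each asm
 * is an independent black box to the compiler, which may reorder,
 * clone or delete them, so nothing guarantees the pop is emitted
 * after the push - or emitted exactly once.
 */
void fragile(void)
{
	asm volatile (".option push\n"
		      ".option arch,+v\n"
		      "vsetvli t0, x0, e8, m1, ta, ma\n" ::: "t0");
	/* ... more asm statements assuming +v is still active ... */
	asm volatile (".option pop\n");
}

/*
 * Robust: each asm statement brackets its own directives, which is
 * the shape Charlie's suggested diff gives every block above.
 */
void robust(void)
{
	asm volatile (".option push\n"
		      ".option arch,+v\n"
		      "vsetvli t0, x0, e8, m1, ta, ma\n"
		      ".option pop\n" ::: "t0");
}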