The following changes since commit 005ad32358f12fe9313a4a01918a55e60d4f39e5: Merge tag 'pull-tpm-2023-09-12-3' of https://github.com/stefanberger/qemu-tpm into staging (2023-09-13 13:41:57 -0400) are available in the Git repository at: https://gitlab.com/rth7680/qemu.git tags/pull-crypto-20230915 for you to fetch changes up to 055c99015a4ec3c608d0260592368adc604429ea: host/include/aarch64: Implement clmul.h (2023-09-15 13:57:00 +0000) ---------------------------------------------------------------- Unify implementation of carry-less multiply. Accelerate carry-less multiply for 64x64->128. ---------------------------------------------------------------- Richard Henderson (19): crypto: Add generic 8-bit carry-less multiply routines target/arm: Use clmul_8* routines target/s390x: Use clmul_8* routines target/ppc: Use clmul_8* routines crypto: Add generic 16-bit carry-less multiply routines target/arm: Use clmul_16* routines target/s390x: Use clmul_16* routines target/ppc: Use clmul_16* routines crypto: Add generic 32-bit carry-less multiply routines target/arm: Use clmul_32* routines target/s390x: Use clmul_32* routines target/ppc: Use clmul_32* routines crypto: Add generic 64-bit carry-less multiply routine target/arm: Use clmul_64 target/i386: Use clmul_64 target/s390x: Use clmul_64 target/ppc: Use clmul_64 host/include/i386: Implement clmul.h host/include/aarch64: Implement clmul.h host/include/aarch64/host/cpuinfo.h | 1 + host/include/aarch64/host/crypto/clmul.h | 41 +++++++ host/include/generic/host/crypto/clmul.h | 15 +++ host/include/i386/host/cpuinfo.h | 1 + host/include/i386/host/crypto/clmul.h | 29 +++++ host/include/x86_64/host/crypto/clmul.h | 1 + include/crypto/clmul.h | 83 ++++++++++++++ include/qemu/cpuid.h | 3 + target/arm/tcg/vec_internal.h | 11 -- target/i386/ops_sse.h | 40 ++----- crypto/clmul.c | 111 ++++++++++++++++++ target/arm/tcg/mve_helper.c | 16 +-- target/arm/tcg/vec_helper.c | 102 ++--------------- target/ppc/int_helper.c | 64 +++++------ target/s390x/tcg/vec_int_helper.c | 186 ++++++++++++++----------------- util/cpuinfo-aarch64.c | 4 +- util/cpuinfo-i386.c | 1 + crypto/meson.build | 9 +- 18 files changed, 433 insertions(+), 285 deletions(-) create mode 100644 host/include/aarch64/host/crypto/clmul.h create mode 100644 host/include/generic/host/crypto/clmul.h create mode 100644 host/include/i386/host/crypto/clmul.h create mode 100644 host/include/x86_64/host/crypto/clmul.h create mode 100644 include/crypto/clmul.h create mode 100644 crypto/clmul.c
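For readers new to the operation: a carry-less multiply is binary long multiplication with XOR in place of addition, i.e. polynomial multiplication over GF(2). It is the primitive behind x86 PCLMULQDQ, Arm PMULL, PowerPC vpmsum* and s390x VGFM, all of which this series funnels through one implementation. A minimal scalar illustration, not part of the series:

/* Bit-by-bit 32x32->64 carry-less multiply, for illustration only. */
static uint64_t clmul_ref32(uint32_t a, uint32_t b)
{
    uint64_t r = 0;

    for (int i = 0; i < 32; i++) {
        if (a & (1u << i)) {
            r ^= (uint64_t)b << i;   /* XOR instead of add: no carries */
        }
    }
    return r;
}

/* Example: clmul_ref32(3, 3) == 5 (0b11 ^ 0b110), whereas 3 * 3 == 9. */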
Reviewed-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/crypto/clmul.h | 41 +++++++++++++++++++++++++++++ crypto/clmul.c | 60 ++++++++++++++++++++++++++++++++++++++++++ crypto/meson.build | 9 ++++--- 3 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 include/crypto/clmul.h create mode 100644 crypto/clmul.c diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/include/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ +/* + * Carry-less multiply operations. + * SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#ifndef CRYPTO_CLMUL_H +#define CRYPTO_CLMUL_H + +/** + * clmul_8x8_low: + * + * Perform eight 8x8->8 carry-less multiplies. + */ +uint64_t clmul_8x8_low(uint64_t, uint64_t); + +/** + * clmul_8x4_even: + * + * Perform four 8x8->16 carry-less multiplies. + * The odd bytes of the inputs are ignored. + */ +uint64_t clmul_8x4_even(uint64_t, uint64_t); + +/** + * clmul_8x4_odd: + * + * Perform four 8x8->16 carry-less multiplies. + * The even bytes of the inputs are ignored. + */ +uint64_t clmul_8x4_odd(uint64_t, uint64_t); + +/** + * clmul_8x4_packed: + * + * Perform four 8x8->16 carry-less multiplies. + */ +uint64_t clmul_8x4_packed(uint32_t, uint32_t); + +#endif /* CRYPTO_CLMUL_H */ diff --git a/crypto/clmul.c b/crypto/clmul.c new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/crypto/clmul.c @@ -XXX,XX +XXX,XX @@ +/* + * Carry-less multiply operations. + * SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#include "qemu/osdep.h" +#include "crypto/clmul.h" + +uint64_t clmul_8x8_low(uint64_t n, uint64_t m) +{ + uint64_t r = 0; + + for (int i = 0; i < 8; ++i) { + uint64_t mask = (n & 0x0101010101010101ull) * 0xff; + r ^= m & mask; + m = (m << 1) & 0xfefefefefefefefeull; + n >>= 1; + } + return r; +} + +static uint64_t clmul_8x4_even_int(uint64_t n, uint64_t m) +{ + uint64_t r = 0; + + for (int i = 0; i < 8; ++i) { + uint64_t mask = (n & 0x0001000100010001ull) * 0xffff; + r ^= m & mask; + n >>= 1; + m <<= 1; + } + return r; +} + +uint64_t clmul_8x4_even(uint64_t n, uint64_t m) +{ + n &= 0x00ff00ff00ff00ffull; + m &= 0x00ff00ff00ff00ffull; + return clmul_8x4_even_int(n, m); +} + +uint64_t clmul_8x4_odd(uint64_t n, uint64_t m) +{ + return clmul_8x4_even(n >> 8, m >> 8); +} + +static uint64_t unpack_8_to_16(uint64_t x) +{ + return (x & 0x000000ff) + | ((x & 0x0000ff00) << 8) + | ((x & 0x00ff0000) << 16) + | ((x & 0xff000000) << 24); +} + +uint64_t clmul_8x4_packed(uint32_t n, uint32_t m) +{ + return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m)); +} diff --git a/crypto/meson.build b/crypto/meson.build index XXXXXXX..XXXXXXX 100644 --- a/crypto/meson.build +++ b/crypto/meson.build @@ -XXX,XX +XXX,XX @@ if have_afalg endif crypto_ss.add(when: gnutls, if_true: files('tls-cipher-suites.c')) -util_ss.add(files('sm4.c')) -util_ss.add(files('aes.c')) -util_ss.add(files('init.c')) +util_ss.add(files( + 'aes.c', + 'clmul.c', + 'init.c', + 'sm4.c', +)) if gnutls.found() util_ss.add(gnutls) endif -- 2.34.1
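The even/odd/packed variants differ only in where each 8-bit operand sits inside the argument: clmul_8x4_even takes the low byte of every 16-bit lane, clmul_8x4_odd the high byte, and clmul_8x4_packed takes four consecutive bytes that it widens itself. A hypothetical self-check that makes the "even" layout explicit; ref_clmul8() and check_clmul_8x4_even() are illustration only, not part of the patch:

#include <assert.h>
#include <stdint.h>
#include "crypto/clmul.h"

/* Bit-by-bit 8x8->16 carry-less multiply, reference only. */
static uint16_t ref_clmul8(uint8_t a, uint8_t b)
{
    uint16_t r = 0;

    for (int i = 0; i < 8; i++) {
        if (a & (1 << i)) {
            r ^= (uint16_t)b << i;
        }
    }
    return r;
}

/* clmul_8x4_even: operand i is the low (even) byte of 16-bit lane i. */
static void check_clmul_8x4_even(uint64_t n, uint64_t m)
{
    uint64_t r = clmul_8x4_even(n, m);

    for (int lane = 0; lane < 4; lane++) {
        uint8_t a = n >> (16 * lane);
        uint8_t b = m >> (16 * lane);

        assert((uint16_t)(r >> (16 * lane)) == ref_clmul8(a, b));
    }
}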
Use generic routines for 8-bit carry-less multiply. Remove our local version of pmull_h. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_internal.h | 5 ---- target/arm/tcg/mve_helper.c | 8 ++---- target/arm/tcg/vec_helper.c | 53 ++++------------------------------- 3 files changed, 9 insertions(+), 57 deletions(-) diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_internal.h +++ b/target/arm/tcg/vec_internal.h @@ -XXX,XX +XXX,XX @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); -/* - * 8 x 8 -> 16 vector polynomial multiply where the inputs are - * in the low 8 bits of each 16-bit element -*/ -uint64_t pmull_h(uint64_t op1, uint64_t op2); /* * 16 x 16 -> 32 vector polynomial multiply where the inputs are * in the low 16 bits of each 32-bit element diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/mve_helper.c +++ b/target/arm/tcg/mve_helper.c @@ -XXX,XX +XXX,XX @@ #include "exec/exec-all.h" #include "tcg/tcg.h" #include "fpu/softfloat.h" +#include "crypto/clmul.h" static uint16_t mve_eci_mask(CPUARMState *env) { @@ -XXX,XX +XXX,XX @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) * Polynomial multiply. We can always do this generating 64 bits * of the result at a time, so we don't need to use DO_2OP_L. */ -#define VMULLPH_MASK 0x00ff00ff00ff00ffULL #define VMULLPW_MASK 0x0000ffff0000ffffULL -#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK) -#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8) #define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) #define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) -DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH) -DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH) +DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even) +DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd) DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -XXX,XX +XXX,XX @@ #include "tcg/tcg-gvec-desc.h" #include "fpu/softfloat.h" #include "qemu/int128.h" +#include "crypto/clmul.h" #include "vec_internal.h" /* @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) */ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) { - intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t i, opr_sz = simd_oprsz(desc); uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = n[i]; - uint64_t mm = m[i]; - uint64_t rr = 0; - - for (j = 0; j < 8; ++j) { - uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; - rr ^= mm & mask; - mm = (mm << 1) & 0xfefefefefefefefeull; - nn >>= 1; - } - d[i] = rr; + d[i] = clmul_8x8_low(n[i], m[i]); } clear_tail(d, opr_sz, simd_maxsz(desc)); } @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) clear_tail(d, opr_sz, simd_maxsz(desc)); } -/* - * 8x8->16 polynomial multiply. - * - * The byte inputs are expanded to (or extracted from) half-words. - * Note that neon and sve2 get the inputs from different positions. 
- * This allows 4 bytes to be processed in parallel with uint64_t. - */ - -static uint64_t expand_byte_to_half(uint64_t x) -{ - return (x & 0x000000ff) - | ((x & 0x0000ff00) << 8) - | ((x & 0x00ff0000) << 16) - | ((x & 0xff000000) << 24); -} - uint64_t pmull_w(uint64_t op1, uint64_t op2) { uint64_t result = 0; @@ -XXX,XX +XXX,XX @@ uint64_t pmull_w(uint64_t op1, uint64_t op2) return result; } -uint64_t pmull_h(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 8; ++i) { - uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) { int hi = simd_data(desc); uint64_t *d = vd, *n = vn, *m = vm; uint64_t nn = n[hi], mm = m[hi]; - d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + d[0] = clmul_8x4_packed(nn, mm); nn >>= 32; mm >>= 32; - d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + d[1] = clmul_8x4_packed(nn, mm); clear_tail(d, 16, simd_maxsz(desc)); } @@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; - uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; - - d[i] = pmull_h(nn, mm); + d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); } } -- 2.34.1
Use generic routines for 8-bit carry-less multiply. Remove our local version of galois_multiply8. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/s390x/tcg/vec_int_helper.c | 32 ++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -XXX,XX +XXX,XX @@ #include "vec.h" #include "exec/helper-proto.h" #include "tcg/tcg-gvec-desc.h" +#include "crypto/clmul.h" static bool s390_vec_is_zero(const S390Vector *v) { @@ -XXX,XX +XXX,XX @@ static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \ } \ return res; \ } -DEF_GALOIS_MULTIPLY(8, 16) DEF_GALOIS_MULTIPLY(16, 32) DEF_GALOIS_MULTIPLY(32, 64) @@ -XXX,XX +XXX,XX @@ static S390Vector galois_multiply64(uint64_t a, uint64_t b) return res; } +/* + * There is no carry across the two doublewords, so their order does + * not matter. Nor is there partial overlap between registers. + */ +static inline uint64_t do_gfma8(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_8x4_even(n, m) ^ clmul_8x4_odd(n, m) ^ a; +} + +void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + + q1[0] = do_gfma8(q2[0], q3[0], 0); + q1[1] = do_gfma8(q2[1], q3[1], 0); +} + +void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t desc) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma8(q2[0], q3[0], q4[0]); + q1[1] = do_gfma8(q2[1], q3[1], q4[1]); +} + #define DEF_VGFM(BITS, TBITS) \ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ uint32_t desc) \ @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ s390_vec_write_element##TBITS(v1, i, d); \ } \ } -DEF_VGFM(8, 16) DEF_VGFM(16, 32) DEF_VGFM(32, 64) @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \ s390_vec_write_element##TBITS(v1, i, d); \ } \ } -DEF_VGFMA(8, 16) DEF_VGFMA(16, 32) DEF_VGFMA(32, 64) -- 2.34.1
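Read per 16-bit lane, do_gfma8() computes exactly the multiply-sum that VGFM defines: the carry-less product of one byte pair of the lane XORed with that of the other, then XORed with the accumulator for the VGFMA form. A scalar sketch of a single lane (the helper name and argument layout are mine, for illustration only):

/* One 16-bit lane of VGFM with byte elements: clmul(a0,b0) ^ clmul(a1,b1). */
static uint16_t vgfm8_lane(uint8_t a0, uint8_t b0, uint8_t a1, uint8_t b1)
{
    uint16_t r = 0;

    for (int i = 0; i < 8; i++) {
        if (a0 & (1 << i)) {
            r ^= (uint16_t)b0 << i;
        }
        if (a1 & (1 << i)) {
            r ^= (uint16_t)b1 << i;
        }
    }
    return r;
}

The even helper contributes the a0*b0 terms of all four lanes at once and the odd helper the a1*b1 terms, which is why a single XOR of the two results suffices.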
Use generic routines for 8-bit carry-less multiply. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/ppc/int_helper.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -XXX,XX +XXX,XX @@ #include "exec/helper-proto.h" #include "crypto/aes.h" #include "crypto/aes-round.h" +#include "crypto/clmul.h" #include "fpu/softfloat.h" #include "qapi/error.h" #include "qemu/guest-random.h" @@ -XXX,XX +XXX,XX @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) #undef VBPERMQ_INDEX #undef VBPERMQ_DW +/* + * There is no carry across the two doublewords, so their order does + * not matter. Nor is there partial overlap between registers. + */ +void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) +{ + for (int i = 0; i < 2; ++i) { + uint64_t aa = a->u64[i], bb = b->u64[i]; + r->u64[i] = clmul_8x4_even(aa, bb) ^ clmul_8x4_odd(aa, bb); + } +} + #define PMSUM(name, srcfld, trgfld, trgtyp) \ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ { \ @@ -XXX,XX +XXX,XX @@ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ } \ } -PMSUM(vpmsumb, u8, u16, uint16_t) PMSUM(vpmsumh, u16, u32, uint32_t) PMSUM(vpmsumw, u32, u64, uint64_t) -- 2.34.1
Reviewed-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/crypto/clmul.h | 16 ++++++++++++++++ crypto/clmul.c | 21 +++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h index XXXXXXX..XXXXXXX 100644 --- a/include/crypto/clmul.h +++ b/include/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ uint64_t clmul_8x4_odd(uint64_t, uint64_t); */ uint64_t clmul_8x4_packed(uint32_t, uint32_t); +/** + * clmul_16x2_even: + * + * Perform two 16x16->32 carry-less multiplies. + * The odd words of the inputs are ignored. + */ +uint64_t clmul_16x2_even(uint64_t, uint64_t); + +/** + * clmul_16x2_odd: + * + * Perform two 16x16->32 carry-less multiplies. + * The even words of the inputs are ignored. + */ +uint64_t clmul_16x2_odd(uint64_t, uint64_t); + #endif /* CRYPTO_CLMUL_H */ diff --git a/crypto/clmul.c b/crypto/clmul.c index XXXXXXX..XXXXXXX 100644 --- a/crypto/clmul.c +++ b/crypto/clmul.c @@ -XXX,XX +XXX,XX @@ uint64_t clmul_8x4_packed(uint32_t n, uint32_t m) { return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m)); } + +uint64_t clmul_16x2_even(uint64_t n, uint64_t m) +{ + uint64_t r = 0; + + n &= 0x0000ffff0000ffffull; + m &= 0x0000ffff0000ffffull; + + for (int i = 0; i < 16; ++i) { + uint64_t mask = (n & 0x0000000100000001ull) * 0xffffffffull; + r ^= m & mask; + n >>= 1; + m <<= 1; + } + return r; +} + +uint64_t clmul_16x2_odd(uint64_t n, uint64_t m) +{ + return clmul_16x2_even(n >> 16, m >> 16); +} -- 2.34.1
Use generic routines for 16-bit carry-less multiply. Remove our local version of pmull_w. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_internal.h | 6 ------ target/arm/tcg/mve_helper.c | 8 ++------ target/arm/tcg/vec_helper.c | 13 ------------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_internal.h +++ b/target/arm/tcg/vec_internal.h @@ -XXX,XX +XXX,XX @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); -/* - * 16 x 16 -> 32 vector polynomial multiply where the inputs are - * in the low 16 bits of each 32-bit element - */ -uint64_t pmull_w(uint64_t op1, uint64_t op2); - /** * bfdotadd: * @sum: addend diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/mve_helper.c +++ b/target/arm/tcg/mve_helper.c @@ -XXX,XX +XXX,XX @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) * Polynomial multiply. We can always do this generating 64 bits * of the result at a time, so we don't need to use DO_2OP_L. */ -#define VMULLPW_MASK 0x0000ffff0000ffffULL -#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) -#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) - DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even) DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd) -DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) -DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) +DO_2OP(vmullpbw, 8, uint64_t, clmul_16x2_even) +DO_2OP(vmullptw, 8, uint64_t, clmul_16x2_odd) /* * Because the computation type is at least twice as large as required, diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) clear_tail(d, opr_sz, simd_maxsz(desc)); } -uint64_t pmull_w(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 16; ++i) { - uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) { int hi = simd_data(desc); -- 2.34.1
Use generic routines for 16-bit carry-less multiply. Remove our local version of galois_multiply16. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/s390x/tcg/vec_int_helper.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -XXX,XX +XXX,XX @@ static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \ } \ return res; \ } -DEF_GALOIS_MULTIPLY(16, 32) DEF_GALOIS_MULTIPLY(32, 64) static S390Vector galois_multiply64(uint64_t a, uint64_t b) @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3, q1[1] = do_gfma8(q2[1], q3[1], q4[1]); } +static inline uint64_t do_gfma16(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_16x2_even(n, m) ^ clmul_16x2_odd(n, m) ^ a; +} + +void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + + q1[0] = do_gfma16(q2[0], q3[0], 0); + q1[1] = do_gfma16(q2[1], q3[1], 0); +} + +void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma16(q2[0], q3[0], q4[0]); + q1[1] = do_gfma16(q2[1], q3[1], q4[1]); +} + #define DEF_VGFM(BITS, TBITS) \ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ uint32_t desc) \ @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ s390_vec_write_element##TBITS(v1, i, d); \ } \ } -DEF_VGFM(16, 32) DEF_VGFM(32, 64) void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \ s390_vec_write_element##TBITS(v1, i, d); \ } \ } -DEF_VGFMA(16, 32) DEF_VGFMA(32, 64) void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, -- 2.34.1
Use generic routines for 16-bit carry-less multiply. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/ppc/int_helper.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -XXX,XX +XXX,XX @@ void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) } } +void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) +{ + for (int i = 0; i < 2; ++i) { + uint64_t aa = a->u64[i], bb = b->u64[i]; + r->u64[i] = clmul_16x2_even(aa, bb) ^ clmul_16x2_odd(aa, bb); + } +} + #define PMSUM(name, srcfld, trgfld, trgtyp) \ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ { \ @@ -XXX,XX +XXX,XX @@ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ } \ } -PMSUM(vpmsumh, u16, u32, uint32_t) PMSUM(vpmsumw, u32, u64, uint64_t) void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) -- 2.34.1
Reviewed-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/crypto/clmul.h | 7 +++++++ crypto/clmul.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h index XXXXXXX..XXXXXXX 100644 --- a/include/crypto/clmul.h +++ b/include/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_even(uint64_t, uint64_t); */ uint64_t clmul_16x2_odd(uint64_t, uint64_t); +/** + * clmul_32: + * + * Perform a 32x32->64 carry-less multiply. + */ +uint64_t clmul_32(uint32_t, uint32_t); + #endif /* CRYPTO_CLMUL_H */ diff --git a/crypto/clmul.c b/crypto/clmul.c index XXXXXXX..XXXXXXX 100644 --- a/crypto/clmul.c +++ b/crypto/clmul.c @@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_odd(uint64_t n, uint64_t m) { return clmul_16x2_even(n >> 16, m >> 16); } + +uint64_t clmul_32(uint32_t n, uint32_t m32) +{ + uint64_t r = 0; + uint64_t m = m32; + + for (int i = 0; i < 32; ++i) { + r ^= n & 1 ? m : 0; + n >>= 1; + m <<= 1; + } + return r; +} -- 2.34.1
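Unlike the even/odd helpers, clmul_32 takes already-narrowed 32-bit operands, so callers need no masking. The target patches that follow combine it per 64-bit lane roughly as below; clmul_32x2_sum is a name of my own, shown only to make the idiom explicit:

/* Multiply-sum of the two 32-bit elements held in a 64-bit lane. */
static uint64_t clmul_32x2_sum(uint64_t n, uint64_t m)
{
    /* The uint32_t parameters of clmul_32 truncate to the low halves. */
    return clmul_32(n, m) ^ clmul_32(n >> 32, m >> 32);
}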
Use generic routines for 32-bit carry-less multiply. Remove our local version of pmull_d. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_helper.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) } } -static uint64_t pmull_d(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - - for (i = 0; i < 32; ++i) { - uint64_t mask = -((op1 >> i) & 1); - result ^= (op2 << i) & mask; - } - return result; -} - void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) { intptr_t sel = H4(simd_data(desc)); @@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) uint64_t *d = vd; for (i = 0; i < opr_sz / 8; ++i) { - d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); + d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); } } #endif -- 2.34.1
Use generic routines for 32-bit carry-less multiply. Remove our local version of galois_multiply32. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/s390x/tcg/vec_int_helper.c | 75 +++++++++---------------------- 1 file changed, 22 insertions(+), 53 deletions(-) diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -XXX,XX +XXX,XX @@ DEF_VCTZ(8) DEF_VCTZ(16) /* like binary multiplication, but XOR instead of addition */ -#define DEF_GALOIS_MULTIPLY(BITS, TBITS) \ -static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \ - uint##TBITS##_t b) \ -{ \ - uint##TBITS##_t res = 0; \ - \ - while (b) { \ - if (b & 0x1) { \ - res = res ^ a; \ - } \ - a = a << 1; \ - b = b >> 1; \ - } \ - return res; \ -} -DEF_GALOIS_MULTIPLY(32, 64) static S390Vector galois_multiply64(uint64_t a, uint64_t b) { @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3, q1[1] = do_gfma16(q2[1], q3[1], q4[1]); } -#define DEF_VGFM(BITS, TBITS) \ -void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ - uint32_t desc) \ -{ \ - int i; \ - \ - for (i = 0; i < (128 / TBITS); i++) { \ - uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \ - uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \ - uint##TBITS##_t d = galois_multiply##BITS(a, b); \ - \ - a = s390_vec_read_element##BITS(v2, i * 2 + 1); \ - b = s390_vec_read_element##BITS(v3, i * 2 + 1); \ - d = d ^ galois_multiply32(a, b); \ - s390_vec_write_element##TBITS(v1, i, d); \ - } \ +static inline uint64_t do_gfma32(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_32(n, m) ^ clmul_32(n >> 32, m >> 32) ^ a; +} + +void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + + q1[0] = do_gfma32(q2[0], q3[0], 0); + q1[1] = do_gfma32(q2[1], q3[1], 0); +} + +void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma32(q2[0], q3[0], q4[0]); + q1[1] = do_gfma32(q2[1], q3[1], q4[1]); } -DEF_VGFM(32, 64) void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, uint32_t desc) @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, s390_vec_xor(v1, &tmp1, &tmp2); } -#define DEF_VGFMA(BITS, TBITS) \ -void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \ - const void *v4, uint32_t desc) \ -{ \ - int i; \ - \ - for (i = 0; i < (128 / TBITS); i++) { \ - uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \ - uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \ - uint##TBITS##_t d = galois_multiply##BITS(a, b); \ - \ - a = s390_vec_read_element##BITS(v2, i * 2 + 1); \ - b = s390_vec_read_element##BITS(v3, i * 2 + 1); \ - d = d ^ galois_multiply32(a, b); \ - d = d ^ s390_vec_read_element##TBITS(v4, i); \ - s390_vec_write_element##TBITS(v1, i, d); \ - } \ -} -DEF_VGFMA(32, 64) - void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, const void *v4, uint32_t desc) { -- 2.34.1
Use generic routines for 32-bit carry-less multiply. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/ppc/int_helper.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -XXX,XX +XXX,XX @@ void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) } } -#define PMSUM(name, srcfld, trgfld, trgtyp) \ -void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ -{ \ - int i, j; \ - trgtyp prod[sizeof(ppc_avr_t) / sizeof(a->srcfld[0])]; \ - \ - VECTOR_FOR_INORDER_I(i, srcfld) { \ - prod[i] = 0; \ - for (j = 0; j < sizeof(a->srcfld[0]) * 8; j++) { \ - if (a->srcfld[i] & (1ull << j)) { \ - prod[i] ^= ((trgtyp)b->srcfld[i] << j); \ - } \ - } \ - } \ - \ - VECTOR_FOR_INORDER_I(i, trgfld) { \ - r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1]; \ - } \ +void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) +{ + for (int i = 0; i < 2; ++i) { + uint64_t aa = a->u64[i], bb = b->u64[i]; + r->u64[i] = clmul_32(aa, bb) ^ clmul_32(aa >> 32, bb >> 32); + } } -PMSUM(vpmsumw, u32, u64, uint64_t) - void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int i, j; -- 2.34.1
Reviewed-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- host/include/generic/host/crypto/clmul.h | 15 +++++++++++++++ include/crypto/clmul.h | 19 +++++++++++++++++++ crypto/clmul.c | 17 +++++++++++++++++ 3 files changed, 51 insertions(+) create mode 100644 host/include/generic/host/crypto/clmul.h diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/host/include/generic/host/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ +/* + * No host specific carry-less multiply acceleration. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef GENERIC_HOST_CRYPTO_CLMUL_H +#define GENERIC_HOST_CRYPTO_CLMUL_H + +#define HAVE_CLMUL_ACCEL false +#define ATTR_CLMUL_ACCEL + +Int128 clmul_64_accel(uint64_t, uint64_t) + QEMU_ERROR("unsupported accel"); + +#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */ diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h index XXXXXXX..XXXXXXX 100644 --- a/include/crypto/clmul.h +++ b/include/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ #ifndef CRYPTO_CLMUL_H #define CRYPTO_CLMUL_H +#include "qemu/int128.h" +#include "host/crypto/clmul.h" + /** * clmul_8x8_low: * @@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_odd(uint64_t, uint64_t); */ uint64_t clmul_32(uint32_t, uint32_t); +/** + * clmul_64: + * + * Perform a 64x64->128 carry-less multiply. + */ +Int128 clmul_64_gen(uint64_t, uint64_t); + +static inline Int128 clmul_64(uint64_t a, uint64_t b) +{ + if (HAVE_CLMUL_ACCEL) { + return clmul_64_accel(a, b); + } else { + return clmul_64_gen(a, b); + } +} + #endif /* CRYPTO_CLMUL_H */ diff --git a/crypto/clmul.c b/crypto/clmul.c index XXXXXXX..XXXXXXX 100644 --- a/crypto/clmul.c +++ b/crypto/clmul.c @@ -XXX,XX +XXX,XX @@ uint64_t clmul_32(uint32_t n, uint32_t m32) } return r; } + +Int128 clmul_64_gen(uint64_t n, uint64_t m) +{ + uint64_t rl = 0, rh = 0; + + /* Bit 0 can only influence the low 64-bit result. */ + if (n & 1) { + rl = m; + } + + for (int i = 1; i < 64; ++i) { + uint64_t mask = -((n >> i) & 1); + rl ^= (m << i) & mask; + rh ^= (m >> (64 - i)) & mask; + } + return int128_make128(rl, rh); +} -- 2.34.1
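Callers receive the 128-bit product as an Int128 and split it with the existing accessors; the inline clmul_64 wrapper picks the host-accelerated path when HAVE_CLMUL_ACCEL is true and falls back to clmul_64_gen otherwise. A minimal usage sketch (clmul_64_to_pair is a name of my own):

#include "qemu/int128.h"
#include "crypto/clmul.h"

/* 64x64->128 carry-less multiply, split into two 64-bit halves. */
static void clmul_64_to_pair(uint64_t a, uint64_t b,
                             uint64_t *lo, uint64_t *hi)
{
    Int128 r = clmul_64(a, b);   /* accelerated when the host allows */

    *lo = int128_getlo(r);
    *hi = int128_gethi(r);
}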
Use generic routine for 64-bit carry-less multiply. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_helper.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) */ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) { - intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t i, opr_sz = simd_oprsz(desc); intptr_t hi = simd_data(desc); uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; i += 2) { - uint64_t nn = n[i + hi]; - uint64_t mm = m[i + hi]; - uint64_t rhi = 0; - uint64_t rlo = 0; - - /* Bit 0 can only influence the low 64-bit result. */ - if (nn & 1) { - rlo = mm; - } - - for (j = 1; j < 64; ++j) { - uint64_t mask = -((nn >> j) & 1); - rlo ^= (mm << j) & mask; - rhi ^= (mm >> (64 - j)) & mask; - } - d[i] = rlo; - d[i + 1] = rhi; + Int128 r = clmul_64(n[i + hi], m[i + hi]); + d[i] = int128_getlo(r); + d[i + 1] = int128_gethi(r); } clear_tail(d, opr_sz, simd_maxsz(desc)); } -- 2.34.1
Use generic routine for 64-bit carry-less multiply. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/i386/ops_sse.h | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index XXXXXXX..XXXXXXX 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -XXX,XX +XXX,XX @@ #include "crypto/aes.h" #include "crypto/aes-round.h" +#include "crypto/clmul.h" #if SHIFT == 0 #define Reg MMXReg @@ -XXX,XX +XXX,XX @@ target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) #endif -#if SHIFT == 1 -static void clmulq(uint64_t *dest_l, uint64_t *dest_h, - uint64_t a, uint64_t b) -{ - uint64_t al, ah, resh, resl; - - ah = 0; - al = a; - resh = resl = 0; - - while (b) { - if (b & 1) { - resl ^= al; - resh ^= ah; - } - ah = (ah << 1) | (al >> 63); - al <<= 1; - b >>= 1; - } - - *dest_l = resl; - *dest_h = resh; -} -#endif - void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t ctrl) { - uint64_t a, b; - int i; + int a_idx = (ctrl & 1) != 0; + int b_idx = (ctrl & 16) != 0; - for (i = 0; i < 1 << SHIFT; i += 2) { - a = v->Q(((ctrl & 1) != 0) + i); - b = s->Q(((ctrl & 16) != 0) + i); - clmulq(&d->Q(i), &d->Q(i + 1), a, b); + for (int i = 0; i < SHIFT; i++) { + uint64_t a = v->Q(2 * i + a_idx); + uint64_t b = s->Q(2 * i + b_idx); + Int128 *r = (Int128 *)&d->ZMM_X(i); + + *r = clmul_64(a, b); } } -- 2.34.1
Use the generic routine for 64-bit carry-less multiply. Remove our local version of galois_multiply64. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/s390x/tcg/vec_int_helper.c | 58 +++++++------------------------ 1 file changed, 12 insertions(+), 46 deletions(-) diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -XXX,XX +XXX,XX @@ static bool s390_vec_is_zero(const S390Vector *v) return !v->doubleword[0] && !v->doubleword[1]; } -static void s390_vec_xor(S390Vector *res, const S390Vector *a, - const S390Vector *b) -{ - res->doubleword[0] = a->doubleword[0] ^ b->doubleword[0]; - res->doubleword[1] = a->doubleword[1] ^ b->doubleword[1]; -} - static void s390_vec_and(S390Vector *res, const S390Vector *a, const S390Vector *b) { @@ -XXX,XX +XXX,XX @@ DEF_VCTZ(16) /* like binary multiplication, but XOR instead of addition */ -static S390Vector galois_multiply64(uint64_t a, uint64_t b) -{ - S390Vector res = {}; - S390Vector va = { - .doubleword[1] = a, - }; - S390Vector vb = { - .doubleword[1] = b, - }; - - while (!s390_vec_is_zero(&vb)) { - if (vb.doubleword[1] & 0x1) { - s390_vec_xor(&res, &res, &va); - } - s390_vec_shl(&va, &va, 1); - s390_vec_shr(&vb, &vb, 1); - } - return res; -} - /* * There is no carry across the two doublewords, so their order does * not matter. Nor is there partial overlap between registers. @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3, void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, uint32_t desc) { - S390Vector tmp1, tmp2; - uint64_t a, b; + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + Int128 r; - a = s390_vec_read_element64(v2, 0); - b = s390_vec_read_element64(v3, 0); - tmp1 = galois_multiply64(a, b); - a = s390_vec_read_element64(v2, 1); - b = s390_vec_read_element64(v3, 1); - tmp2 = galois_multiply64(a, b); - s390_vec_xor(v1, &tmp1, &tmp2); + r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1])); + q1[0] = int128_gethi(r); + q1[1] = int128_getlo(r); } void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, const void *v4, uint32_t desc) { - S390Vector tmp1, tmp2; - uint64_t a, b; + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + Int128 r; - a = s390_vec_read_element64(v2, 0); - b = s390_vec_read_element64(v3, 0); - tmp1 = galois_multiply64(a, b); - a = s390_vec_read_element64(v2, 1); - b = s390_vec_read_element64(v3, 1); - tmp2 = galois_multiply64(a, b); - s390_vec_xor(&tmp1, &tmp1, &tmp2); - s390_vec_xor(v1, &tmp1, v4); + r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1])); + q1[0] = q4[0] ^ int128_gethi(r); + q1[1] = q4[1] ^ int128_getlo(r); } #define DEF_VMAL(BITS) \ -- 2.34.1
Use generic routine for 64-bit carry-less multiply. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/ppc/int_helper.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -XXX,XX +XXX,XX @@ void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { - int i, j; - Int128 tmp, prod[2] = {int128_zero(), int128_zero()}; - - for (j = 0; j < 64; j++) { - for (i = 0; i < ARRAY_SIZE(r->u64); i++) { - if (a->VsrD(i) & (1ull << j)) { - tmp = int128_make64(b->VsrD(i)); - tmp = int128_lshift(tmp, j); - prod[i] = int128_xor(prod[i], tmp); - } - } - } - - r->s128 = int128_xor(prod[0], prod[1]); + Int128 e = clmul_64(a->u64[0], b->u64[0]); + Int128 o = clmul_64(a->u64[1], b->u64[1]); + r->s128 = int128_xor(e, o); } #if HOST_BIG_ENDIAN -- 2.34.1
Detect PCLMUL in cpuinfo; implement the accel hook. Reviewed-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- host/include/i386/host/cpuinfo.h | 1 + host/include/i386/host/crypto/clmul.h | 29 +++++++++++++++++++++++++ host/include/x86_64/host/crypto/clmul.h | 1 + include/qemu/cpuid.h | 3 +++ util/cpuinfo-i386.c | 1 + 5 files changed, 35 insertions(+) create mode 100644 host/include/i386/host/crypto/clmul.h create mode 100644 host/include/x86_64/host/crypto/clmul.h diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h index XXXXXXX..XXXXXXX 100644 --- a/host/include/i386/host/cpuinfo.h +++ b/host/include/i386/host/cpuinfo.h @@ -XXX,XX +XXX,XX @@ #define CPUINFO_ATOMIC_VMOVDQA (1u << 16) #define CPUINFO_ATOMIC_VMOVDQU (1u << 17) #define CPUINFO_AES (1u << 18) +#define CPUINFO_PCLMUL (1u << 19) /* Initialized with a constructor. */ extern unsigned cpuinfo; diff --git a/host/include/i386/host/crypto/clmul.h b/host/include/i386/host/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/host/include/i386/host/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ +/* + * x86 specific clmul acceleration. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef X86_HOST_CRYPTO_CLMUL_H +#define X86_HOST_CRYPTO_CLMUL_H + +#include "host/cpuinfo.h" +#include <immintrin.h> + +#if defined(__PCLMUL__) +# define HAVE_CLMUL_ACCEL true +# define ATTR_CLMUL_ACCEL +#else +# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PCLMUL) +# define ATTR_CLMUL_ACCEL __attribute__((target("pclmul"))) +#endif + +static inline Int128 ATTR_CLMUL_ACCEL +clmul_64_accel(uint64_t n, uint64_t m) +{ + union { __m128i v; Int128 s; } u; + + u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0); + return u.s; +} + +#endif /* X86_HOST_CRYPTO_CLMUL_H */ diff --git a/host/include/x86_64/host/crypto/clmul.h b/host/include/x86_64/host/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/host/include/x86_64/host/crypto/clmul.h @@ -0,0 +1 @@ +#include "host/include/i386/host/crypto/clmul.h" diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h index XXXXXXX..XXXXXXX 100644 --- a/include/qemu/cpuid.h +++ b/include/qemu/cpuid.h @@ -XXX,XX +XXX,XX @@ #endif /* Leaf 1, %ecx */ +#ifndef bit_PCLMUL +#define bit_PCLMUL (1 << 1) +#endif #ifndef bit_SSE4_1 #define bit_SSE4_1 (1 << 19) #endif diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c index XXXXXXX..XXXXXXX 100644 --- a/util/cpuinfo-i386.c +++ b/util/cpuinfo-i386.c @@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void) info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0); info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0); info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0); + info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0); /* Our AES support requires PSHUFB as well. */ info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0); -- 2.34.1
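Since the PCLMUL path and the generic bit loop must agree bit for bit, a hypothetical sanity check could compare them whenever the accelerated path is usable, mirroring the dispatch in include/crypto/clmul.h. check_clmul_64() below is illustration only, not part of the patch:

#include <assert.h>
#include "qemu/int128.h"
#include "crypto/clmul.h"

/* Compare the host-accelerated path against the generic implementation. */
static void check_clmul_64(uint64_t a, uint64_t b)
{
    if (HAVE_CLMUL_ACCEL) {
        assert(int128_eq(clmul_64_accel(a, b), clmul_64_gen(a, b)));
    }
}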
Detect PMULL in cpuinfo; implement the accel hook. Acked-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Ard Biesheuvel <ardb@kernel.org> Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- host/include/aarch64/host/cpuinfo.h | 1 + host/include/aarch64/host/crypto/clmul.h | 41 ++++++++++++++++++++++++ util/cpuinfo-aarch64.c | 4 ++- 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 host/include/aarch64/host/crypto/clmul.h diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h index XXXXXXX..XXXXXXX 100644 --- a/host/include/aarch64/host/cpuinfo.h +++ b/host/include/aarch64/host/cpuinfo.h @@ -XXX,XX +XXX,XX @@ #define CPUINFO_LSE (1u << 1) #define CPUINFO_LSE2 (1u << 2) #define CPUINFO_AES (1u << 3) +#define CPUINFO_PMULL (1u << 4) /* Initialized with a constructor. */ extern unsigned cpuinfo; diff --git a/host/include/aarch64/host/crypto/clmul.h b/host/include/aarch64/host/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/host/include/aarch64/host/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ +/* + * AArch64 specific clmul acceleration. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef AARCH64_HOST_CRYPTO_CLMUL_H +#define AARCH64_HOST_CRYPTO_CLMUL_H + +#include "host/cpuinfo.h" +#include <arm_neon.h> + +/* + * 64x64->128 pmull is available with FEAT_PMULL. + * Both FEAT_AES and FEAT_PMULL are covered under the same macro. + */ +#ifdef __ARM_FEATURE_AES +# define HAVE_CLMUL_ACCEL true +#else +# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PMULL) +#endif +#if !defined(__ARM_FEATURE_AES) && defined(CONFIG_ARM_AES_BUILTIN) +# define ATTR_CLMUL_ACCEL __attribute__((target("+crypto"))) +#else +# define ATTR_CLMUL_ACCEL +#endif + +static inline Int128 ATTR_CLMUL_ACCEL +clmul_64_accel(uint64_t n, uint64_t m) +{ + union { poly128_t v; Int128 s; } u; + +#ifdef CONFIG_ARM_AES_BUILTIN + u.v = vmull_p64((poly64_t)n, (poly64_t)m); +#else + asm(".arch_extension aes\n\t" + "pmull %0.1q, %1.1d, %2.1d" : "=w"(u.v) : "w"(n), "w"(m)); +#endif + return u.s; +} + +#endif /* AARCH64_HOST_CRYPTO_CLMUL_H */ diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c index XXXXXXX..XXXXXXX 100644 --- a/util/cpuinfo-aarch64.c +++ b/util/cpuinfo-aarch64.c @@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void) unsigned long hwcap = qemu_getauxval(AT_HWCAP); info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0); info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0); - info |= (hwcap & HWCAP_AES ? CPUINFO_AES: 0); + info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0); + info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0); #endif #ifdef CONFIG_DARWIN info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE; info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE2") * CPUINFO_LSE2; info |= sysctl_for_bool("hw.optional.arm.FEAT_AES") * CPUINFO_AES; + info |= sysctl_for_bool("hw.optional.arm.FEAT_PMULL") * CPUINFO_PMULL; #endif cpuinfo = info; -- 2.34.1
Inspired by Ard Biesheuvel's RFC patches [1] for accelerating carry-less multiply under emulation. Changes for v3: * Update target/i386 ops_sse.h. * Apply r-b. Changes for v2: * Only accelerate clmul_64; keep generic helpers for other sizes. * Drop most of the Int128 interfaces, except for clmul_64. * Use the same acceleration format as aes-round.h. r~ [1] https://patchew.org/QEMU/20230601123332.3297404-1-ardb@kernel.org/ Richard Henderson (19): crypto: Add generic 8-bit carry-less multiply routines target/arm: Use clmul_8* routines target/s390x: Use clmul_8* routines target/ppc: Use clmul_8* routines crypto: Add generic 16-bit carry-less multiply routines target/arm: Use clmul_16* routines target/s390x: Use clmul_16* routines target/ppc: Use clmul_16* routines crypto: Add generic 32-bit carry-less multiply routines target/arm: Use clmul_32* routines target/s390x: Use clmul_32* routines target/ppc: Use clmul_32* routines crypto: Add generic 64-bit carry-less multiply routine target/arm: Use clmul_64 target/i386: Use clmul_64 target/s390x: Use clmul_64 target/ppc: Use clmul_64 host/include/i386: Implement clmul.h host/include/aarch64: Implement clmul.h host/include/aarch64/host/cpuinfo.h | 1 + host/include/aarch64/host/crypto/clmul.h | 41 +++++ host/include/generic/host/crypto/clmul.h | 15 ++ host/include/i386/host/cpuinfo.h | 1 + host/include/i386/host/crypto/clmul.h | 29 ++++ host/include/x86_64/host/crypto/clmul.h | 1 + include/crypto/clmul.h | 83 ++++++++++ include/qemu/cpuid.h | 3 + target/arm/tcg/vec_internal.h | 11 -- target/i386/ops_sse.h | 40 ++--- crypto/clmul.c | 112 ++++++++++++++ target/arm/tcg/mve_helper.c | 16 +- target/arm/tcg/vec_helper.c | 102 ++----------- target/ppc/int_helper.c | 64 ++++---- target/s390x/tcg/vec_int_helper.c | 186 ++++++++++------------- util/cpuinfo-aarch64.c | 4 +- util/cpuinfo-i386.c | 1 + crypto/meson.build | 9 +- 18 files changed, 434 insertions(+), 285 deletions(-) create mode 100644 host/include/aarch64/host/crypto/clmul.h create mode 100644 host/include/generic/host/crypto/clmul.h create mode 100644 host/include/i386/host/crypto/clmul.h create mode 100644 host/include/x86_64/host/crypto/clmul.h create mode 100644 include/crypto/clmul.h create mode 100644 crypto/clmul.c -- 2.34.1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/crypto/clmul.h | 41 +++++++++++++++++++++++++++++ crypto/clmul.c | 60 ++++++++++++++++++++++++++++++++++++++++++ crypto/meson.build | 9 ++++--- 3 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 include/crypto/clmul.h create mode 100644 crypto/clmul.c diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/include/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ +/* + * Carry-less multiply operations. + * SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#ifndef CRYPTO_CLMUL_H +#define CRYPTO_CLMUL_H + +/** + * clmul_8x8_low: + * + * Perform eight 8x8->8 carry-less multiplies. + */ +uint64_t clmul_8x8_low(uint64_t, uint64_t); + +/** + * clmul_8x4_even: + * + * Perform four 8x8->16 carry-less multiplies. + * The odd bytes of the inputs are ignored. + */ +uint64_t clmul_8x4_even(uint64_t, uint64_t); + +/** + * clmul_8x4_odd: + * + * Perform four 8x8->16 carry-less multiplies. + * The even bytes of the inputs are ignored. + */ +uint64_t clmul_8x4_odd(uint64_t, uint64_t); + +/** + * clmul_8x4_packed: + * + * Perform four 8x8->16 carry-less multiplies. + */ +uint64_t clmul_8x4_packed(uint32_t, uint32_t); + +#endif /* CRYPTO_CLMUL_H */ diff --git a/crypto/clmul.c b/crypto/clmul.c new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/crypto/clmul.c @@ -XXX,XX +XXX,XX @@ +/* + * Carry-less multiply operations. + * SPDX-License-Identifier: GPL-2.0-or-later + * + * Copyright (C) 2023 Linaro, Ltd. + */ + +#include "qemu/osdep.h" +#include "crypto/clmul.h" + +uint64_t clmul_8x8_low(uint64_t n, uint64_t m) +{ + uint64_t r = 0; + + for (int i = 0; i < 8; ++i) { + uint64_t mask = (n & 0x0101010101010101ull) * 0xff; + r ^= m & mask; + m = (m << 1) & 0xfefefefefefefefeull; + n >>= 1; + } + return r; +} + +static uint64_t clmul_8x4_even_int(uint64_t n, uint64_t m) +{ + uint64_t r = 0; + + for (int i = 0; i < 8; ++i) { + uint64_t mask = (n & 0x0001000100010001ull) * 0xffff; + r ^= m & mask; + n >>= 1; + m <<= 1; + } + return r; +} + +uint64_t clmul_8x4_even(uint64_t n, uint64_t m) +{ + n &= 0x00ff00ff00ff00ffull; + m &= 0x00ff00ff00ff00ffull; + return clmul_8x4_even_int(n, m); +} + +uint64_t clmul_8x4_odd(uint64_t n, uint64_t m) +{ + return clmul_8x4_even(n >> 8, m >> 8); +} + +static uint64_t unpack_8_to_16(uint64_t x) +{ + return (x & 0x000000ff) + | ((x & 0x0000ff00) << 8) + | ((x & 0x00ff0000) << 16) + | ((x & 0xff000000) << 24); +} + +uint64_t clmul_8x4_packed(uint32_t n, uint32_t m) +{ + return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m)); +} diff --git a/crypto/meson.build b/crypto/meson.build index XXXXXXX..XXXXXXX 100644 --- a/crypto/meson.build +++ b/crypto/meson.build @@ -XXX,XX +XXX,XX @@ if have_afalg endif crypto_ss.add(when: gnutls, if_true: files('tls-cipher-suites.c')) -util_ss.add(files('sm4.c')) -util_ss.add(files('aes.c')) -util_ss.add(files('init.c')) +util_ss.add(files( + 'aes.c', + 'clmul.c', + 'init.c', + 'sm4.c', +)) if gnutls.found() util_ss.add(gnutls) endif -- 2.34.1
Use generic routines for 8-bit carry-less multiply. Remove our local version of pmull_h. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_internal.h | 5 ---- target/arm/tcg/mve_helper.c | 8 ++---- target/arm/tcg/vec_helper.c | 53 ++++------------------------------- 3 files changed, 9 insertions(+), 57 deletions(-) diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_internal.h +++ b/target/arm/tcg/vec_internal.h @@ -XXX,XX +XXX,XX @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); -/* - * 8 x 8 -> 16 vector polynomial multiply where the inputs are - * in the low 8 bits of each 16-bit element -*/ -uint64_t pmull_h(uint64_t op1, uint64_t op2); /* * 16 x 16 -> 32 vector polynomial multiply where the inputs are * in the low 16 bits of each 32-bit element diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/mve_helper.c +++ b/target/arm/tcg/mve_helper.c @@ -XXX,XX +XXX,XX @@ #include "exec/exec-all.h" #include "tcg/tcg.h" #include "fpu/softfloat.h" +#include "crypto/clmul.h" static uint16_t mve_eci_mask(CPUARMState *env) { @@ -XXX,XX +XXX,XX @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) * Polynomial multiply. We can always do this generating 64 bits * of the result at a time, so we don't need to use DO_2OP_L. */ -#define VMULLPH_MASK 0x00ff00ff00ff00ffULL #define VMULLPW_MASK 0x0000ffff0000ffffULL -#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK) -#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8) #define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) #define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) -DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH) -DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH) +DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even) +DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd) DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -XXX,XX +XXX,XX @@ #include "tcg/tcg-gvec-desc.h" #include "fpu/softfloat.h" #include "qemu/int128.h" +#include "crypto/clmul.h" #include "vec_internal.h" /* @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) */ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) { - intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t i, opr_sz = simd_oprsz(desc); uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = n[i]; - uint64_t mm = m[i]; - uint64_t rr = 0; - - for (j = 0; j < 8; ++j) { - uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; - rr ^= mm & mask; - mm = (mm << 1) & 0xfefefefefefefefeull; - nn >>= 1; - } - d[i] = rr; + d[i] = clmul_8x8_low(n[i], m[i]); } clear_tail(d, opr_sz, simd_maxsz(desc)); } @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) clear_tail(d, opr_sz, simd_maxsz(desc)); } -/* - * 8x8->16 polynomial multiply. - * - * The byte inputs are expanded to (or extracted from) half-words. - * Note that neon and sve2 get the inputs from different positions. 
- * This allows 4 bytes to be processed in parallel with uint64_t. - */ - -static uint64_t expand_byte_to_half(uint64_t x) -{ - return (x & 0x000000ff) - | ((x & 0x0000ff00) << 8) - | ((x & 0x00ff0000) << 16) - | ((x & 0xff000000) << 24); -} - uint64_t pmull_w(uint64_t op1, uint64_t op2) { uint64_t result = 0; @@ -XXX,XX +XXX,XX @@ uint64_t pmull_w(uint64_t op1, uint64_t op2) return result; } -uint64_t pmull_h(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 8; ++i) { - uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) { int hi = simd_data(desc); uint64_t *d = vd, *n = vn, *m = vm; uint64_t nn = n[hi], mm = m[hi]; - d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + d[0] = clmul_8x4_packed(nn, mm); nn >>= 32; mm >>= 32; - d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + d[1] = clmul_8x4_packed(nn, mm); clear_tail(d, 16, simd_maxsz(desc)); } @@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; - uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; - - d[i] = pmull_h(nn, mm); + d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); } } -- 2.34.1
Use generic routines for 8-bit carry-less multiply. Remove our local version of galois_multiply8. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/s390x/tcg/vec_int_helper.c | 32 ++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -XXX,XX +XXX,XX @@ #include "vec.h" #include "exec/helper-proto.h" #include "tcg/tcg-gvec-desc.h" +#include "crypto/clmul.h" static bool s390_vec_is_zero(const S390Vector *v) { @@ -XXX,XX +XXX,XX @@ static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \ } \ return res; \ } -DEF_GALOIS_MULTIPLY(8, 16) DEF_GALOIS_MULTIPLY(16, 32) DEF_GALOIS_MULTIPLY(32, 64) @@ -XXX,XX +XXX,XX @@ static S390Vector galois_multiply64(uint64_t a, uint64_t b) return res; } +/* + * There is no carry across the two doublewords, so their order does + * not matter. Nor is there partial overlap between registers. + */ +static inline uint64_t do_gfma8(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_8x4_even(n, m) ^ clmul_8x4_odd(n, m) ^ a; +} + +void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + + q1[0] = do_gfma8(q2[0], q3[0], 0); + q1[1] = do_gfma8(q2[1], q3[1], 0); +} + +void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t desc) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma8(q2[0], q3[0], q4[0]); + q1[1] = do_gfma8(q2[1], q3[1], q4[1]); +} + #define DEF_VGFM(BITS, TBITS) \ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ uint32_t desc) \ @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ s390_vec_write_element##TBITS(v1, i, d); \ } \ } -DEF_VGFM(8, 16) DEF_VGFM(16, 32) DEF_VGFM(32, 64) @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \ s390_vec_write_element##TBITS(v1, i, d); \ } \ } -DEF_VGFMA(8, 16) DEF_VGFMA(16, 32) DEF_VGFMA(32, 64) -- 2.34.1
Use generic routines for 8-bit carry-less multiply. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/ppc/int_helper.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -XXX,XX +XXX,XX @@ #include "exec/helper-proto.h" #include "crypto/aes.h" #include "crypto/aes-round.h" +#include "crypto/clmul.h" #include "fpu/softfloat.h" #include "qapi/error.h" #include "qemu/guest-random.h" @@ -XXX,XX +XXX,XX @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) #undef VBPERMQ_INDEX #undef VBPERMQ_DW +/* + * There is no carry across the two doublewords, so their order does + * not matter. Nor is there partial overlap between registers. + */ +void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) +{ + for (int i = 0; i < 2; ++i) { + uint64_t aa = a->u64[i], bb = b->u64[i]; + r->u64[i] = clmul_8x4_even(aa, bb) ^ clmul_8x4_odd(aa, bb); + } +} + #define PMSUM(name, srcfld, trgfld, trgtyp) \ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ { \ @@ -XXX,XX +XXX,XX @@ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ } \ } -PMSUM(vpmsumb, u8, u16, uint16_t) PMSUM(vpmsumh, u16, u32, uint32_t) PMSUM(vpmsumw, u32, u64, uint64_t) -- 2.34.1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/crypto/clmul.h | 16 ++++++++++++++++ crypto/clmul.c | 21 +++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h index XXXXXXX..XXXXXXX 100644 --- a/include/crypto/clmul.h +++ b/include/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ uint64_t clmul_8x4_odd(uint64_t, uint64_t); */ uint64_t clmul_8x4_packed(uint32_t, uint32_t); +/** + * clmul_16x2_even: + * + * Perform two 16x16->32 carry-less multiplies. + * The odd words of the inputs are ignored. + */ +uint64_t clmul_16x2_even(uint64_t, uint64_t); + +/** + * clmul_16x2_odd: + * + * Perform two 16x16->32 carry-less multiplies. + * The even words of the inputs are ignored. + */ +uint64_t clmul_16x2_odd(uint64_t, uint64_t); + #endif /* CRYPTO_CLMUL_H */ diff --git a/crypto/clmul.c b/crypto/clmul.c index XXXXXXX..XXXXXXX 100644 --- a/crypto/clmul.c +++ b/crypto/clmul.c @@ -XXX,XX +XXX,XX @@ uint64_t clmul_8x4_packed(uint32_t n, uint32_t m) { return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m)); } + +uint64_t clmul_16x2_even(uint64_t n, uint64_t m) +{ + uint64_t r = 0; + + n &= 0x0000ffff0000ffffull; + m &= 0x0000ffff0000ffffull; + + for (int i = 0; i < 16; ++i) { + uint64_t mask = (n & 0x0000000100000001ull) * 0xffffffffull; + r ^= m & mask; + n >>= 1; + m <<= 1; + } + return r; +} + +uint64_t clmul_16x2_odd(uint64_t n, uint64_t m) +{ + return clmul_16x2_even(n >> 16, m >> 16); +} -- 2.34.1
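As a usage note for the new API: clmul_16x2_even operates on the 16-bit values in the low half of each 32-bit word and deposits each 32-bit product in that word's position, while clmul_16x2_odd does the same for the high halves (it simply shifts the inputs right by 16). A small standalone sketch of the expected result layout, using a local reference rather than the QEMU routines:

#include <stdint.h>
#include <stdio.h>

/* Naive 16x16->32 carry-less multiply. */
static uint32_t clmul_16_ref(uint16_t n, uint16_t m)
{
    uint32_t r = 0;

    for (int i = 0; i < 16; i++) {
        if (n & (1u << i)) {
            r ^= (uint32_t)m << i;
        }
    }
    return r;
}

/* Expected result of clmul_16x2_even(n, m): words 0 and 2 of the inputs. */
static uint64_t clmul_16x2_even_ref(uint64_t n, uint64_t m)
{
    uint64_t lo = clmul_16_ref((uint16_t)n, (uint16_t)m);
    uint64_t hi = clmul_16_ref((uint16_t)(n >> 32), (uint16_t)(m >> 32));

    return lo | (hi << 32);
}

int main(void)
{
    /* Words 0: 3 clmul 3 = 5; words 2: 5 clmul 3 = 0xf.
     * Prints 0000000f00000005. */
    printf("%016llx\n",
           (unsigned long long)clmul_16x2_even_ref(0x0000000500000003ull,
                                                   0x0000000300000003ull));
    return 0;
}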
Use generic routines for 16-bit carry-less multiply. Remove our local version of pmull_w. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_internal.h | 6 ------ target/arm/tcg/mve_helper.c | 8 ++------ target/arm/tcg/vec_helper.c | 13 ------------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_internal.h +++ b/target/arm/tcg/vec_internal.h @@ -XXX,XX +XXX,XX @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); -/* - * 16 x 16 -> 32 vector polynomial multiply where the inputs are - * in the low 16 bits of each 32-bit element - */ -uint64_t pmull_w(uint64_t op1, uint64_t op2); - /** * bfdotadd: * @sum: addend diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/mve_helper.c +++ b/target/arm/tcg/mve_helper.c @@ -XXX,XX +XXX,XX @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) * Polynomial multiply. We can always do this generating 64 bits * of the result at a time, so we don't need to use DO_2OP_L. */ -#define VMULLPW_MASK 0x0000ffff0000ffffULL -#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) -#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) - DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even) DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd) -DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) -DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) +DO_2OP(vmullpbw, 8, uint64_t, clmul_16x2_even) +DO_2OP(vmullptw, 8, uint64_t, clmul_16x2_odd) /* * Because the computation type is at least twice as large as required, diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) clear_tail(d, opr_sz, simd_maxsz(desc)); } -uint64_t pmull_w(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 16; ++i) { - uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) { int hi = simd_data(desc); -- 2.34.1
Use generic routines for 16-bit carry-less multiply. Remove our local version of galois_multiply16. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/s390x/tcg/vec_int_helper.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -XXX,XX +XXX,XX @@ static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \ } \ return res; \ } -DEF_GALOIS_MULTIPLY(16, 32) DEF_GALOIS_MULTIPLY(32, 64) static S390Vector galois_multiply64(uint64_t a, uint64_t b) @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3, q1[1] = do_gfma8(q2[1], q3[1], q4[1]); } +static inline uint64_t do_gfma16(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_16x2_even(n, m) ^ clmul_16x2_odd(n, m) ^ a; +} + +void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + + q1[0] = do_gfma16(q2[0], q3[0], 0); + q1[1] = do_gfma16(q2[1], q3[1], 0); +} + +void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma16(q2[0], q3[0], q4[0]); + q1[1] = do_gfma16(q2[1], q3[1], q4[1]); +} + #define DEF_VGFM(BITS, TBITS) \ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ uint32_t desc) \ @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ s390_vec_write_element##TBITS(v1, i, d); \ } \ } -DEF_VGFM(16, 32) DEF_VGFM(32, 64) void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \ s390_vec_write_element##TBITS(v1, i, d); \ } \ } -DEF_VGFMA(16, 32) DEF_VGFMA(32, 64) void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, -- 2.34.1
Use generic routines for 16-bit carry-less multiply. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/ppc/int_helper.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -XXX,XX +XXX,XX @@ void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) } } +void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) +{ + for (int i = 0; i < 2; ++i) { + uint64_t aa = a->u64[i], bb = b->u64[i]; + r->u64[i] = clmul_16x2_even(aa, bb) ^ clmul_16x2_odd(aa, bb); + } +} + #define PMSUM(name, srcfld, trgfld, trgtyp) \ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ { \ @@ -XXX,XX +XXX,XX @@ void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ } \ } -PMSUM(vpmsumh, u16, u32, uint32_t) PMSUM(vpmsumw, u32, u64, uint64_t) void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) -- 2.34.1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/crypto/clmul.h | 7 +++++++ crypto/clmul.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h index XXXXXXX..XXXXXXX 100644 --- a/include/crypto/clmul.h +++ b/include/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_even(uint64_t, uint64_t); */ uint64_t clmul_16x2_odd(uint64_t, uint64_t); +/** + * clmul_32: + * + * Perform a 32x32->64 carry-less multiply. + */ +uint64_t clmul_32(uint32_t, uint32_t); + #endif /* CRYPTO_CLMUL_H */ diff --git a/crypto/clmul.c b/crypto/clmul.c index XXXXXXX..XXXXXXX 100644 --- a/crypto/clmul.c +++ b/crypto/clmul.c @@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_odd(uint64_t n, uint64_t m) { return clmul_16x2_even(n >> 16, m >> 16); } + +uint64_t clmul_32(uint32_t n, uint32_t m32) +{ + uint64_t r = 0; + uint64_t m = m32; + + for (int i = 0; i < 32; ++i) { + r ^= n & 1 ? m : 0; + n >>= 1; + m <<= 1; + } + return r; +} -- 2.34.1
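One worked example of why "multiply" here means XOR of shifted partial products: 3 * 3 in ordinary arithmetic is 9, but the carry-less product of 0b11 and 0b11 is 0b101 = 5, because the two middle partial products cancel under XOR. A tiny standalone illustration (a local reference loop of the same shape as clmul_32, not the QEMU routine itself):

#include <stdint.h>
#include <stdio.h>

/* XOR together the shifted copies of m selected by the set bits of n. */
static uint64_t clmul_32_ref(uint32_t n, uint32_t m)
{
    uint64_t r = 0;

    for (int i = 0; i < 32; i++) {
        if (n & (1u << i)) {
            r ^= (uint64_t)m << i;
        }
    }
    return r;
}

int main(void)
{
    /* (x + 1)(x + 1) = x^2 + 1 over GF(2), so this prints 5, not 9. */
    printf("%llu\n", (unsigned long long)clmul_32_ref(3, 3));
    return 0;
}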
Use generic routines for 32-bit carry-less multiply. Remove our local version of pmull_d. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_helper.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) } } -static uint64_t pmull_d(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - - for (i = 0; i < 32; ++i) { - uint64_t mask = -((op1 >> i) & 1); - result ^= (op2 << i) & mask; - } - return result; -} - void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) { intptr_t sel = H4(simd_data(desc)); @@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) uint64_t *d = vd; for (i = 0; i < opr_sz / 8; ++i) { - d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); + d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); } } #endif -- 2.34.1
Use generic routines for 32-bit carry-less multiply. Remove our local version of galois_multiply32. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/s390x/tcg/vec_int_helper.c | 75 +++++++++---------------------- 1 file changed, 22 insertions(+), 53 deletions(-) diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -XXX,XX +XXX,XX @@ DEF_VCTZ(8) DEF_VCTZ(16) /* like binary multiplication, but XOR instead of addition */ -#define DEF_GALOIS_MULTIPLY(BITS, TBITS) \ -static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \ - uint##TBITS##_t b) \ -{ \ - uint##TBITS##_t res = 0; \ - \ - while (b) { \ - if (b & 0x1) { \ - res = res ^ a; \ - } \ - a = a << 1; \ - b = b >> 1; \ - } \ - return res; \ -} -DEF_GALOIS_MULTIPLY(32, 64) static S390Vector galois_multiply64(uint64_t a, uint64_t b) { @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3, q1[1] = do_gfma16(q2[1], q3[1], q4[1]); } -#define DEF_VGFM(BITS, TBITS) \ -void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ - uint32_t desc) \ -{ \ - int i; \ - \ - for (i = 0; i < (128 / TBITS); i++) { \ - uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \ - uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \ - uint##TBITS##_t d = galois_multiply##BITS(a, b); \ - \ - a = s390_vec_read_element##BITS(v2, i * 2 + 1); \ - b = s390_vec_read_element##BITS(v3, i * 2 + 1); \ - d = d ^ galois_multiply32(a, b); \ - s390_vec_write_element##TBITS(v1, i, d); \ - } \ +static inline uint64_t do_gfma32(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_32(n, m) ^ clmul_32(n >> 32, m >> 32) ^ a; +} + +void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + + q1[0] = do_gfma32(q2[0], q3[0], 0); + q1[1] = do_gfma32(q2[1], q3[1], 0); +} + +void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma32(q2[0], q3[0], q4[0]); + q1[1] = do_gfma32(q2[1], q3[1], q4[1]); } -DEF_VGFM(32, 64) void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, uint32_t desc) @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, s390_vec_xor(v1, &tmp1, &tmp2); } -#define DEF_VGFMA(BITS, TBITS) \ -void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \ - const void *v4, uint32_t desc) \ -{ \ - int i; \ - \ - for (i = 0; i < (128 / TBITS); i++) { \ - uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \ - uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \ - uint##TBITS##_t d = galois_multiply##BITS(a, b); \ - \ - a = s390_vec_read_element##BITS(v2, i * 2 + 1); \ - b = s390_vec_read_element##BITS(v3, i * 2 + 1); \ - d = d ^ galois_multiply32(a, b); \ - d = d ^ s390_vec_read_element##TBITS(v4, i); \ - s390_vec_write_element##TBITS(v1, i, d); \ - } \ -} -DEF_VGFMA(32, 64) - void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, const void *v4, uint32_t desc) { -- 2.34.1
Use generic routines for 32-bit carry-less multiply. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/ppc/int_helper.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -XXX,XX +XXX,XX @@ void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) } } -#define PMSUM(name, srcfld, trgfld, trgtyp) \ -void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ -{ \ - int i, j; \ - trgtyp prod[sizeof(ppc_avr_t) / sizeof(a->srcfld[0])]; \ - \ - VECTOR_FOR_INORDER_I(i, srcfld) { \ - prod[i] = 0; \ - for (j = 0; j < sizeof(a->srcfld[0]) * 8; j++) { \ - if (a->srcfld[i] & (1ull << j)) { \ - prod[i] ^= ((trgtyp)b->srcfld[i] << j); \ - } \ - } \ - } \ - \ - VECTOR_FOR_INORDER_I(i, trgfld) { \ - r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1]; \ - } \ +void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) +{ + for (int i = 0; i < 2; ++i) { + uint64_t aa = a->u64[i], bb = b->u64[i]; + r->u64[i] = clmul_32(aa, bb) ^ clmul_32(aa >> 32, bb >> 32); + } } -PMSUM(vpmsumw, u32, u64, uint64_t) - void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int i, j; -- 2.34.1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- host/include/generic/host/crypto/clmul.h | 15 +++++++++++++++ include/crypto/clmul.h | 19 +++++++++++++++++++ crypto/clmul.c | 18 ++++++++++++++++++ 3 files changed, 52 insertions(+) create mode 100644 host/include/generic/host/crypto/clmul.h diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/host/include/generic/host/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ +/* + * No host specific carry-less multiply acceleration. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef GENERIC_HOST_CRYPTO_CLMUL_H +#define GENERIC_HOST_CRYPTO_CLMUL_H + +#define HAVE_CLMUL_ACCEL false +#define ATTR_CLMUL_ACCEL + +Int128 clmul_64_accel(uint64_t, uint64_t) + QEMU_ERROR("unsupported accel"); + +#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */ diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h index XXXXXXX..XXXXXXX 100644 --- a/include/crypto/clmul.h +++ b/include/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ #ifndef CRYPTO_CLMUL_H #define CRYPTO_CLMUL_H +#include "qemu/int128.h" +#include "host/crypto/clmul.h" + /** * clmul_8x8_low: * @@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_odd(uint64_t, uint64_t); */ uint64_t clmul_32(uint32_t, uint32_t); +/** + * clmul_64: + * + * Perform a 64x64->128 carry-less multiply. + */ +Int128 clmul_64_gen(uint64_t, uint64_t); + +static inline Int128 clmul_64(uint64_t a, uint64_t b) +{ + if (HAVE_CLMUL_ACCEL) { + return clmul_64_accel(a, b); + } else { + return clmul_64_gen(a, b); + } +} + #endif /* CRYPTO_CLMUL_H */ diff --git a/crypto/clmul.c b/crypto/clmul.c index XXXXXXX..XXXXXXX 100644 --- a/crypto/clmul.c +++ b/crypto/clmul.c @@ -XXX,XX +XXX,XX @@ uint64_t clmul_32(uint32_t n, uint32_t m32) } return r; } + +Int128 clmul_64_gen(uint64_t n, uint64_t m) +{ + uint64_t rl = 0, rh = 0; + + /* Bit 0 can only influence the low 64-bit result. */ + if (n & 1) { + rl = m; + } + + for (int i = 1; i < 64; ++i) { + uint64_t mask = -(n & 1); + rl ^= (m << i) & mask; + rh ^= (m >> (64 - i)) & mask; + n >>= 1; + } + return int128_make128(rl, rh); +} -- 2.34.1
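The split into rl/rh in clmul_64_gen exists because a 64x64 carry-less product straddles the 64-bit boundary: bit i of n contributes m << i to the low half and m >> (64 - i) to the high half, and only bit 0 contributes nothing to the high half. A minimal standalone illustration, using the compiler's unsigned __int128 as a stand-in for QEMU's Int128 (assumes GCC or Clang):

#include <stdint.h>
#include <stdio.h>

/* Naive 64x64->128 carry-less multiply on unsigned __int128. */
static unsigned __int128 clmul_64_ref(uint64_t n, uint64_t m)
{
    unsigned __int128 r = 0;

    for (int i = 0; i < 64; i++) {
        if (n & (1ull << i)) {
            r ^= (unsigned __int128)m << i;
        }
    }
    return r;
}

int main(void)
{
    /* x^63 * x^1 = x^64: the product lands entirely in the high half.
     * Prints high=0000000000000001 low=0000000000000000. */
    unsigned __int128 r = clmul_64_ref(1ull << 63, 1ull << 1);

    printf("high=%016llx low=%016llx\n",
           (unsigned long long)(r >> 64), (unsigned long long)r);
    return 0;
}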
Use generic routine for 64-bit carry-less multiply. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/tcg/vec_helper.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) */ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) { - intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t i, opr_sz = simd_oprsz(desc); intptr_t hi = simd_data(desc); uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; i += 2) { - uint64_t nn = n[i + hi]; - uint64_t mm = m[i + hi]; - uint64_t rhi = 0; - uint64_t rlo = 0; - - /* Bit 0 can only influence the low 64-bit result. */ - if (nn & 1) { - rlo = mm; - } - - for (j = 1; j < 64; ++j) { - uint64_t mask = -((nn >> j) & 1); - rlo ^= (mm << j) & mask; - rhi ^= (mm >> (64 - j)) & mask; - } - d[i] = rlo; - d[i + 1] = rhi; + Int128 r = clmul_64(n[i + hi], m[i + hi]); + d[i] = int128_getlo(r); + d[i + 1] = int128_gethi(r); } clear_tail(d, opr_sz, simd_maxsz(desc)); } -- 2.34.1
Use generic routine for 64-bit carry-less multiply. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/i386/ops_sse.h | 40 +++++++++------------------------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index XXXXXXX..XXXXXXX 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -XXX,XX +XXX,XX @@ #include "crypto/aes.h" #include "crypto/aes-round.h" +#include "crypto/clmul.h" #if SHIFT == 0 #define Reg MMXReg @@ -XXX,XX +XXX,XX @@ target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) #endif -#if SHIFT == 1 -static void clmulq(uint64_t *dest_l, uint64_t *dest_h, - uint64_t a, uint64_t b) -{ - uint64_t al, ah, resh, resl; - - ah = 0; - al = a; - resh = resl = 0; - - while (b) { - if (b & 1) { - resl ^= al; - resh ^= ah; - } - ah = (ah << 1) | (al >> 63); - al <<= 1; - b >>= 1; - } - - *dest_l = resl; - *dest_h = resh; -} -#endif - void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, uint32_t ctrl) { - uint64_t a, b; - int i; + int a_idx = (ctrl & 1) != 0; + int b_idx = (ctrl & 16) != 0; - for (i = 0; i < 1 << SHIFT; i += 2) { - a = v->Q(((ctrl & 1) != 0) + i); - b = s->Q(((ctrl & 16) != 0) + i); - clmulq(&d->Q(i), &d->Q(i + 1), a, b); + for (int i = 0; i < SHIFT; i++) { + uint64_t a = v->Q(2 * i + a_idx); + uint64_t b = s->Q(2 * i + b_idx); + Int128 *r = (Int128 *)&d->ZMM_X(i); + + *r = clmul_64(a, b); } } -- 2.34.1
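For clarity, the ctrl decode above follows the PCLMULQDQ imm8 encoding: bit 0 selects which quadword of the first source (v) is used within each 128-bit lane, and bit 4 selects the quadword of the second source (s). A small sketch of that decode (hypothetical helper, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* imm8 bit 0 -> quadword index into the first source,
 * imm8 bit 4 -> quadword index into the second source. */
static void pclmul_select(uint32_t ctrl, int *a_idx, int *b_idx)
{
    *a_idx = (ctrl & 1) != 0;
    *b_idx = (ctrl & 16) != 0;
}

int main(void)
{
    static const uint32_t imm[] = { 0x00, 0x01, 0x10, 0x11 };

    for (int i = 0; i < 4; i++) {
        int a, b;

        pclmul_select(imm[i], &a, &b);
        printf("imm8=0x%02x -> %s quadword of v, %s quadword of s\n",
               (unsigned)imm[i], a ? "high" : "low", b ? "high" : "low");
    }
    return 0;
}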
Use the generic routine for 64-bit carry-less multiply. Remove our local version of galois_multiply64. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/s390x/tcg/vec_int_helper.c | 58 +++++++------------------------ 1 file changed, 12 insertions(+), 46 deletions(-) diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -XXX,XX +XXX,XX @@ static bool s390_vec_is_zero(const S390Vector *v) return !v->doubleword[0] && !v->doubleword[1]; } -static void s390_vec_xor(S390Vector *res, const S390Vector *a, - const S390Vector *b) -{ - res->doubleword[0] = a->doubleword[0] ^ b->doubleword[0]; - res->doubleword[1] = a->doubleword[1] ^ b->doubleword[1]; -} - static void s390_vec_and(S390Vector *res, const S390Vector *a, const S390Vector *b) { @@ -XXX,XX +XXX,XX @@ DEF_VCTZ(16) /* like binary multiplication, but XOR instead of addition */ -static S390Vector galois_multiply64(uint64_t a, uint64_t b) -{ - S390Vector res = {}; - S390Vector va = { - .doubleword[1] = a, - }; - S390Vector vb = { - .doubleword[1] = b, - }; - - while (!s390_vec_is_zero(&vb)) { - if (vb.doubleword[1] & 0x1) { - s390_vec_xor(&res, &res, &va); - } - s390_vec_shl(&va, &va, 1); - s390_vec_shr(&vb, &vb, 1); - } - return res; -} - /* * There is no carry across the two doublewords, so their order does * not matter. Nor is there partial overlap between registers. @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3, void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, uint32_t desc) { - S390Vector tmp1, tmp2; - uint64_t a, b; + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + Int128 r; - a = s390_vec_read_element64(v2, 0); - b = s390_vec_read_element64(v3, 0); - tmp1 = galois_multiply64(a, b); - a = s390_vec_read_element64(v2, 1); - b = s390_vec_read_element64(v3, 1); - tmp2 = galois_multiply64(a, b); - s390_vec_xor(v1, &tmp1, &tmp2); + r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1])); + q1[0] = int128_gethi(r); + q1[1] = int128_getlo(r); } void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, const void *v4, uint32_t desc) { - S390Vector tmp1, tmp2; - uint64_t a, b; + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + Int128 r; - a = s390_vec_read_element64(v2, 0); - b = s390_vec_read_element64(v3, 0); - tmp1 = galois_multiply64(a, b); - a = s390_vec_read_element64(v2, 1); - b = s390_vec_read_element64(v3, 1); - tmp2 = galois_multiply64(a, b); - s390_vec_xor(&tmp1, &tmp1, &tmp2); - s390_vec_xor(v1, &tmp1, v4); + r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1])); + q1[0] = q4[0] ^ int128_gethi(r); + q1[1] = q4[1] ^ int128_getlo(r); } #define DEF_VMAL(BITS) \ -- 2.34.1
Use generic routine for 64-bit carry-less multiply. Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/ppc/int_helper.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index XXXXXXX..XXXXXXX 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -XXX,XX +XXX,XX @@ void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { - int i, j; - Int128 tmp, prod[2] = {int128_zero(), int128_zero()}; - - for (j = 0; j < 64; j++) { - for (i = 0; i < ARRAY_SIZE(r->u64); i++) { - if (a->VsrD(i) & (1ull << j)) { - tmp = int128_make64(b->VsrD(i)); - tmp = int128_lshift(tmp, j); - prod[i] = int128_xor(prod[i], tmp); - } - } - } - - r->s128 = int128_xor(prod[0], prod[1]); + Int128 e = clmul_64(a->u64[0], b->u64[0]); + Int128 o = clmul_64(a->u64[1], b->u64[1]); + r->s128 = int128_xor(e, o); } #if HOST_BIG_ENDIAN -- 2.34.1
Detect PCLMUL in cpuinfo; implement the accel hook. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- host/include/i386/host/cpuinfo.h | 1 + host/include/i386/host/crypto/clmul.h | 29 +++++++++++++++++++++++++ host/include/x86_64/host/crypto/clmul.h | 1 + include/qemu/cpuid.h | 3 +++ util/cpuinfo-i386.c | 1 + 5 files changed, 35 insertions(+) create mode 100644 host/include/i386/host/crypto/clmul.h create mode 100644 host/include/x86_64/host/crypto/clmul.h diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h index XXXXXXX..XXXXXXX 100644 --- a/host/include/i386/host/cpuinfo.h +++ b/host/include/i386/host/cpuinfo.h @@ -XXX,XX +XXX,XX @@ #define CPUINFO_ATOMIC_VMOVDQA (1u << 16) #define CPUINFO_ATOMIC_VMOVDQU (1u << 17) #define CPUINFO_AES (1u << 18) +#define CPUINFO_PCLMUL (1u << 19) /* Initialized with a constructor. */ extern unsigned cpuinfo; diff --git a/host/include/i386/host/crypto/clmul.h b/host/include/i386/host/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/host/include/i386/host/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ +/* + * x86 specific clmul acceleration. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef X86_HOST_CRYPTO_CLMUL_H +#define X86_HOST_CRYPTO_CLMUL_H + +#include "host/cpuinfo.h" +#include <immintrin.h> + +#if defined(__PCLMUL__) +# define HAVE_CLMUL_ACCEL true +# define ATTR_CLMUL_ACCEL +#else +# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PCLMUL) +# define ATTR_CLMUL_ACCEL __attribute__((target("pclmul"))) +#endif + +static inline Int128 ATTR_CLMUL_ACCEL +clmul_64_accel(uint64_t n, uint64_t m) +{ + union { __m128i v; Int128 s; } u; + + u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0); + return u.s; +} + +#endif /* X86_HOST_CRYPTO_CLMUL_H */ diff --git a/host/include/x86_64/host/crypto/clmul.h b/host/include/x86_64/host/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/host/include/x86_64/host/crypto/clmul.h @@ -0,0 +1 @@ +#include "host/include/i386/host/crypto/clmul.h" diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h index XXXXXXX..XXXXXXX 100644 --- a/include/qemu/cpuid.h +++ b/include/qemu/cpuid.h @@ -XXX,XX +XXX,XX @@ #endif /* Leaf 1, %ecx */ +#ifndef bit_PCLMUL +#define bit_PCLMUL (1 << 1) +#endif #ifndef bit_SSE4_1 #define bit_SSE4_1 (1 << 19) #endif diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c index XXXXXXX..XXXXXXX 100644 --- a/util/cpuinfo-i386.c +++ b/util/cpuinfo-i386.c @@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void) info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0); info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0); info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0); + info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0); /* Our AES support requires PSHUFB as well. */ info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0); -- 2.34.1
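A standalone way to sanity-check the intrinsic path against a portable bit loop, outside of QEMU (hypothetical test file, assuming an x86-64 host and something like "gcc -O2 -mpclmul"):

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

/* Portable 64x64->128 carry-less multiply, split into two halves. */
static void clmul_64_ref(uint64_t n, uint64_t m, uint64_t *lo, uint64_t *hi)
{
    uint64_t rl = 0, rh = 0;

    for (int i = 0; i < 64; i++) {
        if (n & (1ull << i)) {
            rl ^= m << i;
            rh ^= i ? m >> (64 - i) : 0;
        }
    }
    *lo = rl;
    *hi = rh;
}

int main(void)
{
    uint64_t n = 0x123456789abcdef0ull, m = 0xfedcba9876543210ull;
    uint64_t lo, hi, out[2];
    __m128i v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n),
                                     _mm_set_epi64x(0, m), 0);

    _mm_storeu_si128((__m128i *)out, v);
    clmul_64_ref(n, m, &lo, &hi);
    printf("%s\n", out[0] == lo && out[1] == hi ? "match" : "MISMATCH");
    return 0;
}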
Detect PMULL in cpuinfo; implement the accel hook. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- host/include/aarch64/host/cpuinfo.h | 1 + host/include/aarch64/host/crypto/clmul.h | 41 ++++++++++++++++++++++++ util/cpuinfo-aarch64.c | 4 ++- 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 host/include/aarch64/host/crypto/clmul.h diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h index XXXXXXX..XXXXXXX 100644 --- a/host/include/aarch64/host/cpuinfo.h +++ b/host/include/aarch64/host/cpuinfo.h @@ -XXX,XX +XXX,XX @@ #define CPUINFO_LSE (1u << 1) #define CPUINFO_LSE2 (1u << 2) #define CPUINFO_AES (1u << 3) +#define CPUINFO_PMULL (1u << 4) /* Initialized with a constructor. */ extern unsigned cpuinfo; diff --git a/host/include/aarch64/host/crypto/clmul.h b/host/include/aarch64/host/crypto/clmul.h new file mode 100644 index XXXXXXX..XXXXXXX --- /dev/null +++ b/host/include/aarch64/host/crypto/clmul.h @@ -XXX,XX +XXX,XX @@ +/* + * AArch64 specific clmul acceleration. + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef AARCH64_HOST_CRYPTO_CLMUL_H +#define AARCH64_HOST_CRYPTO_CLMUL_H + +#include "host/cpuinfo.h" +#include <arm_neon.h> + +/* + * 64x64->128 pmull is available with FEAT_PMULL. + * Both FEAT_AES and FEAT_PMULL are covered under the same macro. + */ +#ifdef __ARM_FEATURE_AES +# define HAVE_CLMUL_ACCEL true +#else +# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PMULL) +#endif +#if !defined(__ARM_FEATURE_AES) && defined(CONFIG_ARM_AES_BUILTIN) +# define ATTR_CLMUL_ACCEL __attribute__((target("+crypto"))) +#else +# define ATTR_CLMUL_ACCEL +#endif + +static inline Int128 ATTR_CLMUL_ACCEL +clmul_64_accel(uint64_t n, uint64_t m) +{ + union { poly128_t v; Int128 s; } u; + +#ifdef CONFIG_ARM_AES_BUILTIN + u.v = vmull_p64((poly64_t)n, (poly64_t)m); +#else + asm(".arch_extension aes\n\t" + "pmull %0.1q, %1.1d, %2.1d" : "=w"(u.v) : "w"(n), "w"(m)); +#endif + return u.s; +} + +#endif /* AARCH64_HOST_CRYPTO_CLMUL_H */ diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c index XXXXXXX..XXXXXXX 100644 --- a/util/cpuinfo-aarch64.c +++ b/util/cpuinfo-aarch64.c @@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void) unsigned long hwcap = qemu_getauxval(AT_HWCAP); info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0); info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0); - info |= (hwcap & HWCAP_AES ? CPUINFO_AES: 0); + info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0); + info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0); #endif #ifdef CONFIG_DARWIN info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE; info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE2") * CPUINFO_LSE2; info |= sysctl_for_bool("hw.optional.arm.FEAT_AES") * CPUINFO_AES; + info |= sysctl_for_bool("hw.optional.arm.FEAT_PMULL") * CPUINFO_PMULL; #endif cpuinfo = info; -- 2.34.1
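The same kind of standalone sanity check works for the PMULL path, mirroring the x86 sketch above (hypothetical test file, assuming a little-endian aarch64 host and something like "gcc -O2 -march=armv8-a+crypto"):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arm_neon.h>

/* Portable 64x64->128 carry-less multiply, split into two halves. */
static void clmul_64_ref(uint64_t n, uint64_t m, uint64_t *lo, uint64_t *hi)
{
    uint64_t rl = 0, rh = 0;

    for (int i = 0; i < 64; i++) {
        if (n & (1ull << i)) {
            rl ^= m << i;
            rh ^= i ? m >> (64 - i) : 0;
        }
    }
    *lo = rl;
    *hi = rh;
}

int main(void)
{
    uint64_t n = 0x123456789abcdef0ull, m = 0xfedcba9876543210ull;
    uint64_t lo, hi, out[2];
    poly128_t v = vmull_p64((poly64_t)n, (poly64_t)m);

    memcpy(out, &v, sizeof(out));
    clmul_64_ref(n, m, &lo, &hi);
    printf("%s\n", out[0] == lo && out[1] == hi ? "match" : "MISMATCH");
    return 0;
}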