The following changes since commit 005ad32358f12fe9313a4a01918a55e60d4f39e5:

  Merge tag 'pull-tpm-2023-09-12-3' of https://github.com/stefanberger/qemu-tpm into staging (2023-09-13 13:41:57 -0400)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-crypto-20230915

for you to fetch changes up to 055c99015a4ec3c608d0260592368adc604429ea:

  host/include/aarch64: Implement clmul.h (2023-09-15 13:57:00 +0000)

----------------------------------------------------------------
Unify implementation of carry-less multiply.
Accelerate carry-less multiply for 64x64->128.

----------------------------------------------------------------
Richard Henderson (19):
      crypto: Add generic 8-bit carry-less multiply routines
      target/arm: Use clmul_8* routines
      target/s390x: Use clmul_8* routines
      target/ppc: Use clmul_8* routines
      crypto: Add generic 16-bit carry-less multiply routines
      target/arm: Use clmul_16* routines
      target/s390x: Use clmul_16* routines
      target/ppc: Use clmul_16* routines
      crypto: Add generic 32-bit carry-less multiply routines
      target/arm: Use clmul_32* routines
      target/s390x: Use clmul_32* routines
      target/ppc: Use clmul_32* routines
      crypto: Add generic 64-bit carry-less multiply routine
      target/arm: Use clmul_64
      target/i386: Use clmul_64
      target/s390x: Use clmul_64
      target/ppc: Use clmul_64
      host/include/i386: Implement clmul.h
      host/include/aarch64: Implement clmul.h

 host/include/aarch64/host/cpuinfo.h      |   1 +
 host/include/aarch64/host/crypto/clmul.h |  41 +++++++
 host/include/generic/host/crypto/clmul.h |  15 +++
 host/include/i386/host/cpuinfo.h         |   1 +
 host/include/i386/host/crypto/clmul.h    |  29 +++++
 host/include/x86_64/host/crypto/clmul.h  |   1 +
 include/crypto/clmul.h                   |  83 ++++++++++++
 include/qemu/cpuid.h                     |   3 +
 target/arm/tcg/vec_internal.h            |  11 --
 target/i386/ops_sse.h                    |  40 ++-----
 crypto/clmul.c                           | 111 ++++++++++++++++++
 target/arm/tcg/mve_helper.c              |  16 +--
 target/arm/tcg/vec_helper.c              | 102 ++---------------
 target/ppc/int_helper.c                  |  64 +++++------
 target/s390x/tcg/vec_int_helper.c        | 186 ++++++++++++-----------------
 util/cpuinfo-aarch64.c                   |   4 +-
 util/cpuinfo-i386.c                      |   1 +
 crypto/meson.build                       |   9 +-
 18 files changed, 433 insertions(+), 285 deletions(-)
 create mode 100644 host/include/aarch64/host/crypto/clmul.h
 create mode 100644 host/include/generic/host/crypto/clmul.h
 create mode 100644 host/include/i386/host/crypto/clmul.h
 create mode 100644 host/include/x86_64/host/crypto/clmul.h
 create mode 100644 include/crypto/clmul.h
 create mode 100644 crypto/clmul.c

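Background for readers new to the primitive: a carry-less multiply is schoolbook
binary multiplication with the adds replaced by XOR, i.e. multiplication of
polynomials over GF(2), and it is what the guest instructions touched below
(vgfm, vpmsum*, pmull, pclmulqdq) compute.  A minimal 64x64->128 reference
model (illustration only, not code from this series; clmul64_ref is a made-up
name):

  #include <stdint.h>

  /*
   * Reference model: multiply two 64-bit polynomials over GF(2),
   * producing a 128-bit product as a low/high pair.  Partial products
   * are combined with XOR, so nothing carries between bit positions.
   */
  static void clmul64_ref(uint64_t n, uint64_t m, uint64_t *lo, uint64_t *hi)
  {
      uint64_t rl = 0, rh = 0;

      for (int i = 0; i < 64; ++i) {
          if (n & (1ull << i)) {
              rl ^= m << i;
              rh ^= i ? m >> (64 - i) : 0;
          }
      }
      *lo = rl;
      *hi = rh;
  }
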
crypto: Add generic 8-bit carry-less multiply routines

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/crypto/clmul.h | 41 +++++++++++++++++++++++++++++
 crypto/clmul.c         | 60 ++++++++++++++++++++++++++++++++++++++++++
 crypto/meson.build     |  9 ++++---
 3 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 include/crypto/clmul.h
 create mode 100644 crypto/clmul.c

diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/crypto/clmul.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Carry-less multiply operations.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef CRYPTO_CLMUL_H
+#define CRYPTO_CLMUL_H
+
+/**
+ * clmul_8x8_low:
+ *
+ * Perform eight 8x8->8 carry-less multiplies.
+ */
+uint64_t clmul_8x8_low(uint64_t, uint64_t);
+
+/**
+ * clmul_8x4_even:
+ *
+ * Perform four 8x8->16 carry-less multiplies.
+ * The odd bytes of the inputs are ignored.
+ */
+uint64_t clmul_8x4_even(uint64_t, uint64_t);
+
+/**
+ * clmul_8x4_odd:
+ *
+ * Perform four 8x8->16 carry-less multiplies.
+ * The even bytes of the inputs are ignored.
+ */
+uint64_t clmul_8x4_odd(uint64_t, uint64_t);
+
+/**
+ * clmul_8x4_packed:
+ *
+ * Perform four 8x8->16 carry-less multiplies.
+ */
+uint64_t clmul_8x4_packed(uint32_t, uint32_t);
+
+#endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/crypto/clmul.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Carry-less multiply operations.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/clmul.h"
+
+uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
+{
+    uint64_t r = 0;
+
+    for (int i = 0; i < 8; ++i) {
+        uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
...
+        n >>= 1;
+    }
+    return r;
+}
+
+static uint64_t clmul_8x4_even_int(uint64_t n, uint64_t m)
+{
+    uint64_t r = 0;
+
+    for (int i = 0; i < 8; ++i) {
+        uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
+        r ^= m & mask;
+        n >>= 1;
+        m <<= 1;
+    }
+    return r;
+}
+
+uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
+{
+    n &= 0x00ff00ff00ff00ffull;
+    m &= 0x00ff00ff00ff00ffull;
+    return clmul_8x4_even_int(n, m);
+}
+
+uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
+{
+    return clmul_8x4_even(n >> 8, m >> 8);
+}
+
+static uint64_t unpack_8_to_16(uint64_t x)
+{
+    return  (x & 0x000000ff)
+         | ((x & 0x0000ff00) << 8)
+         | ((x & 0x00ff0000) << 16)
+         | ((x & 0xff000000) << 24);
+}
+
+uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
+{
+    return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
+}
diff --git a/crypto/meson.build b/crypto/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/crypto/meson.build
+++ b/crypto/meson.build
...

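One bit trick in clmul_8x4_even_int() above is worth unpacking: the AND keeps
only bit 0 of each 16-bit lane of n, and the multiply by 0xffff widens each
surviving bit into a full-lane 0x0000/0xffff mask, so all four products
advance one bit per loop iteration in parallel.  A standalone demonstration
(hypothetical test program, not part of the patch):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      uint64_t n = 0x0001000000010000ull;   /* 16-bit lanes, low to high: 0, 1, 0, 1 */
      /* Keep bit 0 of each lane, then widen it into a full-lane mask. */
      uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;

      printf("%016" PRIx64 "\n", mask);     /* prints ffff0000ffff0000 */
      return 0;
  }
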
target/arm: Use clmul_8* routines

Use generic routines for 8-bit carry-less multiply.
Remove our local version of pmull_h.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_internal.h |  5 ----
 target/arm/tcg/mve_helper.c   |  8 ++----
 target/arm/tcg/vec_helper.c   | 53 ++++------------------------------
 3 files changed, 9 insertions(+), 57 deletions(-)

diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h
index XXXXXXX..XXXXXXX 100644
--- a/target/arm/tcg/vec_internal.h
+++ b/target/arm/tcg/vec_internal.h
...
-
 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     int hi = simd_data(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
     uint64_t nn = n[hi], mm = m[hi];
 
-    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    d[0] = clmul_8x4_packed(nn, mm);
     nn >>= 32;
     mm >>= 32;
-    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    d[1] = clmul_8x4_packed(nn, mm);
 
     clear_tail(d, 16, simd_maxsz(desc));
 }
@@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
     uint64_t *d = vd, *n = vn, *m = vm;
 
     for (i = 0; i < opr_sz / 8; ++i) {
-        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
-        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
-
-        d[i] = pmull_h(nn, mm);
+        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
     }
 }
 
--
2.34.1

target/s390x: Use clmul_8* routines

Use generic routines for 8-bit carry-less multiply.
Remove our local version of galois_multiply8.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/vec_int_helper.c | 32 ++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
...
@@ -XXX,XX +XXX,XX @@ static S390Vector galois_multiply64(uint64_t a, uint64_t b)
     return res;
 }
 
+/*
+ * There is no carry across the two doublewords, so their order does
+ * not matter.  Nor is there partial overlap between registers.
+ */
+static inline uint64_t do_gfma8(uint64_t n, uint64_t m, uint64_t a)
+{
+    return clmul_8x4_even(n, m) ^ clmul_8x4_odd(n, m) ^ a;
+}
+
+void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma8(q2[0], q3[0], 0);
+    q1[1] = do_gfma8(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
+                         const void *v4, uint32_t desc)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma8(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma8(q2[1], q3[1], q4[1]);
+}
+
 #define DEF_VGFM(BITS, TBITS)                                              \
 void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3,     \
                              uint32_t desc)                                \
...
 DEF_VGFMA(16, 32)
 DEF_VGFMA(32, 64)
 
--
2.34.1

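The even/odd split in do_gfma8() deserves a note: as the removed DEF_VGFM
macro shows, vgfm multiplies each pair of adjacent elements (indices 2i and
2i+1) and XORs the two products into one double-width result element, which
per doubleword is exactly clmul_8x4_even(n, m) ^ clmul_8x4_odd(n, m).  A
scalar cross-check (hypothetical program; clmul8 and vgfm8_model are made-up
names, not code from the series):

  #include <stdint.h>

  /* One 8x8->16 carry-less product, bit by bit. */
  static uint16_t clmul8(uint8_t a, uint8_t b)
  {
      uint16_t r = 0;

      for (int i = 0; i < 8; ++i) {
          if (a & (1u << i)) {
              r ^= (uint16_t)(b << i);
          }
      }
      return r;
  }

  /* Model of do_gfma8() for one doubleword: 16-bit lane k of the result
   * is the XOR of the products of its even (low) and odd (high) bytes. */
  static uint64_t vgfm8_model(uint64_t n, uint64_t m)
  {
      uint64_t r = 0;

      for (int k = 0; k < 4; ++k) {
          uint8_t ne = n >> (16 * k), me = m >> (16 * k);
          uint8_t no = n >> (16 * k + 8), mo = m >> (16 * k + 8);

          r |= (uint64_t)(clmul8(ne, me) ^ clmul8(no, mo)) << (16 * k);
      }
      return r;
  }

  int main(void)
  {
      /* Lane 0: even byte 0x01 times 0x02, odd byte 0x02 times 0x03. */
      return vgfm8_model(0x0201, 0x0302) ==
             (uint64_t)(clmul8(1, 2) ^ clmul8(2, 3)) ? 0 : 1;
  }
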
target/ppc: Use clmul_8* routines

Use generic routines for 8-bit carry-less multiply.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/int_helper.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
...
 #include "qemu/guest-random.h"
@@ -XXX,XX +XXX,XX @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 #undef VBPERMQ_INDEX
 #undef VBPERMQ_DW
 
+/*
+ * There is no carry across the two doublewords, so their order does
+ * not matter.  Nor is there partial overlap between registers.
+ */
+void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+    for (int i = 0; i < 2; ++i) {
+        uint64_t aa = a->u64[i], bb = b->u64[i];
+        r->u64[i] = clmul_8x4_even(aa, bb) ^ clmul_8x4_odd(aa, bb);
+    }
+}
+
 #define PMSUM(name, srcfld, trgfld, trgtyp)                   \
 void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)  \
 {                                                             \
...

crypto: Add generic 16-bit carry-less multiply routines

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/crypto/clmul.h | 16 ++++++++++++++++
 crypto/clmul.c         | 21 +++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index XXXXXXX..XXXXXXX 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_8x4_odd(uint64_t, uint64_t);
  */
 uint64_t clmul_8x4_packed(uint32_t, uint32_t);
 
+/**
+ * clmul_16x2_even:
+ *
+ * Perform two 16x16->32 carry-less multiplies.
+ * The odd words of the inputs are ignored.
+ */
+uint64_t clmul_16x2_even(uint64_t, uint64_t);
+
+/**
+ * clmul_16x2_odd:
+ *
+ * Perform two 16x16->32 carry-less multiplies.
+ * The even words of the inputs are ignored.
+ */
+uint64_t clmul_16x2_odd(uint64_t, uint64_t);
+
 #endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index XXXXXXX..XXXXXXX 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
 {
     return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
 }
+
+uint64_t clmul_16x2_even(uint64_t n, uint64_t m)
+{
+    uint64_t r = 0;
+
+    n &= 0x0000ffff0000ffffull;
+    m &= 0x0000ffff0000ffffull;
...
+        m <<= 1;
+    }
+    return r;
+}
+
+uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
+{
+    return clmul_16x2_even(n >> 16, m >> 16);
+}
--
2.34.1

target/arm: Use clmul_16* routines

Use generic routines for 16-bit carry-less multiply.
Remove our local version of pmull_w.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_internal.h |  6 ------
 target/arm/tcg/mve_helper.c   |  8 ++------
 target/arm/tcg/vec_helper.c   | 13 -------------
...
 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     int hi = simd_data(desc);
--
2.34.1

target/s390x: Use clmul_16* routines

Use generic routines for 16-bit carry-less multiply.
Remove our local version of galois_multiply16.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/vec_int_helper.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
...
-DEF_GALOIS_MULTIPLY(16, 32)
 DEF_GALOIS_MULTIPLY(32, 64)
 
 static S390Vector galois_multiply64(uint64_t a, uint64_t b)
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
     q1[1] = do_gfma8(q2[1], q3[1], q4[1]);
 }
 
+static inline uint64_t do_gfma16(uint64_t n, uint64_t m, uint64_t a)
+{
+    return clmul_16x2_even(n, m) ^ clmul_16x2_odd(n, m) ^ a;
+}
+
+void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma16(q2[0], q3[0], 0);
+    q1[1] = do_gfma16(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
+                          const void *v4, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma16(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma16(q2[1], q3[1], q4[1]);
+}
+
 #define DEF_VGFM(BITS, TBITS)                                              \
 void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3,     \
                              uint32_t desc)                                \
...
 DEF_VGFMA(32, 64)
 
 void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
--
2.34.1

target/ppc: Use clmul_16* routines

Use generic routines for 16-bit carry-less multiply.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/int_helper.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -XXX,XX +XXX,XX @@ void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
     }
 }
 
+void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+    for (int i = 0; i < 2; ++i) {
+        uint64_t aa = a->u64[i], bb = b->u64[i];
+        r->u64[i] = clmul_16x2_even(aa, bb) ^ clmul_16x2_odd(aa, bb);
+    }
+}
+
 #define PMSUM(name, srcfld, trgfld, trgtyp)                   \
 void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)  \
 {                                                             \
...

crypto: Add generic 32-bit carry-less multiply routines

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/crypto/clmul.h |  7 +++++++
 crypto/clmul.c         | 13 +++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index XXXXXXX..XXXXXXX 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_even(uint64_t, uint64_t);
  */
 uint64_t clmul_16x2_odd(uint64_t, uint64_t);
 
+/**
+ * clmul_32:
+ *
+ * Perform a 32x32->64 carry-less multiply.
+ */
+uint64_t clmul_32(uint32_t, uint32_t);
+
 #endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index XXXXXXX..XXXXXXX 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
 {
     return clmul_16x2_even(n >> 16, m >> 16);
 }
+
+uint64_t clmul_32(uint32_t n, uint32_t m32)
+{
+    uint64_t r = 0;
+    uint64_t m = m32;
+
+    for (int i = 0; i < 32; ++i) {
+        r ^= n & 1 ? m : 0;
+        n >>= 1;
+        m <<= 1;
+    }
+    return r;
+}
--
2.34.1

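A quick sanity check of the bit-serial loop in clmul_32(): over GF(2),
(x+1)^2 = x^2 + 1, so the carry-less product of 3 and 3 is 5 where integer
multiplication would give 9.  A hypothetical standalone check restating the
same algorithm (clmul32_ref is a made-up name):

  #include <assert.h>
  #include <stdint.h>

  /* Same bit-serial algorithm as clmul_32 above, restated standalone. */
  static uint64_t clmul32_ref(uint32_t n, uint32_t m32)
  {
      uint64_t r = 0, m = m32;

      for (int i = 0; i < 32; ++i) {
          r ^= (n & 1) ? m : 0;
          n >>= 1;
          m <<= 1;
      }
      return r;
  }

  int main(void)
  {
      assert(clmul32_ref(3, 3) == 5);    /* (x+1)^2 = x^2 + 1 */
      assert(clmul32_ref(3, 5) == 15);   /* no overlapping terms here */
      return 0;
  }
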
target/arm: Use clmul_32* routines

Use generic routines for 32-bit carry-less multiply.
Remove our local version of pmull_d.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_helper.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

...
     }
 }
 #endif
--
2.34.1

target/s390x: Use clmul_32* routines

Use generic routines for 32-bit carry-less multiply.
Remove our local version of galois_multiply32.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/vec_int_helper.c | 75 +++++++++----------------------
 1 file changed, 22 insertions(+), 53 deletions(-)

diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index XXXXXXX..XXXXXXX 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
...
-DEF_GALOIS_MULTIPLY(32, 64)
 
 static S390Vector galois_multiply64(uint64_t a, uint64_t b)
 {
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
     q1[1] = do_gfma16(q2[1], q3[1], q4[1]);
 }
 
-#define DEF_VGFM(BITS, TBITS)                                              \
-void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3,     \
-                             uint32_t desc)                                \
...
-        a = s390_vec_read_element##BITS(v2, i * 2 + 1);                    \
-        b = s390_vec_read_element##BITS(v3, i * 2 + 1);                    \
-        d = d ^ galois_multiply32(a, b);                                   \
-        s390_vec_write_element##TBITS(v1, i, d);                           \
-    }                                                                      \
+static inline uint64_t do_gfma32(uint64_t n, uint64_t m, uint64_t a)
+{
+    return clmul_32(n, m) ^ clmul_32(n >> 32, m >> 32) ^ a;
+}
+
+void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma32(q2[0], q3[0], 0);
+    q1[1] = do_gfma32(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
+                          const void *v4, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma32(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma32(q2[1], q3[1], q4[1]);
 }
-DEF_VGFM(32, 64)
 
 void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
                          uint32_t desc)
...
 void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
                           const void *v4, uint32_t desc)
 {
--
2.34.1

114 | diff view generated by jsdifflib |
1 | Use generic routines for 32-bit carry-less multiply. | 1 | Use generic routines for 32-bit carry-less multiply. |
---|---|---|---|
2 | 2 | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
4 | --- | 4 | --- |
5 | target/ppc/int_helper.c | 27 +++++++-------------------- | 5 | target/ppc/int_helper.c | 26 ++++++-------------------- |
6 | 1 file changed, 7 insertions(+), 20 deletions(-) | 6 | 1 file changed, 6 insertions(+), 20 deletions(-) |
7 | 7 | ||
8 | diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c | 8 | diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c |
9 | index XXXXXXX..XXXXXXX 100644 | 9 | index XXXXXXX..XXXXXXX 100644 |
10 | --- a/target/ppc/int_helper.c | 10 | --- a/target/ppc/int_helper.c |
11 | +++ b/target/ppc/int_helper.c | 11 | +++ b/target/ppc/int_helper.c |
12 | @@ -XXX,XX +XXX,XX @@ void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) | 12 | @@ -XXX,XX +XXX,XX @@ void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) |
13 | r->s128 = int128_xor(e, o); | 13 | } |
14 | } | 14 | } |
15 | 15 | ||
16 | -#define PMSUM(name, srcfld, trgfld, trgtyp) \ | 16 | -#define PMSUM(name, srcfld, trgfld, trgtyp) \ |
17 | -void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ | 17 | -void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ |
18 | -{ \ | 18 | -{ \ |
... | ... | ||
31 | - VECTOR_FOR_INORDER_I(i, trgfld) { \ | 31 | - VECTOR_FOR_INORDER_I(i, trgfld) { \ |
32 | - r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1]; \ | 32 | - r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1]; \ |
33 | - } \ | 33 | - } \ |
34 | +void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) | 34 | +void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) |
35 | +{ | 35 | +{ |
36 | + Int128 ia = a->s128; | 36 | + for (int i = 0; i < 2; ++i) { |
37 | + Int128 ib = b->s128; | 37 | + uint64_t aa = a->u64[i], bb = b->u64[i]; |
38 | + Int128 e = clmul_32x2_even(ia, ib); | 38 | + r->u64[i] = clmul_32(aa, bb) ^ clmul_32(aa >> 32, bb >> 32); |
39 | + Int128 o = clmul_32x2_odd(ia, ib); | 39 | + } |
40 | + r->s128 = int128_xor(e, o); | ||
41 | } | 40 | } |
42 | 41 | ||
43 | -PMSUM(vpmsumw, u32, u64, uint64_t) | 42 | -PMSUM(vpmsumw, u32, u64, uint64_t) |
44 | - | 43 | - |
45 | void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) | 44 | void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) |
46 | { | 45 | { |
47 | int i, j; | 46 | int i, j; |
48 | -- | 47 | -- |
49 | 2.34.1 | 48 | 2.34.1 | diff view generated by jsdifflib |
crypto: Add generic 64-bit carry-less multiply routine

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/generic/host/crypto/clmul.h | 15 +++++++++++++++
 include/crypto/clmul.h                   | 19 +++++++++++++++++++
 crypto/clmul.c                           | 17 +++++++++++++++++
 3 files changed, 51 insertions(+)
 create mode 100644 host/include/generic/host/crypto/clmul.h

diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/host/include/generic/host/crypto/clmul.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * No host specific carry-less multiply acceleration.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef GENERIC_HOST_CRYPTO_CLMUL_H
+#define GENERIC_HOST_CRYPTO_CLMUL_H
+
+#define HAVE_CLMUL_ACCEL  false
+#define ATTR_CLMUL_ACCEL
+
+Int128 clmul_64_accel(uint64_t, uint64_t)
+    QEMU_ERROR("unsupported accel");
+
+#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
index XXXXXXX..XXXXXXX 100644
--- a/include/crypto/clmul.h
+++ b/include/crypto/clmul.h
@@ -XXX,XX +XXX,XX @@
 #ifndef CRYPTO_CLMUL_H
 #define CRYPTO_CLMUL_H
 
+#include "qemu/int128.h"
+#include "host/crypto/clmul.h"
+
 /**
  * clmul_8x8_low:
  *
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_odd(uint64_t, uint64_t);
  */
 uint64_t clmul_32(uint32_t, uint32_t);
 
+/**
+ * clmul_64:
+ *
+ * Perform a 64x64->128 carry-less multiply.
+ */
+Int128 clmul_64_gen(uint64_t, uint64_t);
+
+static inline Int128 clmul_64(uint64_t a, uint64_t b)
+{
+    if (HAVE_CLMUL_ACCEL) {
+        return clmul_64_accel(a, b);
+    } else {
+        return clmul_64_gen(a, b);
+    }
+}
+
 #endif /* CRYPTO_CLMUL_H */
diff --git a/crypto/clmul.c b/crypto/clmul.c
index XXXXXXX..XXXXXXX 100644
--- a/crypto/clmul.c
+++ b/crypto/clmul.c
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_32(uint32_t n, uint32_t m32)
     }
     return r;
 }
+
+Int128 clmul_64_gen(uint64_t n, uint64_t m)
+{
+    uint64_t rl = 0, rh = 0;
...

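A usage sketch for the new dispatch (hypothetical caller; clmul_64_split is a
made-up name): because clmul_64() is a static inline, HAVE_CLMUL_ACCEL is
resolved at compile time and the untaken branch folds away.

  #include "qemu/osdep.h"
  #include "crypto/clmul.h"

  /* One 64x64->128 carry-less product, split into low/high doublewords
   * via the Int128 accessors pulled in through qemu/int128.h. */
  static void clmul_64_split(uint64_t a, uint64_t b,
                             uint64_t *lo, uint64_t *hi)
  {
      Int128 r = clmul_64(a, b);    /* accel or generic, chosen inline */

      *lo = int128_getlo(r);
      *hi = int128_gethi(r);
  }
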
target/arm: Use clmul_64

Use generic routine for 64-bit carry-less multiply.

Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/tcg/vec_helper.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

...
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
--
2.34.1

New patch | |||
---|---|---|---|
1 | Use generic routine for 64-bit carry-less multiply. | ||
1 | 2 | ||
3 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | target/i386/ops_sse.h | 40 +++++++++------------------------------- | ||
7 | 1 file changed, 9 insertions(+), 31 deletions(-) | ||
8 | |||
9 | diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h | ||
10 | index XXXXXXX..XXXXXXX 100644 | ||
11 | --- a/target/i386/ops_sse.h | ||
12 | +++ b/target/i386/ops_sse.h | ||
13 | @@ -XXX,XX +XXX,XX @@ | ||
14 | |||
15 | #include "crypto/aes.h" | ||
16 | #include "crypto/aes-round.h" | ||
17 | +#include "crypto/clmul.h" | ||
18 | |||
19 | #if SHIFT == 0 | ||
20 | #define Reg MMXReg | ||
21 | @@ -XXX,XX +XXX,XX @@ target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) | ||
22 | |||
23 | #endif | ||
24 | |||
25 | -#if SHIFT == 1 | ||
26 | -static void clmulq(uint64_t *dest_l, uint64_t *dest_h, | ||
27 | - uint64_t a, uint64_t b) | ||
28 | -{ | ||
29 | - uint64_t al, ah, resh, resl; | ||
30 | - | ||
31 | - ah = 0; | ||
32 | - al = a; | ||
33 | - resh = resl = 0; | ||
34 | - | ||
35 | - while (b) { | ||
36 | - if (b & 1) { | ||
37 | - resl ^= al; | ||
38 | - resh ^= ah; | ||
39 | - } | ||
40 | - ah = (ah << 1) | (al >> 63); | ||
41 | - al <<= 1; | ||
42 | - b >>= 1; | ||
43 | - } | ||
44 | - | ||
45 | - *dest_l = resl; | ||
46 | - *dest_h = resh; | ||
47 | -} | ||
48 | -#endif | ||
49 | - | ||
50 | void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, | ||
51 | uint32_t ctrl) | ||
52 | { | ||
53 | - uint64_t a, b; | ||
54 | - int i; | ||
55 | + int a_idx = (ctrl & 1) != 0; | ||
56 | + int b_idx = (ctrl & 16) != 0; | ||
57 | |||
58 | - for (i = 0; i < 1 << SHIFT; i += 2) { | ||
59 | - a = v->Q(((ctrl & 1) != 0) + i); | ||
60 | - b = s->Q(((ctrl & 16) != 0) + i); | ||
61 | - clmulq(&d->Q(i), &d->Q(i + 1), a, b); | ||
62 | + for (int i = 0; i < SHIFT; i++) { | ||
63 | + uint64_t a = v->Q(2 * i + a_idx); | ||
64 | + uint64_t b = s->Q(2 * i + b_idx); | ||
65 | + Int128 *r = (Int128 *)&d->ZMM_X(i); | ||
66 | + | ||
67 | + *r = clmul_64(a, b); | ||
68 | } | ||
69 | } | ||
70 | |||
71 | -- | ||
72 | 2.34.1 | ||
73 | |||
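A worked note on the operand selection above: imm8 bit 0 chooses which quadword of the first source participates and bit 4 chooses for the second, so ctrl = 0x11 multiplies the two high quadwords. A standalone mirror of that selection (illustration only, not code from the patch):

    #include <stdint.h>

    /* Mirrors the a_idx/b_idx computation in helper_pclmulqdq. */
    static void pclmul_pick(const uint64_t v[2], const uint64_t s[2],
                            uint32_t ctrl, uint64_t *a, uint64_t *b)
    {
        *a = v[(ctrl & 1) != 0];   /* imm8 bit 0 */
        *b = s[(ctrl & 16) != 0];  /* imm8 bit 4 */
    }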
1 | Use the generic routine for 64-bit carry-less multiply. | 1 | Use the generic routine for 64-bit carry-less multiply. |
---|---|---|---|
2 | Remove our local version of galois_multiply64. | 2 | Remove our local version of galois_multiply64. |
3 | 3 | ||
4 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 5 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
5 | --- | 6 | --- |
6 | target/s390x/tcg/vec_int_helper.c | 62 +++++++------------------------ | 7 | target/s390x/tcg/vec_int_helper.c | 58 +++++++------------------------ |
7 | 1 file changed, 14 insertions(+), 48 deletions(-) | 8 | 1 file changed, 12 insertions(+), 46 deletions(-) |
8 | 9 | ||
9 | diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c | 10 | diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c |
10 | index XXXXXXX..XXXXXXX 100644 | 11 | index XXXXXXX..XXXXXXX 100644 |
11 | --- a/target/s390x/tcg/vec_int_helper.c | 12 | --- a/target/s390x/tcg/vec_int_helper.c |
12 | +++ b/target/s390x/tcg/vec_int_helper.c | 13 | +++ b/target/s390x/tcg/vec_int_helper.c |
... | ... | ||
46 | - s390_vec_shr(&vb, &vb, 1); | 47 | - s390_vec_shr(&vb, &vb, 1); |
47 | - } | 48 | - } |
48 | - return res; | 49 | - return res; |
49 | -} | 50 | -} |
50 | - | 51 | - |
51 | static Int128 do_gfm8(Int128 n, Int128 m) | 52 | /* |
52 | { | 53 | * There is no carry across the two doublewords, so their order does |
53 | Int128 e = clmul_8x8_even(n, m); | 54 | * not matter. Nor is there partial overlap between registers. |
54 | @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3, | 55 | @@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3, |
55 | *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4); | ||
56 | } | ||
57 | |||
58 | +static Int128 do_gfm64(Int128 n, Int128 m) | ||
59 | +{ | ||
60 | + /* | ||
61 | + * The two 64-bit halves are treated identically, | ||
62 | + * therefore host ordering does not matter. | ||
63 | + */ | ||
64 | + Int128 e = clmul_64(int128_getlo(n), int128_getlo(m)); | ||
65 | + Int128 o = clmul_64(int128_gethi(n), int128_gethi(m)); | ||
66 | + return int128_xor(e, o); | ||
67 | +} | ||
68 | + | ||
69 | void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, | 56 | void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, |
70 | uint32_t desc) | 57 | uint32_t desc) |
71 | { | 58 | { |
72 | - S390Vector tmp1, tmp2; | 59 | - S390Vector tmp1, tmp2; |
73 | - uint64_t a, b; | 60 | - uint64_t a, b; |
74 | - | 61 | + uint64_t *q1 = v1; |
62 | + const uint64_t *q2 = v2, *q3 = v3; | ||
63 | + Int128 r; | ||
64 | |||
75 | - a = s390_vec_read_element64(v2, 0); | 65 | - a = s390_vec_read_element64(v2, 0); |
76 | - b = s390_vec_read_element64(v3, 0); | 66 | - b = s390_vec_read_element64(v3, 0); |
77 | - tmp1 = galois_multiply64(a, b); | 67 | - tmp1 = galois_multiply64(a, b); |
78 | - a = s390_vec_read_element64(v2, 1); | 68 | - a = s390_vec_read_element64(v2, 1); |
79 | - b = s390_vec_read_element64(v3, 1); | 69 | - b = s390_vec_read_element64(v3, 1); |
80 | - tmp2 = galois_multiply64(a, b); | 70 | - tmp2 = galois_multiply64(a, b); |
81 | - s390_vec_xor(v1, &tmp1, &tmp2); | 71 | - s390_vec_xor(v1, &tmp1, &tmp2); |
82 | + *(Int128 *)v1 = do_gfm64(*(const Int128 *)v2, *(const Int128 *)v3); | 72 | + r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1])); |
73 | + q1[0] = int128_gethi(r); | ||
74 | + q1[1] = int128_getlo(r); | ||
83 | } | 75 | } |
84 | 76 | ||
85 | void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, | 77 | void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, |
86 | const void *v4, uint32_t desc) | 78 | const void *v4, uint32_t desc) |
87 | { | 79 | { |
88 | - S390Vector tmp1, tmp2; | 80 | - S390Vector tmp1, tmp2; |
89 | - uint64_t a, b; | 81 | - uint64_t a, b; |
90 | - | 82 | + uint64_t *q1 = v1; |
83 | + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; | ||
84 | + Int128 r; | ||
85 | |||
91 | - a = s390_vec_read_element64(v2, 0); | 86 | - a = s390_vec_read_element64(v2, 0); |
92 | - b = s390_vec_read_element64(v3, 0); | 87 | - b = s390_vec_read_element64(v3, 0); |
93 | - tmp1 = galois_multiply64(a, b); | 88 | - tmp1 = galois_multiply64(a, b); |
94 | - a = s390_vec_read_element64(v2, 1); | 89 | - a = s390_vec_read_element64(v2, 1); |
95 | - b = s390_vec_read_element64(v3, 1); | 90 | - b = s390_vec_read_element64(v3, 1); |
96 | - tmp2 = galois_multiply64(a, b); | 91 | - tmp2 = galois_multiply64(a, b); |
97 | - s390_vec_xor(&tmp1, &tmp1, &tmp2); | 92 | - s390_vec_xor(&tmp1, &tmp1, &tmp2); |
98 | - s390_vec_xor(v1, &tmp1, v4); | 93 | - s390_vec_xor(v1, &tmp1, v4); |
99 | + Int128 r = do_gfm64(*(const Int128 *)v2, *(const Int128 *)v3); | 94 | + r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1])); |
100 | + *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4); | 95 | + q1[0] = q4[0] ^ int128_gethi(r); |
96 | + q1[1] = q4[1] ^ int128_getlo(r); | ||
101 | } | 97 | } |
102 | 98 | ||
103 | #define DEF_VMAL(BITS) \ | 99 | #define DEF_VMAL(BITS) \ |
104 | -- | 100 | -- |
105 | 2.34.1 | 101 | 2.34.1 |
102 | |||
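Because addition in GF(2) is XOR, the two 128-bit partial products in vgfm64 cannot carry into one another, which is why the comment says their order does not matter; element 0 is the leftmost (most significant) doubleword on big-endian s390x, hence the hi/lo store order. A plain-C reference model of the operation (clmul128 is a hypothetical stand-in for a 64x64->128 carry-less multiply):

    #include <stdint.h>

    struct u128 { uint64_t hi, lo; };
    extern struct u128 clmul128(uint64_t a, uint64_t b);

    /* VGFM, 64-bit elements: two carry-less products, XOR-combined. */
    static void vgfm64_model(uint64_t d[2], const uint64_t a[2],
                             const uint64_t b[2])
    {
        struct u128 e = clmul128(a[0], b[0]);
        struct u128 o = clmul128(a[1], b[1]);

        d[0] = e.hi ^ o.hi;  /* element 0 holds the high doublewords */
        d[1] = e.lo ^ o.lo;
    }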
1 | Use generic routine for 64-bit carry-less multiply. | 1 | Use generic routine for 64-bit carry-less multiply. |
---|---|---|---|
2 | 2 | ||
3 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
4 | --- | 5 | --- |
5 | target/ppc/int_helper.c | 17 +++-------------- | 6 | target/ppc/int_helper.c | 17 +++-------------- |
6 | 1 file changed, 3 insertions(+), 14 deletions(-) | 7 | 1 file changed, 3 insertions(+), 14 deletions(-) |
7 | 8 | ||
... | ... | ||
33 | } | 34 | } |
34 | 35 | ||
35 | #if HOST_BIG_ENDIAN | 36 | #if HOST_BIG_ENDIAN |
36 | -- | 37 | -- |
37 | 2.34.1 | 38 | 2.34.1 |
39 | |||
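The elided ppc hunk follows the same shape: vpmsumd XORs two 64x64->128 products. A plausible reconstruction of the converted helper (the ppc_avr_t field names u64 and s128 are assumed from context, not visible in the diff):

    void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
    {
        Int128 e = clmul_64(a->u64[0], b->u64[0]);
        Int128 o = clmul_64(a->u64[1], b->u64[1]);

        r->s128 = int128_xor(e, o);
    }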
1 | Detect PCLMUL in cpuinfo; implement the accel hooks. | 1 | Detect PCLMUL in cpuinfo; implement the accel hook. |
---|---|---|---|
2 | 2 | ||
3 | Reviewed-by: Ard Biesheuvel <ardb@kernel.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
4 | --- | 5 | --- |
5 | host/include/i386/host/cpuinfo.h | 1 + | 6 | host/include/i386/host/cpuinfo.h | 1 + |
6 | host/include/i386/host/crypto/clmul.h | 187 ++++++++++++++++++++++++ | 7 | host/include/i386/host/crypto/clmul.h | 29 +++++++++++++++++++++++++ |
7 | host/include/x86_64/host/crypto/clmul.h | 1 + | 8 | host/include/x86_64/host/crypto/clmul.h | 1 + |
8 | util/cpuinfo-i386.c | 1 + | 9 | include/qemu/cpuid.h | 3 +++ |
9 | 4 files changed, 190 insertions(+) | 10 | util/cpuinfo-i386.c | 1 + |
11 | 5 files changed, 35 insertions(+) | ||
10 | create mode 100644 host/include/i386/host/crypto/clmul.h | 12 | create mode 100644 host/include/i386/host/crypto/clmul.h |
11 | create mode 100644 host/include/x86_64/host/crypto/clmul.h | 13 | create mode 100644 host/include/x86_64/host/crypto/clmul.h |
12 | 14 | ||
13 | diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h | 15 | diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h |
14 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
... | ... | ||
46 | +# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PCLMUL) | 48 | +# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PCLMUL) |
47 | +# define ATTR_CLMUL_ACCEL __attribute__((target("pclmul"))) | 49 | +# define ATTR_CLMUL_ACCEL __attribute__((target("pclmul"))) |
48 | +#endif | 50 | +#endif |
49 | + | 51 | + |
50 | +static inline Int128 ATTR_CLMUL_ACCEL | 52 | +static inline Int128 ATTR_CLMUL_ACCEL |
51 | +clmul_64(uint64_t n, uint64_t m) | 53 | +clmul_64_accel(uint64_t n, uint64_t m) |
52 | +{ | 54 | +{ |
53 | + union { __m128i v; Int128 s; } u; | 55 | + union { __m128i v; Int128 s; } u; |
54 | + | ||
55 | + if (!HAVE_CLMUL_ACCEL) { | ||
56 | + return clmul_64_gen(n, m); | ||
57 | + } | ||
58 | + | 56 | + |
59 | + u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0); | 57 | + u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0); |
60 | + return u.s; | 58 | + return u.s; |
61 | +} | 59 | +} |
62 | + | ||
63 | +static inline uint64_t ATTR_CLMUL_ACCEL | ||
64 | +clmul_32(uint32_t n, uint32_t m) | ||
65 | +{ | ||
66 | + __m128i r; | ||
67 | + | ||
68 | + if (!HAVE_CLMUL_ACCEL) { | ||
69 | + return clmul_32_gen(n, m); | ||
70 | + } | ||
71 | + | ||
72 | + r = _mm_clmulepi64_si128(_mm_cvtsi32_si128(n), _mm_cvtsi32_si128(m), 0); | ||
73 | + return ((__v2di)r)[0]; | ||
74 | +} | ||
75 | + | ||
76 | +static inline Int128 ATTR_CLMUL_ACCEL | ||
77 | +clmul_32x2_even(Int128 n, Int128 m) | ||
78 | +{ | ||
79 | + union { __m128i v; Int128 s; } ur, un, um; | ||
80 | + __m128i n02, m02, r0, r2; | ||
81 | + | ||
82 | + if (!HAVE_CLMUL_ACCEL) { | ||
83 | + return clmul_32x2_even_gen(n, m); | ||
84 | + } | ||
85 | + | ||
86 | + un.s = n; | ||
87 | + um.s = m; | ||
88 | + n02 = _mm_slli_epi64(un.v, 32); | ||
89 | + m02 = _mm_slli_epi64(um.v, 32); | ||
90 | + r0 = _mm_clmulepi64_si128(n02, m02, 0x00); | ||
91 | + r2 = _mm_clmulepi64_si128(n02, m02, 0x11); | ||
92 | + ur.v = _mm_unpackhi_epi64(r0, r2); | ||
93 | + return ur.s; | ||
94 | +} | ||
95 | + | ||
96 | +static inline Int128 ATTR_CLMUL_ACCEL | ||
97 | +clmul_32x2_odd(Int128 n, Int128 m) | ||
98 | +{ | ||
99 | + union { __m128i v; Int128 s; } ur, un, um; | ||
100 | + __m128i n13, m13, r1, r3; | ||
101 | + | ||
102 | + if (!HAVE_CLMUL_ACCEL) { | ||
103 | + return clmul_32x2_odd_gen(n, m); | ||
104 | + } | ||
105 | + | ||
106 | + un.s = n; | ||
107 | + um.s = m; | ||
108 | + n13 = _mm_srli_epi64(un.v, 32); | ||
109 | + m13 = _mm_srli_epi64(um.v, 32); | ||
110 | + r1 = _mm_clmulepi64_si128(n13, m13, 0x00); | ||
111 | + r3 = _mm_clmulepi64_si128(n13, m13, 0x11); | ||
112 | + ur.v = _mm_unpacklo_epi64(r1, r3); | ||
113 | + return ur.s; | ||
114 | +} | ||
115 | + | ||
116 | +static inline uint64_t ATTR_CLMUL_ACCEL | ||
117 | +clmul_16x2_even(uint64_t n, uint64_t m) | ||
118 | +{ | ||
119 | + __m128i r0, r2; | ||
120 | + | ||
121 | + if (!HAVE_CLMUL_ACCEL) { | ||
122 | + return clmul_16x2_even_gen(n, m); | ||
123 | + } | ||
124 | + | ||
125 | + r0 = _mm_clmulepi64_si128(_mm_cvtsi32_si128(n & 0xffff), | ||
126 | + _mm_cvtsi32_si128(m & 0xffff), 0); | ||
127 | + r2 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 32) & 0xffff), | ||
128 | + _mm_cvtsi32_si128((m >> 32) & 0xffff), 0); | ||
129 | + r0 = _mm_unpacklo_epi32(r0, r2); | ||
130 | + return ((__v2di)r0)[0]; | ||
131 | +} | ||
132 | + | ||
133 | +static inline uint64_t ATTR_CLMUL_ACCEL | ||
134 | +clmul_16x2_odd(uint64_t n, uint64_t m) | ||
135 | +{ | ||
136 | + __m128i r1, r3; | ||
137 | + | ||
138 | + if (!HAVE_CLMUL_ACCEL) { | ||
139 | + return clmul_16x2_odd_gen(n, m); | ||
140 | + } | ||
141 | + | ||
142 | + r1 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 16) & 0xffff), | ||
143 | + _mm_cvtsi32_si128((m >> 16) & 0xffff), 0); | ||
144 | + r3 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 48) & 0xffff), | ||
145 | + _mm_cvtsi32_si128((m >> 48) & 0xffff), 0); | ||
146 | + r1 = _mm_unpacklo_epi32(r1, r3); | ||
147 | + return ((__v2di)r1)[0]; | ||
148 | +} | ||
149 | + | ||
150 | +static inline Int128 ATTR_CLMUL_ACCEL | ||
151 | +clmul_16x4_even(Int128 n, Int128 m) | ||
152 | +{ | ||
153 | + union { __m128i v; Int128 s; } ur, un, um; | ||
154 | + __m128i mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); | ||
155 | + __m128i n04, m04, n26, m26, r0, r2, r4, r6; | ||
156 | + | ||
157 | + if (!HAVE_CLMUL_ACCEL) { | ||
158 | + return clmul_16x4_even_gen(n, m); | ||
159 | + } | ||
160 | + | ||
161 | + un.s = n; | ||
162 | + um.s = m; | ||
163 | + n04 = _mm_and_si128(un.v, mask); | ||
164 | + m04 = _mm_and_si128(um.v, mask); | ||
165 | + r0 = _mm_clmulepi64_si128(n04, m04, 0x00); | ||
166 | + r4 = _mm_clmulepi64_si128(n04, m04, 0x11); | ||
167 | + n26 = _mm_and_si128(_mm_srli_epi64(un.v, 32), mask); | ||
168 | + m26 = _mm_and_si128(_mm_srli_epi64(um.v, 32), mask); | ||
169 | + r2 = _mm_clmulepi64_si128(n26, m26, 0x00); | ||
170 | + r6 = _mm_clmulepi64_si128(n26, m26, 0x11); | ||
171 | + | ||
172 | + r0 = _mm_unpacklo_epi32(r0, r2); | ||
173 | + r4 = _mm_unpacklo_epi32(r4, r6); | ||
174 | + ur.v = _mm_unpacklo_epi64(r0, r4); | ||
175 | + return ur.s; | ||
176 | +} | ||
177 | + | ||
178 | +static inline Int128 ATTR_CLMUL_ACCEL | ||
179 | +clmul_16x4_odd(Int128 n, Int128 m) | ||
180 | +{ | ||
181 | + union { __m128i v; Int128 s; } ur, un, um; | ||
182 | + __m128i mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1); | ||
183 | + __m128i n15, m15, n37, m37, r1, r3, r5, r7; | ||
184 | + | ||
185 | + if (!HAVE_CLMUL_ACCEL) { | ||
186 | + return clmul_16x4_odd_gen(n, m); | ||
187 | + } | ||
188 | + | ||
189 | + un.s = n; | ||
190 | + um.s = m; | ||
191 | + n15 = _mm_and_si128(_mm_srli_epi64(un.v, 16), mask); | ||
192 | + m15 = _mm_and_si128(_mm_srli_epi64(um.v, 16), mask); | ||
193 | + r1 = _mm_clmulepi64_si128(n15, m15, 0x00); | ||
194 | + r5 = _mm_clmulepi64_si128(n15, m15, 0x11); | ||
195 | + n37 = _mm_srli_epi64(un.v, 48); | ||
196 | + m37 = _mm_srli_epi64(um.v, 48); | ||
197 | + r3 = _mm_clmulepi64_si128(n37, m37, 0x00); | ||
198 | + r7 = _mm_clmulepi64_si128(n37, m37, 0x11); | ||
199 | + | ||
200 | + r1 = _mm_unpacklo_epi32(r1, r3); | ||
201 | + r5 = _mm_unpacklo_epi32(r5, r7); | ||
202 | + ur.v = _mm_unpacklo_epi64(r1, r5); | ||
203 | + return ur.s; | ||
204 | +} | ||
205 | + | ||
206 | +/* | ||
207 | + * Defer everything else to the generic routines. | ||
208 | + * We could implement them with even more element manipulation. | ||
209 | + */ | ||
210 | +#define clmul_8x8_low clmul_8x8_low_gen | ||
211 | +#define clmul_8x4_even clmul_8x4_even_gen | ||
212 | +#define clmul_8x4_odd clmul_8x4_odd_gen | ||
213 | +#define clmul_8x8_even clmul_8x8_even_gen | ||
214 | +#define clmul_8x8_odd clmul_8x8_odd_gen | ||
215 | +#define clmul_8x8_packed clmul_8x8_packed_gen | ||
216 | + | 60 | + |
217 | +#endif /* X86_HOST_CRYPTO_CLMUL_H */ | 61 | +#endif /* X86_HOST_CRYPTO_CLMUL_H */ |
218 | diff --git a/host/include/x86_64/host/crypto/clmul.h b/host/include/x86_64/host/crypto/clmul.h | 62 | diff --git a/host/include/x86_64/host/crypto/clmul.h b/host/include/x86_64/host/crypto/clmul.h |
219 | new file mode 100644 | 63 | new file mode 100644 |
220 | index XXXXXXX..XXXXXXX | 64 | index XXXXXXX..XXXXXXX |
221 | --- /dev/null | 65 | --- /dev/null |
222 | +++ b/host/include/x86_64/host/crypto/clmul.h | 66 | +++ b/host/include/x86_64/host/crypto/clmul.h |
223 | @@ -0,0 +1 @@ | 67 | @@ -0,0 +1 @@ |
224 | +#include "host/include/i386/host/crypto/clmul.h" | 68 | +#include "host/include/i386/host/crypto/clmul.h" |
69 | diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h | ||
70 | index XXXXXXX..XXXXXXX 100644 | ||
71 | --- a/include/qemu/cpuid.h | ||
72 | +++ b/include/qemu/cpuid.h | ||
73 | @@ -XXX,XX +XXX,XX @@ | ||
74 | #endif | ||
75 | |||
76 | /* Leaf 1, %ecx */ | ||
77 | +#ifndef bit_PCLMUL | ||
78 | +#define bit_PCLMUL (1 << 1) | ||
79 | +#endif | ||
80 | #ifndef bit_SSE4_1 | ||
81 | #define bit_SSE4_1 (1 << 19) | ||
82 | #endif | ||
225 | diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c | 83 | diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c |
226 | index XXXXXXX..XXXXXXX 100644 | 84 | index XXXXXXX..XXXXXXX 100644 |
227 | --- a/util/cpuinfo-i386.c | 85 | --- a/util/cpuinfo-i386.c |
228 | +++ b/util/cpuinfo-i386.c | 86 | +++ b/util/cpuinfo-i386.c |
229 | @@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void) | 87 | @@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void) |
230 | info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0); | 88 | info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0); |
231 | info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0); | 89 | info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0); |
232 | info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0); | 90 | info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0); |
233 | + info |= (c & bit_PCLMULQDQ ? CPUINFO_PCLMUL : 0); | 91 | + info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0); |
234 | 92 | ||
235 | /* Our AES support requires PSHUFB as well. */ | 93 | /* Our AES support requires PSHUFB as well. */ |
236 | info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0); | 94 | info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0); |
237 | -- | 95 | -- |
238 | 2.34.1 | 96 | 2.34.1
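Outside QEMU, the same instruction is reachable through the wmmintrin.h intrinsic; a standalone sketch, assuming the file is built with -mpclmul (or a target attribute, as the header above uses):

    #include <stdint.h>
    #include <emmintrin.h>   /* _mm_set_epi64x, _mm_cvtsi128_si64 */
    #include <wmmintrin.h>   /* _mm_clmulepi64_si128 */

    /* Low 64 bits of the 128-bit carry-less product n * m. */
    static uint64_t clmul_low64(uint64_t n, uint64_t m)
    {
        __m128i r = _mm_clmulepi64_si128(_mm_set_epi64x(0, n),
                                         _mm_set_epi64x(0, m), 0);
        return (uint64_t)_mm_cvtsi128_si64(r);
    }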
1 | Detect PMULL in cpuinfo; implement the accel hooks. | 1 | Detect PMULL in cpuinfo; implement the accel hook. |
---|---|---|---|
2 | 2 | ||
3 | Acked-by: Ard Biesheuvel <ardb@kernel.org> | ||
4 | Tested-by: Ard Biesheuvel <ardb@kernel.org> | ||
5 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | ||
3 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> |
4 | --- | 7 | --- |
5 | host/include/aarch64/host/cpuinfo.h | 1 + | 8 | host/include/aarch64/host/cpuinfo.h | 1 + |
6 | host/include/aarch64/host/crypto/clmul.h | 230 +++++++++++++++++++++++ | 9 | host/include/aarch64/host/crypto/clmul.h | 41 ++++++++++++++++++++++++ |
7 | util/cpuinfo-aarch64.c | 4 +- | 10 | util/cpuinfo-aarch64.c | 4 ++- |
8 | 3 files changed, 234 insertions(+), 1 deletion(-) | 11 | 3 files changed, 45 insertions(+), 1 deletion(-) |
9 | create mode 100644 host/include/aarch64/host/crypto/clmul.h | 12 | create mode 100644 host/include/aarch64/host/crypto/clmul.h |
10 | 13 | ||
11 | diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h | 14 | diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h |
12 | index XXXXXXX..XXXXXXX 100644 | 15 | index XXXXXXX..XXXXXXX 100644 |
13 | --- a/host/include/aarch64/host/cpuinfo.h | 16 | --- a/host/include/aarch64/host/cpuinfo.h |
... | ... | ||
35 | +#define AARCH64_HOST_CRYPTO_CLMUL_H | 38 | +#define AARCH64_HOST_CRYPTO_CLMUL_H |
36 | + | 39 | + |
37 | +#include "host/cpuinfo.h" | 40 | +#include "host/cpuinfo.h" |
38 | +#include <arm_neon.h> | 41 | +#include <arm_neon.h> |
39 | + | 42 | + |
40 | +/* Both FEAT_AES and FEAT_PMULL are covered under the same macro. */ | 43 | +/* |
44 | + * 64x64->128 pmull is available with FEAT_PMULL. | ||
45 | + * Both FEAT_AES and FEAT_PMULL are covered under the same macro. | ||
46 | + */ | ||
41 | +#ifdef __ARM_FEATURE_AES | 47 | +#ifdef __ARM_FEATURE_AES |
42 | +# define HAVE_CLMUL_ACCEL true | 48 | +# define HAVE_CLMUL_ACCEL true |
43 | +#else | 49 | +#else |
44 | +# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PMULL) | 50 | +# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PMULL) |
45 | +#endif | 51 | +#endif |
46 | +#if !defined(__ARM_FEATURE_AES) && defined(CONFIG_ARM_AES_BUILTIN) | 52 | +#if !defined(__ARM_FEATURE_AES) && defined(CONFIG_ARM_AES_BUILTIN) |
47 | +# define ATTR_CLMUL_ACCEL __attribute__((target("+crypto"))) | 53 | +# define ATTR_CLMUL_ACCEL __attribute__((target("+crypto"))) |
48 | +#else | 54 | +#else |
49 | +# define ATTR_CLMUL_ACCEL | 55 | +# define ATTR_CLMUL_ACCEL |
50 | +#endif | 56 | +#endif |
51 | + | 57 | + |
52 | +/* | ||
53 | + * The 8x8->8 pmul and 8x8->16 pmull are available unconditionally. | ||
54 | + */ | ||
55 | + | ||
56 | +static inline uint64_t clmul_8x8_low(uint64_t n, uint64_t m) | ||
57 | +{ | ||
58 | + return (uint64_t)vmul_p8((poly8x8_t)n, (poly8x8_t)m); | ||
59 | +} | ||
60 | + | ||
61 | +static inline Int128 clmul_8x8_packed(uint64_t n, uint64_t m) | ||
62 | +{ | ||
63 | + union { poly16x8_t v; Int128 s; } u; | ||
64 | + u.v = vmull_p8((poly8x8_t)n, (poly8x8_t)m); | ||
65 | + return u.s; | ||
66 | +} | ||
67 | + | ||
68 | +static inline Int128 clmul_8x8_even(Int128 n, Int128 m) | ||
69 | +{ | ||
70 | + union { uint16x8_t v; Int128 s; } un, um; | ||
71 | + uint8x8_t pn, pm; | ||
72 | + | ||
73 | + un.s = n; | ||
74 | + um.s = m; | ||
75 | + pn = vmovn_u16(un.v); | ||
76 | + pm = vmovn_u16(um.v); | ||
77 | + return clmul_8x8_packed((uint64_t)pn, (uint64_t)pm); | ||
78 | +} | ||
79 | + | ||
80 | +static inline Int128 clmul_8x8_odd(Int128 n, Int128 m) | ||
81 | +{ | ||
82 | + union { uint8x16_t v; Int128 s; } un, um; | ||
83 | + uint8x8_t pn, pm; | ||
84 | + | ||
85 | + un.s = n; | ||
86 | + um.s = m; | ||
87 | + pn = vqtbl1_u8(un.v, (uint8x8_t){ 1, 3, 5, 7, 9, 11, 13, 15 }); | ||
88 | + pm = vqtbl1_u8(um.v, (uint8x8_t){ 1, 3, 5, 7, 9, 11, 13, 15 }); | ||
89 | + return clmul_8x8_packed((uint64_t)pn, (uint64_t)pm); | ||
90 | +} | ||
91 | + | ||
92 | +static inline uint64_t clmul_8x4_even(uint64_t n, uint64_t m) | ||
93 | +{ | ||
94 | + return int128_getlo(clmul_8x8_even(int128_make64(n), int128_make64(m))); | ||
95 | +} | ||
96 | + | ||
97 | +static inline uint64_t clmul_8x4_odd(uint64_t n, uint64_t m) | ||
98 | +{ | ||
99 | + return int128_getlo(clmul_8x8_odd(int128_make64(n), int128_make64(m))); | ||
100 | +} | ||
101 | + | ||
102 | +static inline Int128 clmul_16x4_packed_accel(uint16x4_t n, uint16x4_t m) | ||
103 | +{ | ||
104 | + union { uint32x4_t v; Int128 s; } u; | ||
105 | + uint32x4_t r0, r1, r2; | ||
106 | + | ||
107 | + /* | ||
108 | + * Considering the per-byte multiplication: | ||
109 | + * ab | ||
110 | + * cd | ||
111 | + * ----- | ||
112 | + * bd << 0 | ||
113 | + * bc << 8 | ||
114 | + * ad << 8 | ||
115 | + * ac << 16 | ||
116 | + * | ||
117 | + * We get the ac and bd rows of the result for free from the expanding | ||
118 | + * packed multiply. Reverse the two bytes in M, repeat, and we get the | ||
119 | + * ad and bc results, but in the wrong column; shift to fix and sum all. | ||
120 | + */ | ||
121 | + r0 = (uint32x4_t)vmull_p8((poly8x8_t)n, (poly8x8_t)m); | ||
122 | + r1 = (uint32x4_t)vmull_p8((poly8x8_t)n, vrev16_p8((poly8x8_t)m)); | ||
123 | + r2 = r1 << 8; /* bc */ | ||
124 | + r1 = r1 >> 8; /* ad */ | ||
125 | + r1 &= (uint32x4_t){ 0x00ffff00, 0x00ffff00, 0x00ffff00, 0x00ffff00 }; | ||
126 | + r2 &= (uint32x4_t){ 0x00ffff00, 0x00ffff00, 0x00ffff00, 0x00ffff00 }; | ||
127 | + r0 = r0 ^ r1 ^ r2; | ||
128 | + | ||
129 | + u.v = r0; | ||
130 | + return u.s; | ||
131 | +} | ||
132 | + | ||
133 | +static inline Int128 clmul_16x4_even(Int128 n, Int128 m) | ||
134 | +{ | ||
135 | + union { uint32x4_t v; Int128 s; } um, un; | ||
136 | + uint16x4_t pn, pm; | ||
137 | + | ||
138 | + /* Extract even uint16_t. */ | ||
139 | + un.s = n; | ||
140 | + um.s = m; | ||
141 | + pn = vmovn_u32(un.v); | ||
142 | + pm = vmovn_u32(um.v); | ||
143 | + return clmul_16x4_packed_accel(pn, pm); | ||
144 | +} | ||
145 | + | ||
146 | +static inline Int128 clmul_16x4_odd(Int128 n, Int128 m) | ||
147 | +{ | ||
148 | + union { uint8x16_t v; Int128 s; } um, un; | ||
149 | + uint16x4_t pn, pm; | ||
150 | + | ||
151 | + /* Extract odd uint16_t. */ | ||
152 | + un.s = n; | ||
153 | + um.s = m; | ||
154 | + pn = (uint16x4_t)vqtbl1_u8(un.v, (uint8x8_t){ 2, 3, 6, 7, 10, 11, 14, 15 }); | ||
155 | + pm = (uint16x4_t)vqtbl1_u8(um.v, (uint8x8_t){ 2, 3, 6, 7, 10, 11, 14, 15 }); | ||
156 | + return clmul_16x4_packed_accel(pn, pm); | ||
157 | +} | ||
158 | + | ||
159 | +static inline uint64_t clmul_16x2_even(uint64_t n, uint64_t m) | ||
160 | +{ | ||
161 | + return int128_getlo(clmul_16x4_even(int128_make64(n), int128_make64(m))); | ||
162 | +} | ||
163 | + | ||
164 | +static inline uint64_t clmul_16x2_odd(uint64_t n, uint64_t m) | ||
165 | +{ | ||
166 | + return int128_getlo(clmul_16x4_odd(int128_make64(n), int128_make64(m))); | ||
167 | +} | ||
168 | + | ||
169 | +/* | ||
170 | + * The 64x64->128 pmull is available with FEAT_PMULL. | ||
171 | + */ | ||
172 | + | ||
173 | +static inline Int128 ATTR_CLMUL_ACCEL | 58 | +static inline Int128 ATTR_CLMUL_ACCEL |
174 | +clmul_64(uint64_t n, uint64_t m) | 59 | +clmul_64_accel(uint64_t n, uint64_t m) |
175 | +{ | 60 | +{ |
176 | + union { poly128_t v; Int128 s; } u; | 61 | + union { poly128_t v; Int128 s; } u; |
177 | + | ||
178 | + if (!HAVE_CLMUL_ACCEL) { | ||
179 | + return clmul_64_gen(n, m); | ||
180 | + } | ||
181 | + | 62 | + |
182 | +#ifdef CONFIG_ARM_AES_BUILTIN | 63 | +#ifdef CONFIG_ARM_AES_BUILTIN |
183 | + u.v = vmull_p64((poly64_t)n, (poly64_t)m); | 64 | + u.v = vmull_p64((poly64_t)n, (poly64_t)m); |
184 | +#else | 65 | +#else |
185 | + asm(".arch_extension aes\n\t" | 66 | + asm(".arch_extension aes\n\t" |
186 | + "pmull %0.1q, %1.1d, %2.1d" : "=w"(u.v) : "w"(n), "w"(m)); | 67 | + "pmull %0.1q, %1.1d, %2.1d" : "=w"(u.v) : "w"(n), "w"(m)); |
187 | +#endif | 68 | +#endif |
188 | + return u.s; | 69 | + return u.s; |
189 | +} | ||
190 | + | ||
191 | +static inline uint64_t ATTR_CLMUL_ACCEL | ||
192 | +clmul_32(uint32_t n, uint32_t m) | ||
193 | +{ | ||
194 | + if (!HAVE_CLMUL_ACCEL) { | ||
195 | + return clmul_32_gen(n, m); | ||
196 | + } | ||
197 | + return int128_getlo(clmul_64(n, m)); | ||
198 | +} | ||
199 | + | ||
200 | +static inline Int128 ATTR_CLMUL_ACCEL | ||
201 | +clmul_32x2_even(Int128 n, Int128 m) | ||
202 | +{ | ||
203 | + union { uint64x2_t v; poly64_t h; Int128 s; } um, un, ur; | ||
204 | + uint64x2_t r0, r2; | ||
205 | + | ||
206 | + if (!HAVE_CLMUL_ACCEL) { | ||
207 | + return clmul_32x2_even_gen(n, m); | ||
208 | + } | ||
209 | + | ||
210 | + un.s = n; | ||
211 | + um.s = m; | ||
212 | + un.v &= (uint64x2_t){ 0xffffffffu, 0xffffffffu }; | ||
213 | + um.v &= (uint64x2_t){ 0xffffffffu, 0xffffffffu }; | ||
214 | + | ||
215 | +#ifdef CONFIG_ARM_AES_BUILTIN | ||
216 | + r0 = (uint64x2_t)vmull_p64(un.h, um.h); | ||
217 | + r2 = (uint64x2_t)vmull_high_p64((poly64x2_t)un.v, (poly64x2_t)um.v); | ||
218 | +#else | ||
219 | + asm(".arch_extension aes\n\t" | ||
220 | + "pmull %0.1q, %2.1d, %3.1d\n\t" | ||
221 | + "pmull2 %1.1q, %2.2d, %3.2d" | ||
222 | + : "=&w"(r0), "=w"(r2) : "w"(un.v), "w"(um.v)); | ||
223 | +#endif | ||
224 | + | ||
225 | + ur.v = vzip1q_u64(r0, r2); | ||
226 | + return ur.s; | ||
227 | +} | ||
228 | + | ||
229 | +static inline Int128 ATTR_CLMUL_ACCEL | ||
230 | +clmul_32x2_odd(Int128 n, Int128 m) | ||
231 | +{ | ||
232 | + union { uint64x2_t v; poly64_t h; Int128 s; } um, un, ur; | ||
233 | + uint64x2_t r0, r2; | ||
234 | + | ||
235 | + if (!HAVE_CLMUL_ACCEL) { | ||
236 | + return clmul_32x2_odd_gen(n, m); | ||
237 | + } | ||
238 | + | ||
239 | + un.s = n; | ||
240 | + um.s = m; | ||
241 | + un.v &= (uint64x2_t){ 0xffffffff00000000ull, 0xffffffff00000000ull }; | ||
242 | + um.v &= (uint64x2_t){ 0xffffffff00000000ull, 0xffffffff00000000ull }; | ||
243 | + | ||
244 | +#ifdef CONFIG_ARM_AES_BUILTIN | ||
245 | + r0 = (uint64x2_t)vmull_p64(un.h, um.h); | ||
246 | + r2 = (uint64x2_t)vmull_high_p64((poly64x2_t)un.v, (poly64x2_t)um.v); | ||
247 | +#else | ||
248 | + asm(".arch_extension aes\n\t" | ||
249 | + "pmull %0.1q, %2.1d, %3.1d\n\t" | ||
250 | + "pmull2 %1.1q, %2.2d, %3.2d" | ||
251 | + : "=&w"(r0), "=w"(r2) : "w"(un.v), "w"(um.v)); | ||
252 | +#endif | ||
253 | + | ||
254 | + ur.v = vzip2q_u64(r0, r2); | ||
255 | + return ur.s; | ||
256 | +} | 70 | +} |
257 | + | 71 | + |
258 | +#endif /* AARCH64_HOST_CRYPTO_CLMUL_H */ | 72 | +#endif /* AARCH64_HOST_CRYPTO_CLMUL_H */ |
259 | diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c | 73 | diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c |
260 | index XXXXXXX..XXXXXXX 100644 | 74 | index XXXXXXX..XXXXXXX 100644 |
... | ... | ||
276 | #endif | 90 | #endif |
277 | 91 | ||
278 | cpuinfo = info; | 92 | cpuinfo = info; |
279 | -- | 93 | -- |
280 | 2.34.1 | 94 | 2.34.1 |
95 | |||
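For reference, the ACLE spelling of the same operation in a standalone program, assuming -march=armv8-a+crypto; the inline-asm branch above exists for toolchains where vmull_p64 is not usable unless the whole file is built with the crypto extension:

    #include <stdint.h>
    #include <arm_neon.h>

    /* Full 128-bit carry-less product via FEAT_PMULL. */
    static poly128_t clmul128(uint64_t n, uint64_t m)
    {
        return vmull_p64((poly64_t)n, (poly64_t)m);
    }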