Cover letter of the earlier series posting:

Inspired by Ard Biesheuvel's RFC patches [1] for accelerating
carry-less multiply under emulation.

This is less polished than the AES patch set:

(1) Should I split HAVE_CLMUL_ACCEL into per-width HAVE_CLMUL{N}_ACCEL?
    The "_generic" and "_accel" split is different from aes-round.h
    because of the difference in support for different widths, and it
    means that each host accel has more boilerplate.

(2) Should I bother trying to accelerate anything other than 64x64->128?
    That seems to be the one that GCM really wants anyway.  I'd keep all
    of the sizes implemented generically, since that centralizes the 3
    target implementations.

(3) The use of Int128 isn't fantastic -- better would be a vector type,
    though that has its own special problems for ppc64le (see the
    endianness hoops within aes-round.h).  Perhaps leave things in
    env memory, like I was mostly able to do with AES?

(4) No guest test case(s).

r~

[1] https://patchew.org/QEMU/20230601123332.3297404-1-ardb@kernel.org/

Richard Henderson (18):
  crypto: Add generic 8-bit carry-less multiply routines
  target/arm: Use clmul_8* routines
  target/s390x: Use clmul_8* routines
  target/ppc: Use clmul_8* routines
  crypto: Add generic 16-bit carry-less multiply routines
  target/arm: Use clmul_16* routines
  target/s390x: Use clmul_16* routines
  target/ppc: Use clmul_16* routines
  crypto: Add generic 32-bit carry-less multiply routines
  target/arm: Use clmul_32* routines
  target/s390x: Use clmul_32* routines
  target/ppc: Use clmul_32* routines
  crypto: Add generic 64-bit carry-less multiply routine
  target/arm: Use clmul_64
  target/s390x: Use clmul_64
  target/ppc: Use clmul_64
  host/include/i386: Implement clmul.h
  host/include/aarch64: Implement clmul.h

 host/include/aarch64/host/cpuinfo.h | 1 +
 host/include/aarch64/host/crypto/clmul.h | 230 +++++++++++++++++++++++
 host/include/generic/host/crypto/clmul.h | 28 +++
 host/include/i386/host/cpuinfo.h | 1 +
 host/include/i386/host/crypto/clmul.h | 187 ++++++++++++++++++
 host/include/x86_64/host/crypto/clmul.h | 1 +
 include/crypto/clmul.h | 123 ++++++++++++
 target/arm/tcg/vec_internal.h | 11 --
 crypto/clmul.c | 163 ++++++++++++++++
 target/arm/tcg/mve_helper.c | 16 +-
 target/arm/tcg/vec_helper.c | 112 ++---------
 target/ppc/int_helper.c | 63 +++----
 target/s390x/tcg/vec_int_helper.c | 175 +++++++----------
 util/cpuinfo-aarch64.c | 4 +-
 util/cpuinfo-i386.c | 1 +
 crypto/meson.build | 9 +-
 16 files changed, 865 insertions(+), 260 deletions(-)
 create mode 100644 host/include/aarch64/host/crypto/clmul.h
 create mode 100644 host/include/generic/host/crypto/clmul.h
 create mode 100644 host/include/i386/host/crypto/clmul.h
 create mode 100644 host/include/x86_64/host/crypto/clmul.h
 create mode 100644 include/crypto/clmul.h
 create mode 100644 crypto/clmul.c

Cover letter of the pull request:

The following changes since commit 005ad32358f12fe9313a4a01918a55e60d4f39e5:

  Merge tag 'pull-tpm-2023-09-12-3' of https://github.com/stefanberger/qemu-tpm into staging (2023-09-13 13:41:57 -0400)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-crypto-20230915

for you to fetch changes up to 055c99015a4ec3c608d0260592368adc604429ea:

  host/include/aarch64: Implement clmul.h (2023-09-15 13:57:00 +0000)

----------------------------------------------------------------
Unify implementation of carry-less multiply.
Accelerate carry-less multiply for 64x64->128.

----------------------------------------------------------------
Richard Henderson (19):
      crypto: Add generic 8-bit carry-less multiply routines
      target/arm: Use clmul_8* routines
      target/s390x: Use clmul_8* routines
      target/ppc: Use clmul_8* routines
      crypto: Add generic 16-bit carry-less multiply routines
      target/arm: Use clmul_16* routines
      target/s390x: Use clmul_16* routines
      target/ppc: Use clmul_16* routines
      crypto: Add generic 32-bit carry-less multiply routines
      target/arm: Use clmul_32* routines
      target/s390x: Use clmul_32* routines
      target/ppc: Use clmul_32* routines
      crypto: Add generic 64-bit carry-less multiply routine
      target/arm: Use clmul_64
      target/i386: Use clmul_64
      target/s390x: Use clmul_64
      target/ppc: Use clmul_64
      host/include/i386: Implement clmul.h
      host/include/aarch64: Implement clmul.h

 host/include/aarch64/host/cpuinfo.h | 1 +
 host/include/aarch64/host/crypto/clmul.h | 41 +++++++
 host/include/generic/host/crypto/clmul.h | 15 +++
 host/include/i386/host/cpuinfo.h | 1 +
 host/include/i386/host/crypto/clmul.h | 29 +++++
 host/include/x86_64/host/crypto/clmul.h | 1 +
 include/crypto/clmul.h | 83 ++++++++++++++
 include/qemu/cpuid.h | 3 +
 target/arm/tcg/vec_internal.h | 11 --
 target/i386/ops_sse.h | 40 ++-----
 crypto/clmul.c | 111 ++++++++++++++++++
 target/arm/tcg/mve_helper.c | 16 +--
 target/arm/tcg/vec_helper.c | 102 ++---------------
 target/ppc/int_helper.c | 64 +++++------
 target/s390x/tcg/vec_int_helper.c | 186 ++++++++++++++-----------------
 util/cpuinfo-aarch64.c | 4 +-
 util/cpuinfo-i386.c | 1 +
 crypto/meson.build | 9 +-
 18 files changed, 433 insertions(+), 285 deletions(-)
 create mode 100644 host/include/aarch64/host/crypto/clmul.h
 create mode 100644 host/include/generic/host/crypto/clmul.h
 create mode 100644 host/include/i386/host/crypto/clmul.h
 create mode 100644 host/include/x86_64/host/crypto/clmul.h
 create mode 100644 include/crypto/clmul.h
 create mode 100644 crypto/clmul.c

--
2.34.1
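
For readers coming to this series cold: a carry-less multiply is multiplication of polynomials over GF(2), i.e. an integer multiply in which the partial products are combined with XOR instead of addition, so nothing ever carries between bit positions. The 64x64->128 form is the building block of GHASH in AES-GCM and of PCLMUL-based CRC folding, which is why it is the case worth accelerating. As orientation only, a stand-alone reference might look like the sketch below; it is not taken from the series, and the name clmul64_ref and the use of unsigned __int128 are illustrative assumptions (the series' portable version is clmul_64_gen() in crypto/clmul.c).

#include <stdint.h>

/*
 * Bit-by-bit reference for a 64x64->128 carry-less multiply:
 * XOR together the copies of m shifted by each set bit of n.
 */
static unsigned __int128 clmul64_ref(uint64_t n, uint64_t m)
{
    unsigned __int128 r = 0;

    for (int i = 0; i < 64; i++) {
        if ((n >> i) & 1) {
            r ^= (unsigned __int128)m << i;
        }
    }
    return r;
}

/*
 * Example: 0x3 * 0x3 = 9, but clmul64_ref(0x3, 0x3) = 0x5,
 * i.e. (x + 1)^2 = x^2 + 1, because the cross terms cancel under XOR.
 */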
1
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2
---
3
---
3
host/include/generic/host/crypto/clmul.h | 17 ++++++
4
include/crypto/clmul.h | 41 +++++++++++++++++++++++++++++
4
include/crypto/clmul.h | 61 +++++++++++++++++++
5
crypto/clmul.c | 60 ++++++++++++++++++++++++++++++++++++++++++
5
crypto/clmul.c | 76 ++++++++++++++++++++++++
6
crypto/meson.build | 9 ++++---
6
crypto/meson.build | 9 ++-
7
3 files changed, 107 insertions(+), 3 deletions(-)
7
4 files changed, 160 insertions(+), 3 deletions(-)
8
create mode 100644 host/include/generic/host/crypto/clmul.h
9
create mode 100644 include/crypto/clmul.h
8
create mode 100644 include/crypto/clmul.h
10
create mode 100644 crypto/clmul.c
9
create mode 100644 crypto/clmul.c
11
10
12
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
13
new file mode 100644
14
index XXXXXXX..XXXXXXX
15
--- /dev/null
16
+++ b/host/include/generic/host/crypto/clmul.h
17
@@ -XXX,XX +XXX,XX @@
18
+/*
19
+ * No host specific carry-less multiply acceleration.
20
+ * SPDX-License-Identifier: GPL-2.0-or-later
21
+ */
22
+
23
+#ifndef GENERIC_HOST_CRYPTO_CLMUL_H
24
+#define GENERIC_HOST_CRYPTO_CLMUL_H
25
+
26
+/* Defer everything to the generic routines. */
27
+#define clmul_8x8_low clmul_8x8_low_gen
28
+#define clmul_8x4_even clmul_8x4_even_gen
29
+#define clmul_8x4_odd clmul_8x4_odd_gen
30
+#define clmul_8x8_even clmul_8x8_even_gen
31
+#define clmul_8x8_odd clmul_8x8_odd_gen
32
+#define clmul_8x8_packed clmul_8x8_packed_gen
33
+
34
+#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
35
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
11
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
36
new file mode 100644
12
new file mode 100644
37
index XXXXXXX..XXXXXXX
13
index XXXXXXX..XXXXXXX
38
--- /dev/null
14
--- /dev/null
39
+++ b/include/crypto/clmul.h
15
+++ b/include/crypto/clmul.h
40
@@ -XXX,XX +XXX,XX @@
16
@@ -XXX,XX +XXX,XX @@
41
+/*
17
+/*
42
+ * Carry-less multiply
18
+ * Carry-less multiply operations.
43
+ * SPDX-License-Identifier: GPL-2.0-or-later
19
+ * SPDX-License-Identifier: GPL-2.0-or-later
44
+ *
20
+ *
45
+ * Copyright (C) 2023 Linaro, Ltd.
21
+ * Copyright (C) 2023 Linaro, Ltd.
46
+ */
22
+ */
47
+
23
+
48
+#ifndef CRYPTO_CLMUL_H
24
+#ifndef CRYPTO_CLMUL_H
49
+#define CRYPTO_CLMUL_H
25
+#define CRYPTO_CLMUL_H
50
+
26
+
51
+#include "qemu/int128.h"
52
+
53
+/**
27
+/**
54
+ * clmul_8x8_low:
28
+ * clmul_8x8_low:
55
+ *
29
+ *
56
+ * Perform eight 8x8->8 carry-less multiplies.
30
+ * Perform eight 8x8->8 carry-less multiplies.
57
+ */
31
+ */
58
+uint64_t clmul_8x8_low_gen(uint64_t, uint64_t);
32
+uint64_t clmul_8x8_low(uint64_t, uint64_t);
59
+
33
+
60
+/**
34
+/**
61
+ * clmul_8x4_even:
35
+ * clmul_8x4_even:
62
+ *
36
+ *
63
+ * Perform four 8x8->16 carry-less multiplies.
37
+ * Perform four 8x8->16 carry-less multiplies.
64
+ * The odd bytes of the inputs are ignored.
38
+ * The odd bytes of the inputs are ignored.
65
+ */
39
+ */
66
+uint64_t clmul_8x4_even_gen(uint64_t, uint64_t);
40
+uint64_t clmul_8x4_even(uint64_t, uint64_t);
67
+
41
+
68
+/**
42
+/**
69
+ * clmul_8x4_odd:
43
+ * clmul_8x4_odd:
70
+ *
44
+ *
71
+ * Perform four 8x8->16 carry-less multiplies.
45
+ * Perform four 8x8->16 carry-less multiplies.
72
+ * The even bytes of the inputs are ignored.
46
+ * The even bytes of the inputs are ignored.
73
+ */
47
+ */
74
+uint64_t clmul_8x4_odd_gen(uint64_t, uint64_t);
48
+uint64_t clmul_8x4_odd(uint64_t, uint64_t);
75
+
49
+
76
+/**
50
+/**
77
+ * clmul_8x8_even:
51
+ * clmul_8x4_packed:
78
+ *
52
+ *
79
+ * Perform eight 8x8->16 carry-less multiplies.
53
+ * Perform four 8x8->16 carry-less multiplies.
80
+ * The odd bytes of the inputs are ignored.
81
+ */
54
+ */
82
+Int128 clmul_8x8_even_gen(Int128, Int128);
55
+uint64_t clmul_8x4_packed(uint32_t, uint32_t);
83
+
84
+/**
85
+ * clmul_8x8_odd:
86
+ *
87
+ * Perform eight 8x8->16 carry-less multiplies.
88
+ * The even bytes of the inputs are ignored.
89
+ */
90
+Int128 clmul_8x8_odd_gen(Int128, Int128);
91
+
92
+/**
93
+ * clmul_8x8_packed:
94
+ *
95
+ * Perform eight 8x8->16 carry-less multiplies.
96
+ */
97
+Int128 clmul_8x8_packed_gen(uint64_t, uint64_t);
98
+
99
+#include "host/crypto/clmul.h"
100
+
56
+
101
+#endif /* CRYPTO_CLMUL_H */
57
+#endif /* CRYPTO_CLMUL_H */
102
diff --git a/crypto/clmul.c b/crypto/clmul.c
58
diff --git a/crypto/clmul.c b/crypto/clmul.c
103
new file mode 100644
59
new file mode 100644
104
index XXXXXXX..XXXXXXX
60
index XXXXXXX..XXXXXXX
105
--- /dev/null
61
--- /dev/null
106
+++ b/crypto/clmul.c
62
+++ b/crypto/clmul.c
107
@@ -XXX,XX +XXX,XX @@
63
@@ -XXX,XX +XXX,XX @@
108
+/*
64
+/*
109
+ * No host specific carry-less multiply acceleration.
65
+ * Carry-less multiply operations.
110
+ * SPDX-License-Identifier: GPL-2.0-or-later
66
+ * SPDX-License-Identifier: GPL-2.0-or-later
67
+ *
68
+ * Copyright (C) 2023 Linaro, Ltd.
111
+ */
69
+ */
112
+
70
+
113
+#include "qemu/osdep.h"
71
+#include "qemu/osdep.h"
114
+#include "crypto/clmul.h"
72
+#include "crypto/clmul.h"
115
+
73
+
116
+uint64_t clmul_8x8_low_gen(uint64_t n, uint64_t m)
74
+uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
117
+{
75
+{
118
+ uint64_t r = 0;
76
+ uint64_t r = 0;
119
+
77
+
120
+ for (int i = 0; i < 8; ++i) {
78
+ for (int i = 0; i < 8; ++i) {
121
+ uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
79
+ uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
...
...
124
+ n >>= 1;
82
+ n >>= 1;
125
+ }
83
+ }
126
+ return r;
84
+ return r;
127
+}
85
+}
128
+
86
+
129
+uint64_t clmul_8x4_even_gen(uint64_t n, uint64_t m)
87
+static uint64_t clmul_8x4_even_int(uint64_t n, uint64_t m)
130
+{
88
+{
131
+ uint64_t r = 0;
89
+ uint64_t r = 0;
132
+
133
+ n &= 0x00ff00ff00ff00ffull;
134
+ m &= 0x00ff00ff00ff00ffull;
135
+
90
+
136
+ for (int i = 0; i < 8; ++i) {
91
+ for (int i = 0; i < 8; ++i) {
137
+ uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
92
+ uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
138
+ r ^= m & mask;
93
+ r ^= m & mask;
139
+ n >>= 1;
94
+ n >>= 1;
140
+ m <<= 1;
95
+ m <<= 1;
141
+ }
96
+ }
142
+ return r;
97
+ return r;
143
+}
98
+}
144
+
99
+
145
+uint64_t clmul_8x4_odd_gen(uint64_t n, uint64_t m)
100
+uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
146
+{
101
+{
147
+ return clmul_8x4_even_gen(n >> 8, m >> 8);
102
+ n &= 0x00ff00ff00ff00ffull;
103
+ m &= 0x00ff00ff00ff00ffull;
104
+ return clmul_8x4_even_int(n, m);
148
+}
105
+}
149
+
106
+
150
+Int128 clmul_8x8_even_gen(Int128 n, Int128 m)
107
+uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
151
+{
108
+{
152
+ uint64_t rl, rh;
109
+ return clmul_8x4_even(n >> 8, m >> 8);
153
+
154
+ rl = clmul_8x4_even_gen(int128_getlo(n), int128_getlo(m));
155
+ rh = clmul_8x4_even_gen(int128_gethi(n), int128_gethi(m));
156
+ return int128_make128(rl, rh);
157
+}
158
+
159
+Int128 clmul_8x8_odd_gen(Int128 n, Int128 m)
160
+{
161
+ uint64_t rl, rh;
162
+
163
+ rl = clmul_8x4_odd_gen(int128_getlo(n), int128_getlo(m));
164
+ rh = clmul_8x4_odd_gen(int128_gethi(n), int128_gethi(m));
165
+ return int128_make128(rl, rh);
166
+}
110
+}
167
+
111
+
168
+static uint64_t unpack_8_to_16(uint64_t x)
112
+static uint64_t unpack_8_to_16(uint64_t x)
169
+{
113
+{
170
+ return (x & 0x000000ff)
114
+ return (x & 0x000000ff)
171
+ | ((x & 0x0000ff00) << 8)
115
+ | ((x & 0x0000ff00) << 8)
172
+ | ((x & 0x00ff0000) << 16)
116
+ | ((x & 0x00ff0000) << 16)
173
+ | ((x & 0xff000000) << 24);
117
+ | ((x & 0xff000000) << 24);
174
+}
118
+}
175
+
119
+
176
+Int128 clmul_8x8_packed_gen(uint64_t n, uint64_t m)
120
+uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
177
+{
121
+{
178
+ uint64_t rl, rh;
122
+ return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
179
+
180
+ rl = clmul_8x4_even_gen(unpack_8_to_16(n), unpack_8_to_16(m));
181
+ rh = clmul_8x4_even_gen(unpack_8_to_16(n >> 32), unpack_8_to_16(m >> 32));
182
+ return int128_make128(rl, rh);
183
+}
123
+}
184
diff --git a/crypto/meson.build b/crypto/meson.build
124
diff --git a/crypto/meson.build b/crypto/meson.build
185
index XXXXXXX..XXXXXXX 100644
125
index XXXXXXX..XXXXXXX 100644
186
--- a/crypto/meson.build
126
--- a/crypto/meson.build
187
+++ b/crypto/meson.build
127
+++ b/crypto/meson.build
...
...
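Since the cover letter notes the lack of test cases, a host-side spot check of the byte-lane helpers is easy to sketch. The following assumes the final API from include/crypto/clmul.h (clmul_8x4_even() and clmul_8x4_odd()); the scalar reference clmul8_ref() is written here for illustration and is not part of the series.

#include <assert.h>
#include <stdint.h>
#include "crypto/clmul.h"

/* Plain 8x8->16 carry-less multiply of two bytes. */
static uint16_t clmul8_ref(uint8_t a, uint8_t b)
{
    uint16_t r = 0;

    for (int i = 0; i < 8; i++) {
        if ((a >> i) & 1) {
            r ^= (uint16_t)b << i;
        }
    }
    return r;
}

/* Each 16-bit lane of the result holds one widened byte product. */
static void check_8x4(uint64_t n, uint64_t m)
{
    uint64_t e = clmul_8x4_even(n, m);
    uint64_t o = clmul_8x4_odd(n, m);

    for (int lane = 0; lane < 4; lane++) {
        uint8_t ne = n >> (16 * lane), no = n >> (16 * lane + 8);
        uint8_t me = m >> (16 * lane), mo = m >> (16 * lane + 8);

        assert((uint16_t)(e >> (16 * lane)) == clmul8_ref(ne, me));
        assert((uint16_t)(o >> (16 * lane)) == clmul8_ref(no, mo));
    }
}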
1
Use generic routines for 8-bit carry-less multiply.
1
Use generic routines for 8-bit carry-less multiply.
2
Remove our local version of pmull_h.
2
Remove our local version of pmull_h.
3
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
target/arm/tcg/vec_internal.h | 5 ---
7
target/arm/tcg/vec_internal.h | 5 ----
7
target/arm/tcg/mve_helper.c | 8 ++---
8
target/arm/tcg/mve_helper.c | 8 ++----
8
target/arm/tcg/vec_helper.c | 63 +++++++----------------------------
9
target/arm/tcg/vec_helper.c | 53 ++++-------------------------------
9
3 files changed, 15 insertions(+), 61 deletions(-)
10
3 files changed, 9 insertions(+), 57 deletions(-)
10
11
11
diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h
12
diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h
12
index XXXXXXX..XXXXXXX 100644
13
index XXXXXXX..XXXXXXX 100644
13
--- a/target/arm/tcg/vec_internal.h
14
--- a/target/arm/tcg/vec_internal.h
14
+++ b/target/arm/tcg/vec_internal.h
15
+++ b/target/arm/tcg/vec_internal.h
...
...
132
-
133
-
133
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
134
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
134
{
135
{
135
int hi = simd_data(desc);
136
int hi = simd_data(desc);
136
uint64_t *d = vd, *n = vn, *m = vm;
137
uint64_t *d = vd, *n = vn, *m = vm;
137
- uint64_t nn = n[hi], mm = m[hi];
138
uint64_t nn = n[hi], mm = m[hi];
138
-
139
139
- d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
140
- d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
140
- nn >>= 32;
141
+ d[0] = clmul_8x4_packed(nn, mm);
141
- mm >>= 32;
142
nn >>= 32;
143
mm >>= 32;
142
- d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
144
- d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
143
+ Int128 r = clmul_8x8_packed(n[hi], m[hi]);
145
+ d[1] = clmul_8x4_packed(nn, mm);
144
146
145
+ d[0] = int128_getlo(r);
146
+ d[1] = int128_gethi(r);
147
clear_tail(d, 16, simd_maxsz(desc));
147
clear_tail(d, 16, simd_maxsz(desc));
148
}
148
}
149
150
@@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
149
@@ -XXX,XX +XXX,XX @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
151
intptr_t i, opr_sz = simd_oprsz(desc);
152
uint64_t *d = vd, *n = vn, *m = vm;
150
uint64_t *d = vd, *n = vn, *m = vm;
153
151
154
- for (i = 0; i < opr_sz / 8; ++i) {
152
for (i = 0; i < opr_sz / 8; ++i) {
155
- uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
153
- uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
156
- uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
154
- uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
157
+ for (i = 0; i < opr_sz / 8; i += 2) {
155
-
158
+ Int128 nn = int128_make128(n[i] >> shift, n[i + 1] >> shift);
159
+ Int128 mm = int128_make128(m[i] >> shift, m[i + 1] >> shift);
160
+ Int128 r = clmul_8x8_even(nn, mm);
161
162
- d[i] = pmull_h(nn, mm);
156
- d[i] = pmull_h(nn, mm);
163
+ d[0] = int128_getlo(r);
157
+ d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
164
+ d[1] = int128_gethi(r);
165
}
158
}
166
}
159
}
167
160
168
--
161
--
169
2.34.1
162
2.34.1
163
164
1
Use generic routines for 8-bit carry-less multiply.
1
Use generic routines for 8-bit carry-less multiply.
2
Remove our local version of galois_multiply8.
2
Remove our local version of galois_multiply8.
3
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
target/s390x/tcg/vec_int_helper.c | 27 ++++++++++++++++++++++++---
7
target/s390x/tcg/vec_int_helper.c | 32 ++++++++++++++++++++++++++++---
7
1 file changed, 24 insertions(+), 3 deletions(-)
8
1 file changed, 29 insertions(+), 3 deletions(-)
8
9
9
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
10
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
10
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
11
--- a/target/s390x/tcg/vec_int_helper.c
12
--- a/target/s390x/tcg/vec_int_helper.c
12
+++ b/target/s390x/tcg/vec_int_helper.c
13
+++ b/target/s390x/tcg/vec_int_helper.c
...
...
28
29
29
@@ -XXX,XX +XXX,XX @@ static S390Vector galois_multiply64(uint64_t a, uint64_t b)
30
@@ -XXX,XX +XXX,XX @@ static S390Vector galois_multiply64(uint64_t a, uint64_t b)
30
return res;
31
return res;
31
}
32
}
32
33
33
+static Int128 do_gfm8(Int128 n, Int128 m)
34
+/*
35
+ * There is no carry across the two doublewords, so their order does
36
+ * not matter. Nor is there partial overlap between registers.
37
+ */
38
+static inline uint64_t do_gfma8(uint64_t n, uint64_t m, uint64_t a)
34
+{
39
+{
35
+ Int128 e = clmul_8x8_even(n, m);
40
+ return clmul_8x4_even(n, m) ^ clmul_8x4_odd(n, m) ^ a;
36
+ Int128 o = clmul_8x8_odd(n, m);
37
+ return int128_xor(e, o);
38
+}
41
+}
39
+
42
+
40
+void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d)
43
+void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d)
41
+{
44
+{
42
+ /*
45
+ uint64_t *q1 = v1;
43
+ * There is no carry across the two doublewords, so their order
46
+ const uint64_t *q2 = v2, *q3 = v3;
44
+ * does not matter, so we need not care for host endianness.
47
+
45
+ */
48
+ q1[0] = do_gfma8(q2[0], q3[0], 0);
46
+ *(Int128 *)v1 = do_gfm8(*(const Int128 *)v2, *(const Int128 *)v3);
49
+ q1[1] = do_gfma8(q2[1], q3[1], 0);
47
+}
50
+}
48
+
51
+
49
+void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
52
+void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
50
+ const void *v4, uint32_t d)
53
+ const void *v4, uint32_t desc)
51
+{
54
+{
52
+ Int128 r = do_gfm8(*(const Int128 *)v2, *(const Int128 *)v3);
55
+ uint64_t *q1 = v1;
53
+ *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
56
+ const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
57
+
58
+ q1[0] = do_gfma8(q2[0], q3[0], q4[0]);
59
+ q1[1] = do_gfma8(q2[1], q3[1], q4[1]);
54
+}
60
+}
55
+
61
+
56
#define DEF_VGFM(BITS, TBITS) \
62
#define DEF_VGFM(BITS, TBITS) \
57
void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
63
void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
58
uint32_t desc) \
64
uint32_t desc) \
...
...
72
DEF_VGFMA(16, 32)
78
DEF_VGFMA(16, 32)
73
DEF_VGFMA(32, 64)
79
DEF_VGFMA(32, 64)
74
80
75
--
81
--
76
2.34.1
82
2.34.1
83
84
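Why even ^ odd is the right combination here: s390x VGFM with byte elements produces, in each 16-bit lane of the result, the XOR of the two widened byte products that fall into that lane. clmul_8x4_even() supplies the products of byte lanes 0/2/4/6 and clmul_8x4_odd() those of lanes 1/3/5/7, so XORing them (plus the accumulator, for VGFMA) yields the architected result. A per-lane restatement, reusing the illustrative clmul8_ref() sketched after the first patch (not code from this patch):

/*
 * One 16-bit lane of VGFM/VGFMA with byte elements, as computed by
 * do_gfma8() for all four lanes of a doubleword at once.
 */
static uint16_t vgfm8_lane_ref(uint8_t n_even, uint8_t n_odd,
                               uint8_t m_even, uint8_t m_odd, uint16_t acc)
{
    return clmul8_ref(n_even, m_even) ^ clmul8_ref(n_odd, m_odd) ^ acc;
}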
1
Use generic routines for 8-bit carry-less multiply.
1
Use generic routines for 8-bit carry-less multiply.
2
2
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
4
---
5
target/ppc/int_helper.c | 11 ++++++++++-
5
target/ppc/int_helper.c | 14 +++++++++++++-
6
1 file changed, 10 insertions(+), 1 deletion(-)
6
1 file changed, 13 insertions(+), 1 deletion(-)
7
7
8
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
8
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
9
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/ppc/int_helper.c
10
--- a/target/ppc/int_helper.c
11
+++ b/target/ppc/int_helper.c
11
+++ b/target/ppc/int_helper.c
...
...
19
#include "qemu/guest-random.h"
19
#include "qemu/guest-random.h"
20
@@ -XXX,XX +XXX,XX @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
20
@@ -XXX,XX +XXX,XX @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
21
#undef VBPERMQ_INDEX
21
#undef VBPERMQ_INDEX
22
#undef VBPERMQ_DW
22
#undef VBPERMQ_DW
23
23
24
+/*
25
+ * There is no carry across the two doublewords, so their order does
26
+ * not matter. Nor is there partial overlap between registers.
27
+ */
24
+void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
28
+void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
25
+{
29
+{
26
+ Int128 ia = a->s128;
30
+ for (int i = 0; i < 2; ++i) {
27
+ Int128 ib = b->s128;
31
+ uint64_t aa = a->u64[i], bb = b->u64[i];
28
+ Int128 e = clmul_8x8_even(ia, ib);
32
+ r->u64[i] = clmul_8x4_even(aa, bb) ^ clmul_8x4_odd(aa, bb);
29
+ Int128 o = clmul_8x8_odd(ia, ib);
33
+ }
30
+ r->s128 = int128_xor(e, o);
31
+}
34
+}
32
+
35
+
33
#define PMSUM(name, srcfld, trgfld, trgtyp) \
36
#define PMSUM(name, srcfld, trgfld, trgtyp) \
34
void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
37
void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
35
{ \
38
{ \
...
...
1
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2
---
3
---
3
host/include/generic/host/crypto/clmul.h | 5 +++
4
include/crypto/clmul.h | 16 ++++++++++++++++
4
include/crypto/clmul.h | 32 +++++++++++++++++++
5
crypto/clmul.c | 21 +++++++++++++++++++++
5
crypto/clmul.c | 39 ++++++++++++++++++++++++
6
2 files changed, 37 insertions(+)
6
3 files changed, 76 insertions(+)
7
7
8
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
9
index XXXXXXX..XXXXXXX 100644
10
--- a/host/include/generic/host/crypto/clmul.h
11
+++ b/host/include/generic/host/crypto/clmul.h
12
@@ -XXX,XX +XXX,XX @@
13
#define clmul_8x8_odd clmul_8x8_odd_gen
14
#define clmul_8x8_packed clmul_8x8_packed_gen
15
16
+#define clmul_16x2_even clmul_16x2_even_gen
17
+#define clmul_16x2_odd clmul_16x2_odd_gen
18
+#define clmul_16x4_even clmul_16x4_even_gen
19
+#define clmul_16x4_odd clmul_16x4_odd_gen
20
+
21
#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
22
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
8
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
23
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
24
--- a/include/crypto/clmul.h
10
--- a/include/crypto/clmul.h
25
+++ b/include/crypto/clmul.h
11
+++ b/include/crypto/clmul.h
26
@@ -XXX,XX +XXX,XX @@ Int128 clmul_8x8_odd_gen(Int128, Int128);
12
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_8x4_odd(uint64_t, uint64_t);
27
*/
13
*/
28
Int128 clmul_8x8_packed_gen(uint64_t, uint64_t);
14
uint64_t clmul_8x4_packed(uint32_t, uint32_t);
29
15
30
+/**
16
+/**
31
+ * clmul_16x2_even:
17
+ * clmul_16x2_even:
32
+ *
18
+ *
33
+ * Perform two 16x16->32 carry-less multiplies.
19
+ * Perform two 16x16->32 carry-less multiplies.
34
+ * The odd words of the inputs are ignored.
20
+ * The odd words of the inputs are ignored.
35
+ */
21
+ */
36
+uint64_t clmul_16x2_even_gen(uint64_t, uint64_t);
22
+uint64_t clmul_16x2_even(uint64_t, uint64_t);
37
+
23
+
38
+/**
24
+/**
39
+ * clmul_16x2_odd:
25
+ * clmul_16x2_odd:
40
+ *
26
+ *
41
+ * Perform two 16x16->32 carry-less multiplies.
27
+ * Perform two 16x16->32 carry-less multiplies.
42
+ * The even bytes of the inputs are ignored.
28
+ * The even words of the inputs are ignored.
43
+ */
29
+ */
44
+uint64_t clmul_16x2_odd_gen(uint64_t, uint64_t);
30
+uint64_t clmul_16x2_odd(uint64_t, uint64_t);
45
+
31
+
46
+/**
47
+ * clmul_16x4_even:
48
+ *
49
+ * Perform four 16x16->32 carry-less multiplies.
50
+ * The odd bytes of the inputs are ignored.
51
+ */
52
+Int128 clmul_16x4_even_gen(Int128, Int128);
53
+
54
+/**
55
+ * clmul_16x4_odd:
56
+ *
57
+ * Perform eight 16x16->32 carry-less multiplies.
58
+ * The even bytes of the inputs are ignored.
59
+ */
60
+Int128 clmul_16x4_odd_gen(Int128, Int128);
61
+
62
#include "host/crypto/clmul.h"
63
64
#endif /* CRYPTO_CLMUL_H */
32
#endif /* CRYPTO_CLMUL_H */
65
diff --git a/crypto/clmul.c b/crypto/clmul.c
33
diff --git a/crypto/clmul.c b/crypto/clmul.c
66
index XXXXXXX..XXXXXXX 100644
34
index XXXXXXX..XXXXXXX 100644
67
--- a/crypto/clmul.c
35
--- a/crypto/clmul.c
68
+++ b/crypto/clmul.c
36
+++ b/crypto/clmul.c
69
@@ -XXX,XX +XXX,XX @@ Int128 clmul_8x8_packed_gen(uint64_t n, uint64_t m)
37
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
70
rh = clmul_8x4_even_gen(unpack_8_to_16(n >> 32), unpack_8_to_16(m >> 32));
38
{
71
return int128_make128(rl, rh);
39
return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
72
}
40
}
73
+
41
+
74
+uint64_t clmul_16x2_even_gen(uint64_t n, uint64_t m)
42
+uint64_t clmul_16x2_even(uint64_t n, uint64_t m)
75
+{
43
+{
76
+ uint64_t r = 0;
44
+ uint64_t r = 0;
77
+
45
+
78
+ n &= 0x0000ffff0000ffffull;
46
+ n &= 0x0000ffff0000ffffull;
79
+ m &= 0x0000ffff0000ffffull;
47
+ m &= 0x0000ffff0000ffffull;
...
...
85
+ m <<= 1;
53
+ m <<= 1;
86
+ }
54
+ }
87
+ return r;
55
+ return r;
88
+}
56
+}
89
+
57
+
90
+uint64_t clmul_16x2_odd_gen(uint64_t n, uint64_t m)
58
+uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
91
+{
59
+{
92
+ return clmul_16x2_even_gen(n >> 16, m >> 16);
60
+ return clmul_16x2_even(n >> 16, m >> 16);
93
+}
94
+
95
+Int128 clmul_16x4_even_gen(Int128 n, Int128 m)
96
+{
97
+ uint64_t rl, rh;
98
+
99
+ rl = clmul_16x2_even_gen(int128_getlo(n), int128_getlo(m));
100
+ rh = clmul_16x2_even_gen(int128_gethi(n), int128_gethi(m));
101
+ return int128_make128(rl, rh);
102
+}
103
+
104
+Int128 clmul_16x4_odd_gen(Int128 n, Int128 m)
105
+{
106
+ uint64_t rl, rh;
107
+
108
+ rl = clmul_16x2_odd_gen(int128_getlo(n), int128_getlo(m));
109
+ rh = clmul_16x2_odd_gen(int128_gethi(n), int128_gethi(m));
110
+ return int128_make128(rl, rh);
111
+}
61
+}
112
--
62
--
113
2.34.1
63
2.34.1
1
Use generic routines for 16-bit carry-less multiply.
1
Use generic routines for 16-bit carry-less multiply.
2
Remove our local version of pmull_w.
2
Remove our local version of pmull_w.
3
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
target/arm/tcg/vec_internal.h | 6 ------
7
target/arm/tcg/vec_internal.h | 6 ------
7
target/arm/tcg/mve_helper.c | 8 ++------
8
target/arm/tcg/mve_helper.c | 8 ++------
8
target/arm/tcg/vec_helper.c | 13 -------------
9
target/arm/tcg/vec_helper.c | 13 -------------
...
...
70
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
71
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
71
{
72
{
72
int hi = simd_data(desc);
73
int hi = simd_data(desc);
73
--
74
--
74
2.34.1
75
2.34.1
76
77
1
Use generic routines for 16-bit carry-less multiply.
1
Use generic routines for 16-bit carry-less multiply.
2
Remove our local version of galois_multiply16.
2
Remove our local version of galois_multiply16.
3
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
target/s390x/tcg/vec_int_helper.c | 22 +++++++++++++++++++---
7
target/s390x/tcg/vec_int_helper.c | 27 ++++++++++++++++++++++++---
7
1 file changed, 19 insertions(+), 3 deletions(-)
8
1 file changed, 24 insertions(+), 3 deletions(-)
8
9
9
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
10
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
10
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
11
--- a/target/s390x/tcg/vec_int_helper.c
12
--- a/target/s390x/tcg/vec_int_helper.c
12
+++ b/target/s390x/tcg/vec_int_helper.c
13
+++ b/target/s390x/tcg/vec_int_helper.c
...
...
17
-DEF_GALOIS_MULTIPLY(16, 32)
18
-DEF_GALOIS_MULTIPLY(16, 32)
18
DEF_GALOIS_MULTIPLY(32, 64)
19
DEF_GALOIS_MULTIPLY(32, 64)
19
20
20
static S390Vector galois_multiply64(uint64_t a, uint64_t b)
21
static S390Vector galois_multiply64(uint64_t a, uint64_t b)
21
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
22
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
22
*(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
23
q1[1] = do_gfma8(q2[1], q3[1], q4[1]);
23
}
24
}
24
25
25
+static Int128 do_gfm16(Int128 n, Int128 m)
26
+static inline uint64_t do_gfma16(uint64_t n, uint64_t m, uint64_t a)
26
+{
27
+{
27
+ Int128 e = clmul_16x4_even(n, m);
28
+ return clmul_16x2_even(n, m) ^ clmul_16x2_odd(n, m) ^ a;
28
+ Int128 o = clmul_16x4_odd(n, m);
29
+ return int128_xor(e, o);
30
+}
29
+}
31
+
30
+
32
+void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d)
31
+void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d)
33
+{
32
+{
34
+ *(Int128 *)v1 = do_gfm16(*(const Int128 *)v2, *(const Int128 *)v3);
33
+ uint64_t *q1 = v1;
34
+ const uint64_t *q2 = v2, *q3 = v3;
35
+
36
+ q1[0] = do_gfma16(q2[0], q3[0], 0);
37
+ q1[1] = do_gfma16(q2[1], q3[1], 0);
35
+}
38
+}
36
+
39
+
37
+void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
40
+void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
38
+ const void *v4, uint32_t d)
41
+ const void *v4, uint32_t d)
39
+{
42
+{
40
+ Int128 r = do_gfm16(*(const Int128 *)v2, *(const Int128 *)v3);
43
+ uint64_t *q1 = v1;
41
+ *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
44
+ const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
45
+
46
+ q1[0] = do_gfma16(q2[0], q3[0], q4[0]);
47
+ q1[1] = do_gfma16(q2[1], q3[1], q4[1]);
42
+}
48
+}
43
+
49
+
44
#define DEF_VGFM(BITS, TBITS) \
50
#define DEF_VGFM(BITS, TBITS) \
45
void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
51
void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
46
uint32_t desc) \
52
uint32_t desc) \
...
...
60
DEF_VGFMA(32, 64)
66
DEF_VGFMA(32, 64)
61
67
62
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
68
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
63
--
69
--
64
2.34.1
70
2.34.1
71
72
1
Use generic routines for 16-bit carry-less multiply.
1
Use generic routines for 16-bit carry-less multiply.
2
2
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
4
---
5
target/ppc/int_helper.c | 10 +++++++++-
5
target/ppc/int_helper.c | 9 ++++++++-
6
1 file changed, 9 insertions(+), 1 deletion(-)
6
1 file changed, 8 insertions(+), 1 deletion(-)
7
7
8
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
8
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
9
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/ppc/int_helper.c
10
--- a/target/ppc/int_helper.c
11
+++ b/target/ppc/int_helper.c
11
+++ b/target/ppc/int_helper.c
12
@@ -XXX,XX +XXX,XX @@ void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
12
@@ -XXX,XX +XXX,XX @@ void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
13
r->s128 = int128_xor(e, o);
13
}
14
}
14
}
15
15
16
+void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
16
+void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
17
+{
17
+{
18
+ Int128 ia = a->s128;
18
+ for (int i = 0; i < 2; ++i) {
19
+ Int128 ib = b->s128;
19
+ uint64_t aa = a->u64[i], bb = b->u64[i];
20
+ Int128 e = clmul_16x4_even(ia, ib);
20
+ r->u64[i] = clmul_16x2_even(aa, bb) ^ clmul_16x2_odd(aa, bb);
21
+ Int128 o = clmul_16x4_odd(ia, ib);
21
+ }
22
+ r->s128 = int128_xor(e, o);
23
+}
22
+}
24
+
23
+
25
#define PMSUM(name, srcfld, trgfld, trgtyp) \
24
#define PMSUM(name, srcfld, trgfld, trgtyp) \
26
void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
25
void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
27
{ \
26
{ \
...
...
1
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2
---
3
---
3
host/include/generic/host/crypto/clmul.h | 4 +++
4
include/crypto/clmul.h | 7 +++++++
4
include/crypto/clmul.h | 23 ++++++++++++++++++
5
crypto/clmul.c | 13 +++++++++++++
5
crypto/clmul.c | 31 ++++++++++++++++++++++++
6
2 files changed, 20 insertions(+)
6
3 files changed, 58 insertions(+)
7
7
8
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
9
index XXXXXXX..XXXXXXX 100644
10
--- a/host/include/generic/host/crypto/clmul.h
11
+++ b/host/include/generic/host/crypto/clmul.h
12
@@ -XXX,XX +XXX,XX @@
13
#define clmul_16x4_even clmul_16x4_even_gen
14
#define clmul_16x4_odd clmul_16x4_odd_gen
15
16
+#define clmul_32 clmul_32_gen
17
+#define clmul_32x2_even clmul_32x2_even_gen
18
+#define clmul_32x2_odd clmul_32x2_odd_gen
19
+
20
#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
21
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
8
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
22
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
23
--- a/include/crypto/clmul.h
10
--- a/include/crypto/clmul.h
24
+++ b/include/crypto/clmul.h
11
+++ b/include/crypto/clmul.h
25
@@ -XXX,XX +XXX,XX @@ Int128 clmul_16x4_even_gen(Int128, Int128);
12
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_even(uint64_t, uint64_t);
26
*/
13
*/
27
Int128 clmul_16x4_odd_gen(Int128, Int128);
14
uint64_t clmul_16x2_odd(uint64_t, uint64_t);
28
15
29
+/**
16
+/**
30
+ * clmul_32:
17
+ * clmul_32:
31
+ *
18
+ *
32
+ * Perform a 32x32->64 carry-less multiply.
19
+ * Perform a 32x32->64 carry-less multiply.
33
+ */
20
+ */
34
+uint64_t clmul_32_gen(uint32_t, uint32_t);
21
+uint64_t clmul_32(uint32_t, uint32_t);
35
+
22
+
36
+/**
37
+ * clmul_32x2_even:
38
+ *
39
+ * Perform two 32x32->64 carry-less multiplies.
40
+ * The odd words of the inputs are ignored.
41
+ */
42
+Int128 clmul_32x2_even_gen(Int128, Int128);
43
+
44
+/**
45
+ * clmul_32x2_odd:
46
+ *
47
+ * Perform two 32x32->64 carry-less multiplies.
48
+ * The even words of the inputs are ignored.
49
+ */
50
+Int128 clmul_32x2_odd_gen(Int128, Int128);
51
+
52
#include "host/crypto/clmul.h"
53
54
#endif /* CRYPTO_CLMUL_H */
23
#endif /* CRYPTO_CLMUL_H */
55
diff --git a/crypto/clmul.c b/crypto/clmul.c
24
diff --git a/crypto/clmul.c b/crypto/clmul.c
56
index XXXXXXX..XXXXXXX 100644
25
index XXXXXXX..XXXXXXX 100644
57
--- a/crypto/clmul.c
26
--- a/crypto/clmul.c
58
+++ b/crypto/clmul.c
27
+++ b/crypto/clmul.c
59
@@ -XXX,XX +XXX,XX @@ Int128 clmul_16x4_odd_gen(Int128 n, Int128 m)
28
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
60
rh = clmul_16x2_odd_gen(int128_gethi(n), int128_gethi(m));
29
{
61
return int128_make128(rl, rh);
30
return clmul_16x2_even(n >> 16, m >> 16);
62
}
31
}
63
+
32
+
64
+uint64_t clmul_32_gen(uint32_t n, uint32_t m32)
33
+uint64_t clmul_32(uint32_t n, uint32_t m32)
65
+{
34
+{
66
+ uint64_t r = 0;
35
+ uint64_t r = 0;
67
+ uint64_t m = m32;
36
+ uint64_t m = m32;
68
+
37
+
69
+ for (int i = 0; i < 32; ++i) {
38
+ for (int i = 0; i < 32; ++i) {
70
+ r ^= n & 1 ? m : 0;
39
+ r ^= n & 1 ? m : 0;
71
+ n >>= 1;
40
+ n >>= 1;
72
+ m <<= 1;
41
+ m <<= 1;
73
+ }
42
+ }
74
+ return r;
43
+ return r;
75
+}
44
+}
76
+
77
+Int128 clmul_32x2_even_gen(Int128 n, Int128 m)
78
+{
79
+ uint64_t rl, rh;
80
+
81
+ rl = clmul_32_gen(int128_getlo(n), int128_getlo(m));
82
+ rh = clmul_32_gen(int128_gethi(n), int128_gethi(m));
83
+ return int128_make128(rl, rh);
84
+}
85
+
86
+Int128 clmul_32x2_odd_gen(Int128 n, Int128 m)
87
+{
88
+ uint64_t rl, rh;
89
+
90
+ rl = clmul_32_gen(int128_getlo(n) >> 32, int128_getlo(m) >> 32);
91
+ rh = clmul_32_gen(int128_gethi(n) >> 32, int128_gethi(m) >> 32);
92
+ return int128_make128(rl, rh);
93
+}
94
--
45
--
95
2.34.1
46
2.34.1
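One handy property for spot-checking the 32-bit routine (and the narrower ones): squaring is linear over GF(2), so a carry-less multiply of a value with itself simply interleaves its bits with zeros; there are no cross terms. A minimal check, assuming the clmul_32() added by this patch:

#include <assert.h>
#include <stdint.h>
#include "crypto/clmul.h"

static void clmul_32_spot_check(void)
{
    /* (x^7 + ... + x + 1)^2 over GF(2) contains only even powers. */
    assert(clmul_32(0xff, 0xff) == 0x5555);
    /* x^31 squared is x^62. */
    assert(clmul_32(0x80000000u, 0x80000000u) == 0x4000000000000000ull);
    /* Contrast with the ordinary integer product: 0xff * 0xff == 0xfe01. */
}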
1
Use generic routines for 32-bit carry-less multiply.
1
Use generic routines for 32-bit carry-less multiply.
2
Remove our local version of pmull_d.
2
Remove our local version of pmull_d.
3
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
target/arm/tcg/vec_helper.c | 14 +-------------
7
target/arm/tcg/vec_helper.c | 14 +-------------
7
1 file changed, 1 insertion(+), 13 deletions(-)
8
1 file changed, 1 insertion(+), 13 deletions(-)
8
9
...
...
38
}
39
}
39
}
40
}
40
#endif
41
#endif
41
--
42
--
42
2.34.1
43
2.34.1
44
45
1
Use generic routines for 32-bit carry-less multiply.
1
Use generic routines for 32-bit carry-less multiply.
2
Remove our local version of galois_multiply32.
2
Remove our local version of galois_multiply32.
3
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
target/s390x/tcg/vec_int_helper.c | 70 ++++++++-----------------------
7
target/s390x/tcg/vec_int_helper.c | 75 +++++++++----------------------
7
1 file changed, 17 insertions(+), 53 deletions(-)
8
1 file changed, 22 insertions(+), 53 deletions(-)
8
9
9
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
10
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
10
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
11
--- a/target/s390x/tcg/vec_int_helper.c
12
--- a/target/s390x/tcg/vec_int_helper.c
12
+++ b/target/s390x/tcg/vec_int_helper.c
13
+++ b/target/s390x/tcg/vec_int_helper.c
...
...
32
-DEF_GALOIS_MULTIPLY(32, 64)
33
-DEF_GALOIS_MULTIPLY(32, 64)
33
34
34
static S390Vector galois_multiply64(uint64_t a, uint64_t b)
35
static S390Vector galois_multiply64(uint64_t a, uint64_t b)
35
{
36
{
36
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
37
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
37
*(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
38
q1[1] = do_gfma16(q2[1], q3[1], q4[1]);
38
}
39
}
39
40
40
-#define DEF_VGFM(BITS, TBITS) \
41
-#define DEF_VGFM(BITS, TBITS) \
41
-void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
42
-void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
42
- uint32_t desc) \
43
- uint32_t desc) \
...
...
51
- a = s390_vec_read_element##BITS(v2, i * 2 + 1); \
52
- a = s390_vec_read_element##BITS(v2, i * 2 + 1); \
52
- b = s390_vec_read_element##BITS(v3, i * 2 + 1); \
53
- b = s390_vec_read_element##BITS(v3, i * 2 + 1); \
53
- d = d ^ galois_multiply32(a, b); \
54
- d = d ^ galois_multiply32(a, b); \
54
- s390_vec_write_element##TBITS(v1, i, d); \
55
- s390_vec_write_element##TBITS(v1, i, d); \
55
- } \
56
- } \
56
+static Int128 do_gfm32(Int128 n, Int128 m)
57
+static inline uint64_t do_gfma32(uint64_t n, uint64_t m, uint64_t a)
57
+{
58
+{
58
+ Int128 e = clmul_32x2_even(n, m);
59
+ return clmul_32(n, m) ^ clmul_32(n >> 32, m >> 32) ^ a;
59
+ Int128 o = clmul_32x2_odd(n, m);
60
+ return int128_xor(e, o);
61
+}
60
+}
62
+
61
+
63
+void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d)
62
+void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d)
64
+{
63
+{
65
+ *(Int128 *)v1 = do_gfm32(*(const Int128 *)v2, *(const Int128 *)v3);
64
+ uint64_t *q1 = v1;
65
+ const uint64_t *q2 = v2, *q3 = v3;
66
+
67
+ q1[0] = do_gfma32(q2[0], q3[0], 0);
68
+ q1[1] = do_gfma32(q2[1], q3[1], 0);
66
+}
69
+}
67
+
70
+
68
+void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
71
+void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
69
+ const void *v4, uint32_t d)
72
+ const void *v4, uint32_t d)
70
+{
73
+{
71
+ Int128 r = do_gfm32(*(const Int128 *)v2, *(const Int128 *)v3);
74
+ uint64_t *q1 = v1;
72
+ *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
75
+ const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
76
+
77
+ q1[0] = do_gfma32(q2[0], q3[0], q4[0]);
78
+ q1[1] = do_gfma32(q2[1], q3[1], q4[1]);
73
}
79
}
74
-DEF_VGFM(32, 64)
80
-DEF_VGFM(32, 64)
75
81
76
void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
82
void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
77
uint32_t desc)
83
uint32_t desc)
...
...
102
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
108
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
103
const void *v4, uint32_t desc)
109
const void *v4, uint32_t desc)
104
{
110
{
105
--
111
--
106
2.34.1
112
2.34.1
113
114
1
Use generic routines for 32-bit carry-less multiply.
1
Use generic routines for 32-bit carry-less multiply.
2
2
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
4
---
5
target/ppc/int_helper.c | 27 +++++++--------------------
5
target/ppc/int_helper.c | 26 ++++++--------------------
6
1 file changed, 7 insertions(+), 20 deletions(-)
6
1 file changed, 6 insertions(+), 20 deletions(-)
7
7
8
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
8
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
9
index XXXXXXX..XXXXXXX 100644
9
index XXXXXXX..XXXXXXX 100644
10
--- a/target/ppc/int_helper.c
10
--- a/target/ppc/int_helper.c
11
+++ b/target/ppc/int_helper.c
11
+++ b/target/ppc/int_helper.c
12
@@ -XXX,XX +XXX,XX @@ void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
12
@@ -XXX,XX +XXX,XX @@ void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
13
r->s128 = int128_xor(e, o);
13
}
14
}
14
}
15
15
16
-#define PMSUM(name, srcfld, trgfld, trgtyp) \
16
-#define PMSUM(name, srcfld, trgfld, trgtyp) \
17
-void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
17
-void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
18
-{ \
18
-{ \
...
...
31
- VECTOR_FOR_INORDER_I(i, trgfld) { \
31
- VECTOR_FOR_INORDER_I(i, trgfld) { \
32
- r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1]; \
32
- r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1]; \
33
- } \
33
- } \
34
+void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
34
+void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
35
+{
35
+{
36
+ Int128 ia = a->s128;
36
+ for (int i = 0; i < 2; ++i) {
37
+ Int128 ib = b->s128;
37
+ uint64_t aa = a->u64[i], bb = b->u64[i];
38
+ Int128 e = clmul_32x2_even(ia, ib);
38
+ r->u64[i] = clmul_32(aa, bb) ^ clmul_32(aa >> 32, bb >> 32);
39
+ Int128 o = clmul_32x2_odd(ia, ib);
39
+ }
40
+ r->s128 = int128_xor(e, o);
41
}
40
}
42
41
43
-PMSUM(vpmsumw, u32, u64, uint64_t)
42
-PMSUM(vpmsumw, u32, u64, uint64_t)
44
-
43
-
45
void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
44
void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
46
{
45
{
47
int i, j;
46
int i, j;
48
--
47
--
49
2.34.1
48
2.34.1
1
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
1
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2
---
3
---
3
host/include/generic/host/crypto/clmul.h | 2 ++
4
host/include/generic/host/crypto/clmul.h | 15 +++++++++++++++
4
include/crypto/clmul.h | 7 +++++++
5
include/crypto/clmul.h | 19 +++++++++++++++++++
5
crypto/clmul.c | 17 +++++++++++++++++
6
crypto/clmul.c | 17 +++++++++++++++++
6
3 files changed, 26 insertions(+)
7
3 files changed, 51 insertions(+)
8
create mode 100644 host/include/generic/host/crypto/clmul.h
7
9
8
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
10
diff --git a/host/include/generic/host/crypto/clmul.h b/host/include/generic/host/crypto/clmul.h
9
index XXXXXXX..XXXXXXX 100644
11
new file mode 100644
10
--- a/host/include/generic/host/crypto/clmul.h
12
index XXXXXXX..XXXXXXX
13
--- /dev/null
11
+++ b/host/include/generic/host/crypto/clmul.h
14
+++ b/host/include/generic/host/crypto/clmul.h
12
@@ -XXX,XX +XXX,XX @@
15
@@ -XXX,XX +XXX,XX @@
13
#define clmul_32x2_even clmul_32x2_even_gen
16
+/*
14
#define clmul_32x2_odd clmul_32x2_odd_gen
17
+ * No host specific carry-less multiply acceleration.
15
18
+ * SPDX-License-Identifier: GPL-2.0-or-later
16
+#define clmul_64 clmul_64_gen
19
+ */
17
+
20
+
18
#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
21
+#ifndef GENERIC_HOST_CRYPTO_CLMUL_H
22
+#define GENERIC_HOST_CRYPTO_CLMUL_H
23
+
24
+#define HAVE_CLMUL_ACCEL false
25
+#define ATTR_CLMUL_ACCEL
26
+
27
+Int128 clmul_64_accel(uint64_t, uint64_t)
28
+ QEMU_ERROR("unsupported accel");
29
+
30
+#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
19
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
31
diff --git a/include/crypto/clmul.h b/include/crypto/clmul.h
20
index XXXXXXX..XXXXXXX 100644
32
index XXXXXXX..XXXXXXX 100644
21
--- a/include/crypto/clmul.h
33
--- a/include/crypto/clmul.h
22
+++ b/include/crypto/clmul.h
34
+++ b/include/crypto/clmul.h
23
@@ -XXX,XX +XXX,XX @@ Int128 clmul_32x2_even_gen(Int128, Int128);
35
@@ -XXX,XX +XXX,XX @@
36
#ifndef CRYPTO_CLMUL_H
37
#define CRYPTO_CLMUL_H
38
39
+#include "qemu/int128.h"
40
+#include "host/crypto/clmul.h"
41
+
42
/**
43
* clmul_8x8_low:
44
*
45
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_16x2_odd(uint64_t, uint64_t);
24
*/
46
*/
25
Int128 clmul_32x2_odd_gen(Int128, Int128);
47
uint64_t clmul_32(uint32_t, uint32_t);
26
48
27
+/**
49
+/**
28
+ * clmul_64:
50
+ * clmul_64:
29
+ *
51
+ *
30
+ * Perform a 64x64->128 carry-less multiply.
52
+ * Perform a 64x64->128 carry-less multiply.
31
+ */
53
+ */
32
+Int128 clmul_64_gen(uint64_t, uint64_t);
54
+Int128 clmul_64_gen(uint64_t, uint64_t);
33
+
55
+
34
#include "host/crypto/clmul.h"
56
+static inline Int128 clmul_64(uint64_t a, uint64_t b)
35
57
+{
58
+ if (HAVE_CLMUL_ACCEL) {
59
+ return clmul_64_accel(a, b);
60
+ } else {
61
+ return clmul_64_gen(a, b);
62
+ }
63
+}
64
+
36
#endif /* CRYPTO_CLMUL_H */
65
#endif /* CRYPTO_CLMUL_H */
37
diff --git a/crypto/clmul.c b/crypto/clmul.c
66
diff --git a/crypto/clmul.c b/crypto/clmul.c
38
index XXXXXXX..XXXXXXX 100644
67
index XXXXXXX..XXXXXXX 100644
39
--- a/crypto/clmul.c
68
--- a/crypto/clmul.c
40
+++ b/crypto/clmul.c
69
+++ b/crypto/clmul.c
41
@@ -XXX,XX +XXX,XX @@ Int128 clmul_32x2_odd_gen(Int128 n, Int128 m)
70
@@ -XXX,XX +XXX,XX @@ uint64_t clmul_32(uint32_t n, uint32_t m32)
42
rh = clmul_32_gen(int128_gethi(n) >> 32, int128_gethi(m) >> 32);
71
}
43
return int128_make128(rl, rh);
72
return r;
44
}
73
}
45
+
74
+
46
+Int128 clmul_64_gen(uint64_t n, uint64_t m)
75
+Int128 clmul_64_gen(uint64_t n, uint64_t m)
47
+{
76
+{
48
+ uint64_t rl = 0, rh = 0;
77
+ uint64_t rl = 0, rh = 0;
...
...
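The effect of the split introduced here is that call sites stay oblivious to the host: they always call clmul_64(), and the header picks clmul_64_accel() or clmul_64_gen() based on HAVE_CLMUL_ACCEL. A minimal usage sketch; the helper below is illustrative, not one of the target helpers converted later in the series:

#include "qemu/int128.h"
#include "crypto/clmul.h"

/*
 * Multiply two 64-bit polynomials and return the halves of the
 * 128-bit product; dispatch to the host accelerator is hidden
 * inside clmul_64().
 */
static void clmul_64_parts(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
    Int128 r = clmul_64(a, b);

    *lo = int128_getlo(r);
    *hi = int128_gethi(r);
}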
1
Use generic routine for 64-bit carry-less multiply.
1
Use generic routine for 64-bit carry-less multiply.
2
2
3
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
---
5
target/arm/tcg/vec_helper.c | 22 ++++------------------
6
target/arm/tcg/vec_helper.c | 22 ++++------------------
6
1 file changed, 4 insertions(+), 18 deletions(-)
7
1 file changed, 4 insertions(+), 18 deletions(-)
7
8
...
...
42
}
43
}
43
clear_tail(d, opr_sz, simd_maxsz(desc));
44
clear_tail(d, opr_sz, simd_maxsz(desc));
44
}
45
}
45
--
46
--
46
2.34.1
47
2.34.1
48
49
New patch
1
Use generic routine for 64-bit carry-less multiply.
1
2
3
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
target/i386/ops_sse.h | 40 +++++++++-------------------------------
7
1 file changed, 9 insertions(+), 31 deletions(-)
8
9
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
10
index XXXXXXX..XXXXXXX 100644
11
--- a/target/i386/ops_sse.h
12
+++ b/target/i386/ops_sse.h
13
@@ -XXX,XX +XXX,XX @@
14
15
#include "crypto/aes.h"
16
#include "crypto/aes-round.h"
17
+#include "crypto/clmul.h"
18
19
#if SHIFT == 0
20
#define Reg MMXReg
21
@@ -XXX,XX +XXX,XX @@ target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
22
23
#endif
24
25
-#if SHIFT == 1
26
-static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
27
- uint64_t a, uint64_t b)
28
-{
29
- uint64_t al, ah, resh, resl;
30
-
31
- ah = 0;
32
- al = a;
33
- resh = resl = 0;
34
-
35
- while (b) {
36
- if (b & 1) {
37
- resl ^= al;
38
- resh ^= ah;
39
- }
40
- ah = (ah << 1) | (al >> 63);
41
- al <<= 1;
42
- b >>= 1;
43
- }
44
-
45
- *dest_l = resl;
46
- *dest_h = resh;
47
-}
48
-#endif
49
-
50
void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
51
uint32_t ctrl)
52
{
53
- uint64_t a, b;
54
- int i;
55
+ int a_idx = (ctrl & 1) != 0;
56
+ int b_idx = (ctrl & 16) != 0;
57
58
- for (i = 0; i < 1 << SHIFT; i += 2) {
59
- a = v->Q(((ctrl & 1) != 0) + i);
60
- b = s->Q(((ctrl & 16) != 0) + i);
61
- clmulq(&d->Q(i), &d->Q(i + 1), a, b);
62
+ for (int i = 0; i < SHIFT; i++) {
63
+ uint64_t a = v->Q(2 * i + a_idx);
64
+ uint64_t b = s->Q(2 * i + b_idx);
65
+ Int128 *r = (Int128 *)&d->ZMM_X(i);
66
+
67
+ *r = clmul_64(a, b);
68
}
69
}
70
71
--
72
2.34.1
73
74
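The two index computations in this helper decode PCLMULQDQ's immediate: bit 0 selects the low or high quadword of the first source, bit 4 the low or high quadword of the second, independently for each 128-bit lane. A reduced single-lane model (the function and its array parameters are illustrative, not the real Reg layout):

#include "qemu/int128.h"
#include "crypto/clmul.h"

static Int128 pclmulqdq_lane(const uint64_t v[2], const uint64_t s[2],
                             uint32_t ctrl)
{
    uint64_t a = v[(ctrl >> 0) & 1];    /* imm8 bit 0: qword of first source */
    uint64_t b = s[(ctrl >> 4) & 1];    /* imm8 bit 4: qword of second source */

    return clmul_64(a, b);
}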
1
Use the generic routine for 64-bit carry-less multiply.
1
Use the generic routine for 64-bit carry-less multiply.
2
Remove our local version of galois_multiply64.
2
Remove our local version of galois_multiply64.
3
3
4
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
5
---
6
---
6
target/s390x/tcg/vec_int_helper.c | 62 +++++++------------------------
7
target/s390x/tcg/vec_int_helper.c | 58 +++++++------------------------
7
1 file changed, 14 insertions(+), 48 deletions(-)
8
1 file changed, 12 insertions(+), 46 deletions(-)
8
9
9
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
10
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
10
index XXXXXXX..XXXXXXX 100644
11
index XXXXXXX..XXXXXXX 100644
11
--- a/target/s390x/tcg/vec_int_helper.c
12
--- a/target/s390x/tcg/vec_int_helper.c
12
+++ b/target/s390x/tcg/vec_int_helper.c
13
+++ b/target/s390x/tcg/vec_int_helper.c
...
...
46
- s390_vec_shr(&vb, &vb, 1);
47
- s390_vec_shr(&vb, &vb, 1);
47
- }
48
- }
48
- return res;
49
- return res;
49
-}
50
-}
50
-
51
-
51
static Int128 do_gfm8(Int128 n, Int128 m)
52
/*
52
{
53
* There is no carry across the two doublewords, so their order does
53
Int128 e = clmul_8x8_even(n, m);
54
* not matter. Nor is there partial overlap between registers.
54
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
55
@@ -XXX,XX +XXX,XX @@ void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
55
*(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
56
}
57
58
+static Int128 do_gfm64(Int128 n, Int128 m)
59
+{
60
+ /*
61
+ * The two 64-bit halves are treated identically,
62
+ * therefore host ordering does not matter.
63
+ */
64
+ Int128 e = clmul_64(int128_getlo(n), int128_getlo(m));
65
+ Int128 o = clmul_64(int128_gethi(n), int128_gethi(m));
66
+ return int128_xor(e, o);
67
+}
68
+
69
void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
56
void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
70
uint32_t desc)
57
uint32_t desc)
71
{
58
{
72
- S390Vector tmp1, tmp2;
59
- S390Vector tmp1, tmp2;
73
- uint64_t a, b;
60
- uint64_t a, b;
74
-
61
+ uint64_t *q1 = v1;
62
+ const uint64_t *q2 = v2, *q3 = v3;
63
+ Int128 r;
64
75
- a = s390_vec_read_element64(v2, 0);
65
- a = s390_vec_read_element64(v2, 0);
76
- b = s390_vec_read_element64(v3, 0);
66
- b = s390_vec_read_element64(v3, 0);
77
- tmp1 = galois_multiply64(a, b);
67
- tmp1 = galois_multiply64(a, b);
78
- a = s390_vec_read_element64(v2, 1);
68
- a = s390_vec_read_element64(v2, 1);
79
- b = s390_vec_read_element64(v3, 1);
69
- b = s390_vec_read_element64(v3, 1);
80
- tmp2 = galois_multiply64(a, b);
70
- tmp2 = galois_multiply64(a, b);
81
- s390_vec_xor(v1, &tmp1, &tmp2);
71
- s390_vec_xor(v1, &tmp1, &tmp2);
82
+ *(Int128 *)v1 = do_gfm64(*(const Int128 *)v2, *(const Int128 *)v3);
72
+ r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1]));
73
+ q1[0] = int128_gethi(r);
74
+ q1[1] = int128_getlo(r);
83
}
75
}
84
76
85
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
77
void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
86
const void *v4, uint32_t desc)
78
const void *v4, uint32_t desc)
87
{
79
{
88
- S390Vector tmp1, tmp2;
80
- S390Vector tmp1, tmp2;
89
- uint64_t a, b;
81
- uint64_t a, b;
90
-
82
+ uint64_t *q1 = v1;
83
+ const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
84
+ Int128 r;
85
91
- a = s390_vec_read_element64(v2, 0);
86
- a = s390_vec_read_element64(v2, 0);
92
- b = s390_vec_read_element64(v3, 0);
87
- b = s390_vec_read_element64(v3, 0);
93
- tmp1 = galois_multiply64(a, b);
88
- tmp1 = galois_multiply64(a, b);
94
- a = s390_vec_read_element64(v2, 1);
89
- a = s390_vec_read_element64(v2, 1);
95
- b = s390_vec_read_element64(v3, 1);
90
- b = s390_vec_read_element64(v3, 1);
96
- tmp2 = galois_multiply64(a, b);
91
- tmp2 = galois_multiply64(a, b);
97
- s390_vec_xor(&tmp1, &tmp1, &tmp2);
92
- s390_vec_xor(&tmp1, &tmp1, &tmp2);
98
- s390_vec_xor(v1, &tmp1, v4);
93
- s390_vec_xor(v1, &tmp1, v4);
99
+ Int128 r = do_gfm64(*(const Int128 *)v2, *(const Int128 *)v3);
94
+ r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1]));
100
+ *(Int128 *)v1 = int128_xor(r, *(Int128 *)v4);
95
+ q1[0] = q4[0] ^ int128_gethi(r);
96
+ q1[1] = q4[1] ^ int128_getlo(r);
101
}
97
}
102
98
103
#define DEF_VMAL(BITS) \
99
#define DEF_VMAL(BITS) \
104
--
100
--
105
2.34.1
101
2.34.1
102
103
1
Use generic routine for 64-bit carry-less multiply.
1
Use generic routine for 64-bit carry-less multiply.
2
2
3
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
3
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
4
---
5
---
5
target/ppc/int_helper.c | 17 +++--------------
6
target/ppc/int_helper.c | 17 +++--------------
6
1 file changed, 3 insertions(+), 14 deletions(-)
7
1 file changed, 3 insertions(+), 14 deletions(-)
7
8
...
...
33
}
34
}
34
35
35
#if HOST_BIG_ENDIAN
36
#if HOST_BIG_ENDIAN
36
--
37
--
37
2.34.1
38
2.34.1
39
40

Detect PCLMUL in cpuinfo; implement the accel hook.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/i386/host/cpuinfo.h        |  1 +
 host/include/i386/host/crypto/clmul.h   | 29 +++++++++++++++++++++++
 host/include/x86_64/host/crypto/clmul.h |  1 +
 include/qemu/cpuid.h                    |  3 +++
 util/cpuinfo-i386.c                     |  1 +
 5 files changed, 35 insertions(+)
 create mode 100644 host/include/i386/host/crypto/clmul.h
 create mode 100644 host/include/x86_64/host/crypto/clmul.h

(The previous revision, titled "Detect PCLMUL in cpuinfo; implement the accel
hooks.", carried a 187-line clmul.h covering the 16-, 32- and 64-bit widths,
for 4 files changed and 190 insertions, and did not touch include/qemu/cpuid.h.)

diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
index XXXXXXX..XXXXXXX 100644
...
+# define HAVE_CLMUL_ACCEL  likely(cpuinfo & CPUINFO_PCLMUL)
+# define ATTR_CLMUL_ACCEL  __attribute__((target("pclmul")))
+#endif
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_64_accel(uint64_t n, uint64_t m)
+{
+    union { __m128i v; Int128 s; } u;
+
+    u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0);
+    return u.s;
+}
+
+#endif /* X86_HOST_CRYPTO_CLMUL_H */

The previous revision named this function clmul_64() and tested
HAVE_CLMUL_ACCEL inside it, falling back to clmul_64_gen() at run time.
It also implemented the 16- and 32-bit variants with PCLMULQDQ, deferring
only the 8-bit forms to the generic code:

+static inline uint64_t ATTR_CLMUL_ACCEL
+clmul_32(uint32_t n, uint32_t m)
+{
+    __m128i r;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_32_gen(n, m);
+    }
+
+    r = _mm_clmulepi64_si128(_mm_cvtsi32_si128(n), _mm_cvtsi32_si128(m), 0);
+    return ((__v2di)r)[0];
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_32x2_even(Int128 n, Int128 m)
+{
+    union { __m128i v; Int128 s; } ur, un, um;
+    __m128i n02, m02, r0, r2;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_32x2_even_gen(n, m);
+    }
+
+    un.s = n;
+    um.s = m;
+    n02 = _mm_slli_epi64(un.v, 32);
+    m02 = _mm_slli_epi64(um.v, 32);
+    r0 = _mm_clmulepi64_si128(n02, m02, 0x00);
+    r2 = _mm_clmulepi64_si128(n02, m02, 0x11);
+    ur.v = _mm_unpackhi_epi64(r0, r2);
+    return ur.s;
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_32x2_odd(Int128 n, Int128 m)
+{
+    union { __m128i v; Int128 s; } ur, un, um;
+    __m128i n13, m13, r1, r3;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_32x2_odd_gen(n, m);
+    }
+
+    un.s = n;
+    um.s = m;
+    n13 = _mm_srli_epi64(un.v, 32);
+    m13 = _mm_srli_epi64(um.v, 32);
+    r1 = _mm_clmulepi64_si128(n13, m13, 0x00);
+    r3 = _mm_clmulepi64_si128(n13, m13, 0x11);
+    ur.v = _mm_unpacklo_epi64(r1, r3);
+    return ur.s;
+}
+
+static inline uint64_t ATTR_CLMUL_ACCEL
+clmul_16x2_even(uint64_t n, uint64_t m)
+{
+    __m128i r0, r2;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_16x2_even_gen(n, m);
+    }
+
+    r0 = _mm_clmulepi64_si128(_mm_cvtsi32_si128(n & 0xffff),
+                              _mm_cvtsi32_si128(m & 0xffff), 0);
+    r2 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 32) & 0xffff),
+                              _mm_cvtsi32_si128((m >> 32) & 0xffff), 0);
+    r0 = _mm_unpacklo_epi32(r0, r2);
+    return ((__v2di)r0)[0];
+}
+
+static inline uint64_t ATTR_CLMUL_ACCEL
+clmul_16x2_odd(uint64_t n, uint64_t m)
+{
+    __m128i r1, r3;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_16x2_odd_gen(n, m);
+    }
+
+    r1 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 16) & 0xffff),
+                              _mm_cvtsi32_si128((m >> 16) & 0xffff), 0);
+    r3 = _mm_clmulepi64_si128(_mm_cvtsi32_si128((n >> 48) & 0xffff),
+                              _mm_cvtsi32_si128((m >> 48) & 0xffff), 0);
+    r1 = _mm_unpacklo_epi32(r1, r3);
+    return ((__v2di)r1)[0];
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_16x4_even(Int128 n, Int128 m)
+{
+    union { __m128i v; Int128 s; } ur, un, um;
+    __m128i mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1);
+    __m128i n04, m04, n26, m26, r0, r2, r4, r6;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_16x4_even_gen(n, m);
+    }
+
+    un.s = n;
+    um.s = m;
+    n04 = _mm_and_si128(un.v, mask);
+    m04 = _mm_and_si128(um.v, mask);
+    r0 = _mm_clmulepi64_si128(n04, m04, 0x00);
+    r4 = _mm_clmulepi64_si128(n04, m04, 0x11);
+    n26 = _mm_and_si128(_mm_srli_epi64(un.v, 32), mask);
+    m26 = _mm_and_si128(_mm_srli_epi64(um.v, 32), mask);
+    r2 = _mm_clmulepi64_si128(n26, m26, 0x00);
+    r6 = _mm_clmulepi64_si128(n26, m26, 0x11);
+
+    r0 = _mm_unpacklo_epi32(r0, r2);
+    r4 = _mm_unpacklo_epi32(r4, r6);
+    ur.v = _mm_unpacklo_epi64(r0, r4);
+    return ur.s;
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_16x4_odd(Int128 n, Int128 m)
+{
+    union { __m128i v; Int128 s; } ur, un, um;
+    __m128i mask = _mm_set_epi16(0, 0, 0, -1, 0, 0, 0, -1);
+    __m128i n15, m15, n37, m37, r1, r3, r5, r7;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_16x4_odd_gen(n, m);
+    }
+
+    un.s = n;
+    um.s = m;
+    n15 = _mm_and_si128(_mm_srli_epi64(un.v, 16), mask);
+    m15 = _mm_and_si128(_mm_srli_epi64(um.v, 16), mask);
+    r1 = _mm_clmulepi64_si128(n15, m15, 0x00);
+    r5 = _mm_clmulepi64_si128(n15, m15, 0x11);
+    n37 = _mm_srli_epi64(un.v, 48);
+    m37 = _mm_srli_epi64(um.v, 48);
+    r3 = _mm_clmulepi64_si128(n37, m37, 0x00);
+    r7 = _mm_clmulepi64_si128(n37, m37, 0x11);
+
+    r1 = _mm_unpacklo_epi32(r1, r3);
+    r5 = _mm_unpacklo_epi32(r5, r7);
+    ur.v = _mm_unpacklo_epi64(r1, r5);
+    return ur.s;
+}
+
+/*
+ * Defer everything else to the generic routines.
+ * We could implement them with even more element manipulation.
+ */
+#define clmul_8x8_low           clmul_8x8_low_gen
+#define clmul_8x4_even          clmul_8x4_even_gen
+#define clmul_8x4_odd           clmul_8x4_odd_gen
+#define clmul_8x8_even          clmul_8x8_even_gen
+#define clmul_8x8_odd           clmul_8x8_odd_gen
+#define clmul_8x8_packed        clmul_8x8_packed_gen
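
With the rename to clmul_64_accel(), the run-time HAVE_CLMUL_ACCEL test moves
out of this header; presumably a generic wrapper selects between the
accelerated and generic routines along these lines (a sketch built only from
names that appear in this series; the actual include/crypto/clmul.h is not
shown in this hunk and may be shaped differently):

/*
 * Hypothetical dispatch between the host-accelerated and generic routines.
 */
static inline Int128 clmul_64(uint64_t a, uint64_t b)
{
    if (HAVE_CLMUL_ACCEL) {
        return clmul_64_accel(a, b);
    }
    return clmul_64_gen(a, b);
}
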
diff --git a/host/include/x86_64/host/crypto/clmul.h b/host/include/x86_64/host/crypto/clmul.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/host/include/x86_64/host/crypto/clmul.h
@@ -0,0 +1 @@
+#include "host/include/i386/host/crypto/clmul.h"
diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/cpuid.h
+++ b/include/qemu/cpuid.h
@@ -XXX,XX +XXX,XX @@
 #endif
 
 /* Leaf 1, %ecx */
+#ifndef bit_PCLMUL
+#define bit_PCLMUL      (1 << 1)
+#endif
 #ifndef bit_SSE4_1
 #define bit_SSE4_1      (1 << 19)
 #endif
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
index XXXXXXX..XXXXXXX 100644
--- a/util/cpuinfo-i386.c
+++ b/util/cpuinfo-i386.c
@@ -XXX,XX +XXX,XX @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
     info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0);
     info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
     info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
+    info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0);
 
     /* Our AES support requires PSHUFB as well. */
     info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0);

(The previous revision had no include/qemu/cpuid.h hunk and tested
bit_PCLMULQDQ here instead of bit_PCLMUL.)

--
2.34.1
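
For illustration only, not QEMU code: the two hunks above amount to "ask CPUID
leaf 1 whether ECX bit 1 (PCLMULQDQ) is set, and only then use the intrinsic".
The standalone program below shows the same pattern and checks the intrinsic
against a bitwise reference; all function names are made up for the example,
and with a reasonably recent GCC or Clang no extra -m flags are needed because
only the attributed function uses the intrinsic.

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>
#include <wmmintrin.h>          /* _mm_clmulepi64_si128 */

#ifndef bit_PCLMUL
#define bit_PCLMUL (1 << 1)     /* CPUID leaf 1, %ecx */
#endif

/* Bitwise 64x64->128 carry-less multiply, for comparison. */
static unsigned __int128 clmul_ref(uint64_t n, uint64_t m)
{
    unsigned __int128 r = 0;
    for (int i = 0; i < 64; i++) {
        if ((m >> i) & 1) {
            r ^= (unsigned __int128)n << i;
        }
    }
    return r;
}

__attribute__((target("pclmul")))
static unsigned __int128 clmul_pclmul(uint64_t n, uint64_t m)
{
    union { __m128i v; unsigned __int128 s; } u;

    /* Selector 0: multiply the low 64-bit lanes of each operand. */
    u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0);
    return u.s;
}

int main(void)
{
    unsigned a, b, c, d;
    uint64_t n = 0x9e3779b97f4a7c15ull, m = 0xc2b2ae3d27d4eb4full;

    if (__get_cpuid(1, &a, &b, &c, &d) && (c & bit_PCLMUL)) {
        printf("pclmulqdq %s the bitwise reference\n",
               clmul_pclmul(n, m) == clmul_ref(n, m) ? "matches"
                                                     : "DIFFERS from");
    } else {
        puts("PCLMULQDQ not available; a generic path would be used");
    }
    return 0;
}
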

Detect PMULL in cpuinfo; implement the accel hook.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 host/include/aarch64/host/cpuinfo.h      |  1 +
 host/include/aarch64/host/crypto/clmul.h | 41 ++++++++++++++++++++++++
 util/cpuinfo-aarch64.c                   |  4 ++-
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 host/include/aarch64/host/crypto/clmul.h

(The previous revision, titled "Detect PMULL in cpuinfo; implement the accel
hooks.", carried a 230-line clmul.h implementing the 8-, 16-, 32- and 64-bit
forms, for 234 insertions, and had not yet collected the review tags.)

diff --git a/host/include/aarch64/host/cpuinfo.h b/host/include/aarch64/host/cpuinfo.h
index XXXXXXX..XXXXXXX 100644
--- a/host/include/aarch64/host/cpuinfo.h
...
+#define AARCH64_HOST_CRYPTO_CLMUL_H
+
+#include "host/cpuinfo.h"
+#include <arm_neon.h>
+
+/*
+ * 64x64->128 pmull is available with FEAT_PMULL.
+ * Both FEAT_AES and FEAT_PMULL are covered under the same macro.
+ */
+#ifdef __ARM_FEATURE_AES
+# define HAVE_CLMUL_ACCEL  true
+#else
+# define HAVE_CLMUL_ACCEL  likely(cpuinfo & CPUINFO_PMULL)
+#endif
+#if !defined(__ARM_FEATURE_AES) && defined(CONFIG_ARM_AES_BUILTIN)
+# define ATTR_CLMUL_ACCEL  __attribute__((target("+crypto")))
+#else
+# define ATTR_CLMUL_ACCEL
+#endif
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_64_accel(uint64_t n, uint64_t m)
+{
+    union { poly128_t v; Int128 s; } u;
+
+#ifdef CONFIG_ARM_AES_BUILTIN
+    u.v = vmull_p64((poly64_t)n, (poly64_t)m);
+#else
+    asm(".arch_extension aes\n\t"
+        "pmull %0.1q, %1.1d, %2.1d" : "=w"(u.v) : "w"(n), "w"(m));
+#endif
+    return u.s;
+}
+
+#endif /* AARCH64_HOST_CRYPTO_CLMUL_H */

Besides naming the 64-bit function clmul_64() and testing HAVE_CLMUL_ACCEL
inside it (falling back to clmul_64_gen() at run time), the previous revision
also implemented the 8-, 16- and 32-bit forms with NEON:

+/*
+ * The 8x8->8 pmul and 8x8->16 pmull are available unconditionally.
+ */
+
+static inline uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
+{
+    return (uint64_t)vmul_p8((poly8x8_t)n, (poly8x8_t)m);
+}
+
+static inline Int128 clmul_8x8_packed(uint64_t n, uint64_t m)
+{
+    union { poly16x8_t v; Int128 s; } u;
+    u.v = vmull_p8((poly8x8_t)n, (poly8x8_t)m);
+    return u.s;
+}
+
+static inline Int128 clmul_8x8_even(Int128 n, Int128 m)
+{
+    union { uint16x8_t v; Int128 s; } un, um;
+    uint8x8_t pn, pm;
+
+    un.s = n;
+    um.s = m;
+    pn = vmovn_u16(un.v);
+    pm = vmovn_u16(um.v);
+    return clmul_8x8_packed((uint64_t)pn, (uint64_t)pm);
+}
+
+static inline Int128 clmul_8x8_odd(Int128 n, Int128 m)
+{
+    union { uint8x16_t v; Int128 s; } un, um;
+    uint8x8_t pn, pm;
+
+    un.s = n;
+    um.s = m;
+    pn = vqtbl1_u8(un.v, (uint8x8_t){ 1, 3, 5, 7, 9, 11, 13, 15 });
+    pm = vqtbl1_u8(um.v, (uint8x8_t){ 1, 3, 5, 7, 9, 11, 13, 15 });
+    return clmul_8x8_packed((uint64_t)pn, (uint64_t)pm);
+}
+
+static inline uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
+{
+    return int128_getlo(clmul_8x8_even(int128_make64(n), int128_make64(m)));
+}
+
+static inline uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
+{
+    return int128_getlo(clmul_8x8_odd(int128_make64(n), int128_make64(m)));
+}
+
+static inline Int128 clmul_16x4_packed_accel(uint16x4_t n, uint16x4_t m)
+{
+    union { uint32x4_t v; Int128 s; } u;
+    uint32x4_t r0, r1, r2;
+
+    /*
+     * Considering the per-byte multiplication:
+     *        ab
+     *        cd
+     *     -----
+     *        bd << 0
+     *       bc << 8
+     *       ad << 8
+     *      ac << 16
+     *
+     * We get the ac and bd rows of the result for free from the expanding
+     * packed multiply.  Reverse the two bytes in M, repeat, and we get the
+     * ad and bc results, but in the wrong column; shift to fix and sum all.
+     */
+    r0 = (uint32x4_t)vmull_p8((poly8x8_t)n, (poly8x8_t)m);
+    r1 = (uint32x4_t)vmull_p8((poly8x8_t)n, vrev16_p8((poly8x8_t)m));
+    r2 = r1 << 8;   /* bc */
+    r1 = r1 >> 8;   /* ad */
+    r1 &= (uint32x4_t){ 0x00ffff00, 0x00ffff00, 0x00ffff00, 0x00ffff00 };
+    r2 &= (uint32x4_t){ 0x00ffff00, 0x00ffff00, 0x00ffff00, 0x00ffff00 };
+    r0 = r0 ^ r1 ^ r2;
+
+    u.v = r0;
+    return u.s;
+}
+
+static inline Int128 clmul_16x4_even(Int128 n, Int128 m)
+{
+    union { uint32x4_t v; Int128 s; } um, un;
+    uint16x4_t pn, pm;
+
+    /* Extract even uint16_t. */
+    un.s = n;
+    um.s = m;
+    pn = vmovn_u32(un.v);
+    pm = vmovn_u32(um.v);
+    return clmul_16x4_packed_accel(pn, pm);
+}
+
+static inline Int128 clmul_16x4_odd(Int128 n, Int128 m)
+{
+    union { uint8x16_t v; Int128 s; } um, un;
+    uint16x4_t pn, pm;
+
+    /* Extract odd uint16_t. */
+    un.s = n;
+    um.s = m;
+    pn = (uint16x4_t)vqtbl1_u8(un.v, (uint8x8_t){ 2, 3, 6, 7, 10, 11, 14, 15 });
+    pm = (uint16x4_t)vqtbl1_u8(um.v, (uint8x8_t){ 2, 3, 6, 7, 10, 11, 14, 15 });
+    return clmul_16x4_packed_accel(pn, pm);
+}
+
+static inline uint64_t clmul_16x2_even(uint64_t n, uint64_t m)
+{
+    return int128_getlo(clmul_16x4_even(int128_make64(n), int128_make64(m)));
+}
+
+static inline uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
+{
+    return int128_getlo(clmul_16x4_odd(int128_make64(n), int128_make64(m)));
+}
+
+static inline uint64_t ATTR_CLMUL_ACCEL
+clmul_32(uint32_t n, uint32_t m)
+{
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_32_gen(n, m);
+    }
+    return int128_getlo(clmul_64(n, m));
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_32x2_even(Int128 n, Int128 m)
+{
+    union { uint64x2_t v; poly64_t h; Int128 s; } um, un, ur;
+    uint64x2_t r0, r2;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_32x2_even_gen(n, m);
+    }
+
+    un.s = n;
+    um.s = m;
+    un.v &= (uint64x2_t){ 0xffffffffu, 0xffffffffu };
+    um.v &= (uint64x2_t){ 0xffffffffu, 0xffffffffu };
+
+#ifdef CONFIG_ARM_AES_BUILTIN
+    r0 = (uint64x2_t)vmull_p64(un.h, um.h);
+    r2 = (uint64x2_t)vmull_high_p64((poly64x2_t)un.v, (poly64x2_t)um.v);
+#else
+    asm(".arch_extension aes\n\t"
+        "pmull %0.1q, %2.1d, %3.1d\n\t"
+        "pmull2 %1.1q, %2.2d, %3.2d"
+        : "=&w"(r0), "=w"(r2) : "w"(un.v), "w"(um.v));
+#endif
+
+    ur.v = vzip1q_u64(r0, r2);
+    return ur.s;
+}
+
+static inline Int128 ATTR_CLMUL_ACCEL
+clmul_32x2_odd(Int128 n, Int128 m)
+{
+    union { uint64x2_t v; poly64_t h; Int128 s; } um, un, ur;
+    uint64x2_t r0, r2;
+
+    if (!HAVE_CLMUL_ACCEL) {
+        return clmul_32x2_odd_gen(n, m);
+    }
+
+    un.s = n;
+    um.s = m;
+    un.v &= (uint64x2_t){ 0xffffffff00000000ull, 0xffffffff00000000ull };
+    um.v &= (uint64x2_t){ 0xffffffff00000000ull, 0xffffffff00000000ull };
+
+#ifdef CONFIG_ARM_AES_BUILTIN
+    r0 = (uint64x2_t)vmull_p64(un.h, um.h);
+    r2 = (uint64x2_t)vmull_high_p64((poly64x2_t)un.v, (poly64x2_t)um.v);
+#else
+    asm(".arch_extension aes\n\t"
+        "pmull %0.1q, %2.1d, %3.1d\n\t"
+        "pmull2 %1.1q, %2.2d, %3.2d"
+        : "=&w"(r0), "=w"(r2) : "w"(un.v), "w"(um.v));
+#endif
+
+    ur.v = vzip2q_u64(r0, r2);
+    return ur.s;
+}

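For illustration only, not part of the patch: on a host with FEAT_PMULL, the
vmull_p64() path can be sanity-checked against the same bitwise reference used
earlier; the function names below are made up for the example, and the file is
assumed to be built with -march=armv8-a+crypto on an aarch64 machine.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise 64x64->128 carry-less multiply, for comparison. */
static unsigned __int128 clmul_ref(uint64_t n, uint64_t m)
{
    unsigned __int128 r = 0;
    for (int i = 0; i < 64; i++) {
        if ((m >> i) & 1) {
            r ^= (unsigned __int128)n << i;
        }
    }
    return r;
}

int main(void)
{
    uint64_t n = 0x0123456789abcdefull, m = 0xfedcba9876543210ull;
    union { poly128_t v; unsigned __int128 s; } u;

    /* 64x64->128 polynomial multiply via FEAT_PMULL. */
    u.v = vmull_p64((poly64_t)n, (poly64_t)m);
    printf("pmull %s the bitwise reference\n",
           u.s == clmul_ref(n, m) ? "matches" : "DIFFERS from");
    return 0;
}
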
diff --git a/util/cpuinfo-aarch64.c b/util/cpuinfo-aarch64.c
index XXXXXXX..XXXXXXX 100644
--- a/util/cpuinfo-aarch64.c
...
 #endif
 
     cpuinfo = info;
--
2.34.1
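
The elided cpuinfo-aarch64.c hunk is what sets the new CPUINFO_PMULL bit at
startup.  As a standalone, Linux-specific sketch of the equivalent user-space
check (presumably what that hunk consults on Linux, though the hunk itself is
not shown here):

#include <stdio.h>
#include <sys/auxv.h>           /* getauxval, AT_HWCAP */
#include <asm/hwcap.h>          /* HWCAP_PMULL (aarch64) */

int main(void)
{
    unsigned long hwcap = getauxval(AT_HWCAP);

    printf("FEAT_PMULL %s\n",
           (hwcap & HWCAP_PMULL) ? "present" : "absent");
    return 0;
}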