Line data Source code
1 :
2 : // Source originally from https://github.com/BLAKE3-team/BLAKE3
3 : // From commit: 80b83effbd50425939483e5503b186db4dac4d9d
4 :
5 : #include "blake3_impl.h"
6 :
7 : #include <immintrin.h>
8 :
9 : #define _mm_shuffle_ps2(a, b, c) \
10 415643520 : (_mm_castps_si128( \
11 415643520 : _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
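        : // _mm_shuffle_ps2 reinterprets two integer vectors as floats so that SHUFPS
        : // can gather two 32-bit words from each operand: the low two words of the
        : // result come from `a` and the high two from `b`, selected by the
        : // _MM_SHUFFLE immediate. For example, _mm_shuffle_ps2(m0, m1,
        : // _MM_SHUFFLE(2, 0, 2, 0)) below yields {m0[0], m0[2], m1[0], m1[2]}.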
12 :
13 190816016 : INLINE __m128i loadu_128(const uint8_t src[16]) {
14 190816016 : return _mm_loadu_si128((void*)src);
15 190816016 : }
16 :
17 9002528 : INLINE __m256i loadu_256(const uint8_t src[32]) {
18 9002528 : return _mm256_loadu_si256((void*)src);
19 9002528 : }
20 :
21 44367664 : INLINE __m512i loadu_512(const uint8_t src[64]) {
22 44367664 : return _mm512_loadu_si512((void*)src);
23 44367664 : }
24 :
25 69959264 : INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
26 69959264 : _mm_storeu_si128((void*)dest, src);
27 69959264 : }
28 :
29 529264 : INLINE void storeu_256(__m256i src, uint8_t dest[32]) {
30 529264 : _mm256_storeu_si256((void*)dest, src);
31 529264 : }
32 :
33 9760 : INLINE void storeu_512(__m512i src, uint8_t dest[64]) {
34 9760 : _mm512_storeu_si512((void*)dest, src);
35 9760 : }
36 :
37 2560227264 : INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
38 :
39 189053088 : INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
40 :
41 931925904 : INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }
42 :
43 1784720960 : INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
44 :
45 130536656 : INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
46 :
47 643477528 : INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }
48 :
49 7810596 : INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
50 :
51 3905212 : INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
52 :
53 21409198 : INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); }
54 :
55 51955440 : INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
56 51955440 : return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
57 51955440 : }
58 :
59 426704544 : INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); }
60 :
61 31508848 : INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); }
62 :
63 155320984 : INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); }
64 :
65 426704544 : INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }
66 :
67 31508848 : INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }
68 :
69 155320984 : INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }
70 :
71 426704544 : INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }
72 :
73 31508848 : INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }
74 :
75 155320984 : INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }
76 :
77 426704544 : INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }
78 :
79 31508848 : INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }
80 :
81 155320984 : INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }
82 :
83 : /*
84 : * ----------------------------------------------------------------------------
85 : * compress_avx512
86 : * ----------------------------------------------------------------------------
87 : */
88 :
89 : INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
90 363688080 : __m128i m) {
91 363688080 : *row0 = add_128(add_128(*row0, m), *row1);
92 363688080 : *row3 = xor_128(*row3, *row0);
93 363688080 : *row3 = rot16_128(*row3);
94 363688080 : *row2 = add_128(*row2, *row3);
95 363688080 : *row1 = xor_128(*row1, *row2);
96 363688080 : *row1 = rot12_128(*row1);
97 363688080 : }
98 :
99 : INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
100 363688080 : __m128i m) {
101 363688080 : *row0 = add_128(add_128(*row0, m), *row1);
102 363688080 : *row3 = xor_128(*row3, *row0);
103 363688080 : *row3 = rot8_128(*row3);
104 363688080 : *row2 = add_128(*row2, *row3);
105 363688080 : *row1 = xor_128(*row1, *row2);
106 363688080 : *row1 = rot7_128(*row1);
107 363688080 : }
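        : // For reference, g1 and g2 together apply one full BLAKE3 G function to four
        : // columns (or diagonals) at once, with the 16/12-bit rotations in g1 and the
        : // 8/7-bit rotations in g2. A scalar sketch of one G application, assuming a
        : // 32-bit right-rotate helper rotr32():
        : //
        : //   a += b + mx;  d = rotr32(d ^ a, 16);
        : //   c += d;       b = rotr32(b ^ c, 12);
        : //   a += b + my;  d = rotr32(d ^ a, 8);
        : //   c += d;       b = rotr32(b ^ c, 7);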
108 :
109 : // Note the optimization here of leaving row1 as the unrotated row, rather than
110 : // row0. All the message loads below are adjusted to compensate for this. See
111 : // discussion at https://github.com/sneves/blake2-avx2/pull/4
112 181844040 : INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
113 181844040 : *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
114 181844040 : *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
115 181844040 : *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
116 181844040 : }
117 :
118 181844040 : INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
119 181844040 : *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
120 181844040 : *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
121 181844040 : *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
122 181844040 : }
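        : // Viewing the state as a 4x4 matrix of words (one row per __m128i): in
        : // compress_pre() below, each round first runs g1/g2 on the four columns
        : //   (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15),
        : // then diagonalize() rotates rows 0, 2, and 3 so a second g1/g2 pass mixes
        : // the four diagonals
        : //   (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14);
        : // undiagonalize() restores the column layout for the following round.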
123 :
124 : INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
125 : const uint8_t block[BLAKE3_BLOCK_LEN],
126 25977720 : uint8_t block_len, uint64_t counter, uint8_t flags) {
127 25977720 : rows[0] = loadu_128((uint8_t *)&cv[0]);
128 25977720 : rows[1] = loadu_128((uint8_t *)&cv[4]);
129 25977720 : rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
130 25977720 : rows[3] = set4(counter_low(counter), counter_high(counter),
131 25977720 : (uint32_t)block_len, (uint32_t)flags);
132 :
133 25977720 : __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
134 25977720 : __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
135 25977720 : __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
136 25977720 : __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
137 :
138 25977720 : __m128i t0, t1, t2, t3, tt;
139 :
140 : // Round 1. The first round permutes the message words from the original
141 : // input order, into the groups that get mixed in parallel.
142 25977720 : t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
143 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
144 25977720 : t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
145 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
146 25977720 : diagonalize(&rows[0], &rows[2], &rows[3]);
147 25977720 : t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
148 25977720 : t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
149 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
150 25977720 : t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
151 25977720 : t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
152 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
153 25977720 : undiagonalize(&rows[0], &rows[2], &rows[3]);
154 25977720 : m0 = t0;
155 25977720 : m1 = t1;
156 25977720 : m2 = t2;
157 25977720 : m3 = t3;
158 :
159 : // Round 2. This round and all following rounds apply a fixed permutation
160 : // to the message words from the round before.
161 25977720 : t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
162 25977720 : t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
163 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
164 25977720 : t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
165 25977720 : tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
166 25977720 : t1 = _mm_blend_epi16(tt, t1, 0xCC);
167 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
168 25977720 : diagonalize(&rows[0], &rows[2], &rows[3]);
169 25977720 : t2 = _mm_unpacklo_epi64(m3, m1);
170 25977720 : tt = _mm_blend_epi16(t2, m2, 0xC0);
171 25977720 : t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
172 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
173 25977720 : t3 = _mm_unpackhi_epi32(m1, m3);
174 25977720 : tt = _mm_unpacklo_epi32(m2, t3);
175 25977720 : t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
176 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
177 25977720 : undiagonalize(&rows[0], &rows[2], &rows[3]);
178 25977720 : m0 = t0;
179 25977720 : m1 = t1;
180 25977720 : m2 = t2;
181 25977720 : m3 = t3;
182 :
183 : // Round 3
184 25977720 : t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
185 25977720 : t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
186 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
187 25977720 : t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
188 25977720 : tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
189 25977720 : t1 = _mm_blend_epi16(tt, t1, 0xCC);
190 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
191 25977720 : diagonalize(&rows[0], &rows[2], &rows[3]);
192 25977720 : t2 = _mm_unpacklo_epi64(m3, m1);
193 25977720 : tt = _mm_blend_epi16(t2, m2, 0xC0);
194 25977720 : t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
195 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
196 25977720 : t3 = _mm_unpackhi_epi32(m1, m3);
197 25977720 : tt = _mm_unpacklo_epi32(m2, t3);
198 25977720 : t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
199 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
200 25977720 : undiagonalize(&rows[0], &rows[2], &rows[3]);
201 25977720 : m0 = t0;
202 25977720 : m1 = t1;
203 25977720 : m2 = t2;
204 25977720 : m3 = t3;
205 :
206 : // Round 4
207 25977720 : t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
208 25977720 : t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
209 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
210 25977720 : t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
211 25977720 : tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
212 25977720 : t1 = _mm_blend_epi16(tt, t1, 0xCC);
213 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
214 25977720 : diagonalize(&rows[0], &rows[2], &rows[3]);
215 25977720 : t2 = _mm_unpacklo_epi64(m3, m1);
216 25977720 : tt = _mm_blend_epi16(t2, m2, 0xC0);
217 25977720 : t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
218 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
219 25977720 : t3 = _mm_unpackhi_epi32(m1, m3);
220 25977720 : tt = _mm_unpacklo_epi32(m2, t3);
221 25977720 : t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
222 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
223 25977720 : undiagonalize(&rows[0], &rows[2], &rows[3]);
224 25977720 : m0 = t0;
225 25977720 : m1 = t1;
226 25977720 : m2 = t2;
227 25977720 : m3 = t3;
228 :
229 : // Round 5
230 25977720 : t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
231 25977720 : t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
232 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
233 25977720 : t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
234 25977720 : tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
235 25977720 : t1 = _mm_blend_epi16(tt, t1, 0xCC);
236 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
237 25977720 : diagonalize(&rows[0], &rows[2], &rows[3]);
238 25977720 : t2 = _mm_unpacklo_epi64(m3, m1);
239 25977720 : tt = _mm_blend_epi16(t2, m2, 0xC0);
240 25977720 : t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
241 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
242 25977720 : t3 = _mm_unpackhi_epi32(m1, m3);
243 25977720 : tt = _mm_unpacklo_epi32(m2, t3);
244 25977720 : t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
245 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
246 25977720 : undiagonalize(&rows[0], &rows[2], &rows[3]);
247 25977720 : m0 = t0;
248 25977720 : m1 = t1;
249 25977720 : m2 = t2;
250 25977720 : m3 = t3;
251 :
252 : // Round 6
253 25977720 : t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
254 25977720 : t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
255 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
256 25977720 : t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
257 25977720 : tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
258 25977720 : t1 = _mm_blend_epi16(tt, t1, 0xCC);
259 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
260 25977720 : diagonalize(&rows[0], &rows[2], &rows[3]);
261 25977720 : t2 = _mm_unpacklo_epi64(m3, m1);
262 25977720 : tt = _mm_blend_epi16(t2, m2, 0xC0);
263 25977720 : t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
264 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
265 25977720 : t3 = _mm_unpackhi_epi32(m1, m3);
266 25977720 : tt = _mm_unpacklo_epi32(m2, t3);
267 25977720 : t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
268 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
269 25977720 : undiagonalize(&rows[0], &rows[2], &rows[3]);
270 25977720 : m0 = t0;
271 25977720 : m1 = t1;
272 25977720 : m2 = t2;
273 25977720 : m3 = t3;
274 :
275 : // Round 7
276 25977720 : t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
277 25977720 : t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
278 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
279 25977720 : t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
280 25977720 : tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
281 25977720 : t1 = _mm_blend_epi16(tt, t1, 0xCC);
282 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
283 25977720 : diagonalize(&rows[0], &rows[2], &rows[3]);
284 25977720 : t2 = _mm_unpacklo_epi64(m3, m1);
285 25977720 : tt = _mm_blend_epi16(t2, m2, 0xC0);
286 25977720 : t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
287 25977720 : g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
288 25977720 : t3 = _mm_unpackhi_epi32(m1, m3);
289 25977720 : tt = _mm_unpacklo_epi32(m2, t3);
290 25977720 : t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
291 25977720 : g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
292 25977720 : undiagonalize(&rows[0], &rows[2], &rows[3]);
293 25977720 : }
294 :
295 : void blake3_compress_xof_avx512(const uint32_t cv[8],
296 : const uint8_t block[BLAKE3_BLOCK_LEN],
297 : uint8_t block_len, uint64_t counter,
298 8472496 : uint8_t flags, uint8_t out[64]) {
299 8472496 : __m128i rows[4];
300 8472496 : compress_pre(rows, cv, block, block_len, counter, flags);
301 8472496 : storeu_128(xor_128(rows[0], rows[2]), &out[0]);
302 8472496 : storeu_128(xor_128(rows[1], rows[3]), &out[16]);
303 8472496 : storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
304 8472496 : storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
305 8472496 : }
306 :
307 : void blake3_compress_in_place_avx512(uint32_t cv[8],
308 : const uint8_t block[BLAKE3_BLOCK_LEN],
309 : uint8_t block_len, uint64_t counter,
310 17505224 : uint8_t flags) {
311 17505224 : __m128i rows[4];
312 17505224 : compress_pre(rows, cv, block, block_len, counter, flags);
313 17505224 : storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
314 17505224 : storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
315 17505224 : }
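        : // The two entry points above differ only in what they keep from the
        : // compression output: the in-place variant stores rows[0..1] ^ rows[2..3] as
        : // the new 32-byte chaining value, which is exactly bytes 0..31 of the
        : // 64-byte XOF output; the XOF variant additionally emits rows[2..3] xored
        : // with the original cv as bytes 32..63 for extended output.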
316 :
317 : /*
318 : * ----------------------------------------------------------------------------
319 : * hash4_avx512
320 : * ----------------------------------------------------------------------------
321 : */
322 :
323 7877058 : INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
324 7877058 : v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
325 7877058 : v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
326 7877058 : v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
327 7877058 : v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
328 7877058 : v[0] = add_128(v[0], v[4]);
329 7877058 : v[1] = add_128(v[1], v[5]);
330 7877058 : v[2] = add_128(v[2], v[6]);
331 7877058 : v[3] = add_128(v[3], v[7]);
332 7877058 : v[12] = xor_128(v[12], v[0]);
333 7877058 : v[13] = xor_128(v[13], v[1]);
334 7877058 : v[14] = xor_128(v[14], v[2]);
335 7877058 : v[15] = xor_128(v[15], v[3]);
336 7877058 : v[12] = rot16_128(v[12]);
337 7877058 : v[13] = rot16_128(v[13]);
338 7877058 : v[14] = rot16_128(v[14]);
339 7877058 : v[15] = rot16_128(v[15]);
340 7877058 : v[8] = add_128(v[8], v[12]);
341 7877058 : v[9] = add_128(v[9], v[13]);
342 7877058 : v[10] = add_128(v[10], v[14]);
343 7877058 : v[11] = add_128(v[11], v[15]);
344 7877058 : v[4] = xor_128(v[4], v[8]);
345 7877058 : v[5] = xor_128(v[5], v[9]);
346 7877058 : v[6] = xor_128(v[6], v[10]);
347 7877058 : v[7] = xor_128(v[7], v[11]);
348 7877058 : v[4] = rot12_128(v[4]);
349 7877058 : v[5] = rot12_128(v[5]);
350 7877058 : v[6] = rot12_128(v[6]);
351 7877058 : v[7] = rot12_128(v[7]);
352 7877058 : v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
353 7877058 : v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
354 7877058 : v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
355 7877058 : v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
356 7877058 : v[0] = add_128(v[0], v[4]);
357 7877058 : v[1] = add_128(v[1], v[5]);
358 7877058 : v[2] = add_128(v[2], v[6]);
359 7877058 : v[3] = add_128(v[3], v[7]);
360 7877058 : v[12] = xor_128(v[12], v[0]);
361 7877058 : v[13] = xor_128(v[13], v[1]);
362 7877058 : v[14] = xor_128(v[14], v[2]);
363 7877058 : v[15] = xor_128(v[15], v[3]);
364 7877058 : v[12] = rot8_128(v[12]);
365 7877058 : v[13] = rot8_128(v[13]);
366 7877058 : v[14] = rot8_128(v[14]);
367 7877058 : v[15] = rot8_128(v[15]);
368 7877058 : v[8] = add_128(v[8], v[12]);
369 7877058 : v[9] = add_128(v[9], v[13]);
370 7877058 : v[10] = add_128(v[10], v[14]);
371 7877058 : v[11] = add_128(v[11], v[15]);
372 7877058 : v[4] = xor_128(v[4], v[8]);
373 7877058 : v[5] = xor_128(v[5], v[9]);
374 7877058 : v[6] = xor_128(v[6], v[10]);
375 7877058 : v[7] = xor_128(v[7], v[11]);
376 7877058 : v[4] = rot7_128(v[4]);
377 7877058 : v[5] = rot7_128(v[5]);
378 7877058 : v[6] = rot7_128(v[6]);
379 7877058 : v[7] = rot7_128(v[7]);
380 :
381 7877058 : v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
382 7877058 : v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
383 7877058 : v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
384 7877058 : v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
385 7877058 : v[0] = add_128(v[0], v[5]);
386 7877058 : v[1] = add_128(v[1], v[6]);
387 7877058 : v[2] = add_128(v[2], v[7]);
388 7877058 : v[3] = add_128(v[3], v[4]);
389 7877058 : v[15] = xor_128(v[15], v[0]);
390 7877058 : v[12] = xor_128(v[12], v[1]);
391 7877058 : v[13] = xor_128(v[13], v[2]);
392 7877058 : v[14] = xor_128(v[14], v[3]);
393 7877058 : v[15] = rot16_128(v[15]);
394 7877058 : v[12] = rot16_128(v[12]);
395 7877058 : v[13] = rot16_128(v[13]);
396 7877058 : v[14] = rot16_128(v[14]);
397 7877058 : v[10] = add_128(v[10], v[15]);
398 7877058 : v[11] = add_128(v[11], v[12]);
399 7877058 : v[8] = add_128(v[8], v[13]);
400 7877058 : v[9] = add_128(v[9], v[14]);
401 7877058 : v[5] = xor_128(v[5], v[10]);
402 7877058 : v[6] = xor_128(v[6], v[11]);
403 7877058 : v[7] = xor_128(v[7], v[8]);
404 7877058 : v[4] = xor_128(v[4], v[9]);
405 7877058 : v[5] = rot12_128(v[5]);
406 7877058 : v[6] = rot12_128(v[6]);
407 7877058 : v[7] = rot12_128(v[7]);
408 7877058 : v[4] = rot12_128(v[4]);
409 7877058 : v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
410 7877058 : v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
411 7877058 : v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
412 7877058 : v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
413 7877058 : v[0] = add_128(v[0], v[5]);
414 7877058 : v[1] = add_128(v[1], v[6]);
415 7877058 : v[2] = add_128(v[2], v[7]);
416 7877058 : v[3] = add_128(v[3], v[4]);
417 7877058 : v[15] = xor_128(v[15], v[0]);
418 7877058 : v[12] = xor_128(v[12], v[1]);
419 7877058 : v[13] = xor_128(v[13], v[2]);
420 7877058 : v[14] = xor_128(v[14], v[3]);
421 7877058 : v[15] = rot8_128(v[15]);
422 7877058 : v[12] = rot8_128(v[12]);
423 7877058 : v[13] = rot8_128(v[13]);
424 7877058 : v[14] = rot8_128(v[14]);
425 7877058 : v[10] = add_128(v[10], v[15]);
426 7877058 : v[11] = add_128(v[11], v[12]);
427 7877058 : v[8] = add_128(v[8], v[13]);
428 7877058 : v[9] = add_128(v[9], v[14]);
429 7877058 : v[5] = xor_128(v[5], v[10]);
430 7877058 : v[6] = xor_128(v[6], v[11]);
431 7877058 : v[7] = xor_128(v[7], v[8]);
432 7877058 : v[4] = xor_128(v[4], v[9]);
433 7877058 : v[5] = rot7_128(v[5]);
434 7877058 : v[6] = rot7_128(v[6]);
435 7877058 : v[7] = rot7_128(v[7]);
436 7877058 : v[4] = rot7_128(v[4]);
437 7877058 : }
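        : // round_fn4 runs one full BLAKE3 round for four independent inputs at once:
        : // each v[i] holds word i of all four states, so every add/xor/rot mixes the
        : // same word position across lanes. The first half above is the column step
        : // (G on v0/v4/v8/v12 ... v3/v7/v11/v15); the second half is the diagonal
        : // step (G on v0/v5/v10/v15 ... v3/v4/v9/v14), done by renaming indices
        : // rather than by shuffling registers as compress_pre() does.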
438 :
439 4765884 : INLINE void transpose_vecs_128(__m128i vecs[4]) {
440 : // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
441 : // 22/33. Note that, unlike the AVX2 counterparts, these unpacks are not
442 : // split into two independent 128-bit lanes.
443 4765884 : __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
444 4765884 : __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
445 4765884 : __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
446 4765884 : __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
447 :
448 : // Interleave 64-bit lanes.
449 4765884 : __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
450 4765884 : __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
451 4765884 : __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
452 4765884 : __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
453 :
454 4765884 : vecs[0] = abcd_0;
455 4765884 : vecs[1] = abcd_1;
456 4765884 : vecs[2] = abcd_2;
457 4765884 : vecs[3] = abcd_3;
458 4765884 : }
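        : // Worked example: with vecs = {a, b, c, d}, a = [a0 a1 a2 a3] and so on, the
        : // 32-bit unpacks produce [a0 b0 a1 b1], [a2 b2 a3 b3], [c0 d0 c1 d1] and
        : // [c2 d2 c3 d3], and the 64-bit unpacks combine them into the transposed
        : // rows [a0 b0 c0 d0], [a1 b1 c1 d1], [a2 b2 c2 d2], [a3 b3 c3 d3].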
459 :
460 : INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
461 1125294 : size_t block_offset, __m128i out[16]) {
462 1125294 : out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
463 1125294 : out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
464 1125294 : out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
465 1125294 : out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
466 1125294 : out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
467 1125294 : out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
468 1125294 : out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
469 1125294 : out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
470 1125294 : out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
471 1125294 : out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
472 1125294 : out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
473 1125294 : out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
474 1125294 : out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
475 1125294 : out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
476 1125294 : out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
477 1125294 : out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
478 5626470 : for (size_t i = 0; i < 4; ++i) {
479 4501176 : _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
480 4501176 : }
481 1125294 : transpose_vecs_128(&out[0]);
482 1125294 : transpose_vecs_128(&out[4]);
483 1125294 : transpose_vecs_128(&out[8]);
484 1125294 : transpose_vecs_128(&out[12]);
485 1125294 : }
486 :
487 : INLINE void load_counters4(uint64_t counter, bool increment_counter,
488 132354 : __m128i *out_lo, __m128i *out_hi) {
489 132354 : int64_t mask = (increment_counter ? ~0 : 0);
490 132354 : __m256i mask_vec = _mm256_set1_epi64x(mask);
491 132354 : __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
492 132354 : deltas = _mm256_and_si256(mask_vec, deltas);
493 132354 : __m256i counters =
494 132354 : _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
495 132354 : *out_lo = _mm256_cvtepi64_epi32(counters);
496 132354 : *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
497 132354 : }
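        : // Scalar equivalent of load_counters4 (a sketch, not part of this file):
        : // lane i of the two outputs receives the low and high halves of the 64-bit
        : // sum counter + (increment_counter ? i : 0):
        : //
        : //   for (size_t i = 0; i < 4; i++) {
        : //     uint64_t t = counter + (increment_counter ? i : 0);
        : //     lo[i] = (uint32_t)t;
        : //     hi[i] = (uint32_t)(t >> 32);
        : //   }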
498 :
499 : static
500 : void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
501 : const uint32_t key[8], uint64_t counter,
502 : bool increment_counter, uint8_t flags,
503 132354 : uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
504 132354 : __m128i h_vecs[8] = {
505 132354 : set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
506 132354 : set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
507 132354 : };
508 132354 : __m128i counter_low_vec, counter_high_vec;
509 132354 : load_counters4(counter, increment_counter, &counter_low_vec,
510 132354 : &counter_high_vec);
511 132354 : uint8_t block_flags = flags | flags_start;
512 :
513 1257648 : for (size_t block = 0; block < blocks; block++) {
514 1125294 : if (block + 1 == blocks) {
515 132354 : block_flags |= flags_end;
516 132354 : }
517 1125294 : __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
518 1125294 : __m128i block_flags_vec = set1_128(block_flags);
519 1125294 : __m128i msg_vecs[16];
520 1125294 : transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
521 :
522 1125294 : __m128i v[16] = {
523 1125294 : h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
524 1125294 : h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
525 1125294 : set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
526 1125294 : counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
527 1125294 : };
528 1125294 : round_fn4(v, msg_vecs, 0);
529 1125294 : round_fn4(v, msg_vecs, 1);
530 1125294 : round_fn4(v, msg_vecs, 2);
531 1125294 : round_fn4(v, msg_vecs, 3);
532 1125294 : round_fn4(v, msg_vecs, 4);
533 1125294 : round_fn4(v, msg_vecs, 5);
534 1125294 : round_fn4(v, msg_vecs, 6);
535 1125294 : h_vecs[0] = xor_128(v[0], v[8]);
536 1125294 : h_vecs[1] = xor_128(v[1], v[9]);
537 1125294 : h_vecs[2] = xor_128(v[2], v[10]);
538 1125294 : h_vecs[3] = xor_128(v[3], v[11]);
539 1125294 : h_vecs[4] = xor_128(v[4], v[12]);
540 1125294 : h_vecs[5] = xor_128(v[5], v[13]);
541 1125294 : h_vecs[6] = xor_128(v[6], v[14]);
542 1125294 : h_vecs[7] = xor_128(v[7], v[15]);
543 :
544 1125294 : block_flags = flags;
545 1125294 : }
546 :
547 132354 : transpose_vecs_128(&h_vecs[0]);
548 132354 : transpose_vecs_128(&h_vecs[4]);
549 : // The first four vecs now contain the first half of each output, and the
550 : // second four vecs contain the second half of each output.
551 132354 : storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
552 132354 : storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
553 132354 : storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
554 132354 : storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
555 132354 : storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
556 132354 : storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
557 132354 : storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
558 132354 : storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
559 132354 : }
560 :
561 : static
562 : void blake3_xof4_avx512(const uint32_t cv[8],
563 : const uint8_t block[BLAKE3_BLOCK_LEN],
564 : uint8_t block_len, uint64_t counter, uint8_t flags,
565 0 : uint8_t out[4 * 64]) {
566 0 : __m128i h_vecs[8] = {
567 0 : set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
568 0 : set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
569 0 : };
570 0 : uint32_t block_words[16];
571 0 : load_block_words(block, block_words);
572 0 : __m128i msg_vecs[16];
573 0 : for (size_t i = 0; i < 16; i++) {
574 0 : msg_vecs[i] = set1_128(block_words[i]);
575 0 : }
576 0 : __m128i counter_low_vec, counter_high_vec;
577 0 : load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
578 0 : __m128i block_len_vec = set1_128(block_len);
579 0 : __m128i block_flags_vec = set1_128(flags);
580 0 : __m128i v[16] = {
581 0 : h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
582 0 : h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
583 0 : set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
584 0 : counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
585 0 : };
586 0 : round_fn4(v, msg_vecs, 0);
587 0 : round_fn4(v, msg_vecs, 1);
588 0 : round_fn4(v, msg_vecs, 2);
589 0 : round_fn4(v, msg_vecs, 3);
590 0 : round_fn4(v, msg_vecs, 4);
591 0 : round_fn4(v, msg_vecs, 5);
592 0 : round_fn4(v, msg_vecs, 6);
593 0 : for (size_t i = 0; i < 8; i++) {
594 0 : v[i] = xor_128(v[i], v[i+8]);
595 0 : v[i+8] = xor_128(v[i+8], h_vecs[i]);
596 0 : }
597 0 : transpose_vecs_128(&v[0]);
598 0 : transpose_vecs_128(&v[4]);
599 0 : transpose_vecs_128(&v[8]);
600 0 : transpose_vecs_128(&v[12]);
601 0 : for (size_t i = 0; i < 4; i++) {
602 0 : storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
603 0 : storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
604 0 : storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
605 0 : storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
606 0 : }
607 0 : }
608 :
609 : /*
610 : * ----------------------------------------------------------------------------
611 : * hash8_avx512
612 : * ----------------------------------------------------------------------------
613 : */
614 :
615 3938606 : INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
616 3938606 : v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
617 3938606 : v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
618 3938606 : v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
619 3938606 : v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
620 3938606 : v[0] = add_256(v[0], v[4]);
621 3938606 : v[1] = add_256(v[1], v[5]);
622 3938606 : v[2] = add_256(v[2], v[6]);
623 3938606 : v[3] = add_256(v[3], v[7]);
624 3938606 : v[12] = xor_256(v[12], v[0]);
625 3938606 : v[13] = xor_256(v[13], v[1]);
626 3938606 : v[14] = xor_256(v[14], v[2]);
627 3938606 : v[15] = xor_256(v[15], v[3]);
628 3938606 : v[12] = rot16_256(v[12]);
629 3938606 : v[13] = rot16_256(v[13]);
630 3938606 : v[14] = rot16_256(v[14]);
631 3938606 : v[15] = rot16_256(v[15]);
632 3938606 : v[8] = add_256(v[8], v[12]);
633 3938606 : v[9] = add_256(v[9], v[13]);
634 3938606 : v[10] = add_256(v[10], v[14]);
635 3938606 : v[11] = add_256(v[11], v[15]);
636 3938606 : v[4] = xor_256(v[4], v[8]);
637 3938606 : v[5] = xor_256(v[5], v[9]);
638 3938606 : v[6] = xor_256(v[6], v[10]);
639 3938606 : v[7] = xor_256(v[7], v[11]);
640 3938606 : v[4] = rot12_256(v[4]);
641 3938606 : v[5] = rot12_256(v[5]);
642 3938606 : v[6] = rot12_256(v[6]);
643 3938606 : v[7] = rot12_256(v[7]);
644 3938606 : v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
645 3938606 : v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
646 3938606 : v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
647 3938606 : v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
648 3938606 : v[0] = add_256(v[0], v[4]);
649 3938606 : v[1] = add_256(v[1], v[5]);
650 3938606 : v[2] = add_256(v[2], v[6]);
651 3938606 : v[3] = add_256(v[3], v[7]);
652 3938606 : v[12] = xor_256(v[12], v[0]);
653 3938606 : v[13] = xor_256(v[13], v[1]);
654 3938606 : v[14] = xor_256(v[14], v[2]);
655 3938606 : v[15] = xor_256(v[15], v[3]);
656 3938606 : v[12] = rot8_256(v[12]);
657 3938606 : v[13] = rot8_256(v[13]);
658 3938606 : v[14] = rot8_256(v[14]);
659 3938606 : v[15] = rot8_256(v[15]);
660 3938606 : v[8] = add_256(v[8], v[12]);
661 3938606 : v[9] = add_256(v[9], v[13]);
662 3938606 : v[10] = add_256(v[10], v[14]);
663 3938606 : v[11] = add_256(v[11], v[15]);
664 3938606 : v[4] = xor_256(v[4], v[8]);
665 3938606 : v[5] = xor_256(v[5], v[9]);
666 3938606 : v[6] = xor_256(v[6], v[10]);
667 3938606 : v[7] = xor_256(v[7], v[11]);
668 3938606 : v[4] = rot7_256(v[4]);
669 3938606 : v[5] = rot7_256(v[5]);
670 3938606 : v[6] = rot7_256(v[6]);
671 3938606 : v[7] = rot7_256(v[7]);
672 :
673 3938606 : v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
674 3938606 : v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
675 3938606 : v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
676 3938606 : v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
677 3938606 : v[0] = add_256(v[0], v[5]);
678 3938606 : v[1] = add_256(v[1], v[6]);
679 3938606 : v[2] = add_256(v[2], v[7]);
680 3938606 : v[3] = add_256(v[3], v[4]);
681 3938606 : v[15] = xor_256(v[15], v[0]);
682 3938606 : v[12] = xor_256(v[12], v[1]);
683 3938606 : v[13] = xor_256(v[13], v[2]);
684 3938606 : v[14] = xor_256(v[14], v[3]);
685 3938606 : v[15] = rot16_256(v[15]);
686 3938606 : v[12] = rot16_256(v[12]);
687 3938606 : v[13] = rot16_256(v[13]);
688 3938606 : v[14] = rot16_256(v[14]);
689 3938606 : v[10] = add_256(v[10], v[15]);
690 3938606 : v[11] = add_256(v[11], v[12]);
691 3938606 : v[8] = add_256(v[8], v[13]);
692 3938606 : v[9] = add_256(v[9], v[14]);
693 3938606 : v[5] = xor_256(v[5], v[10]);
694 3938606 : v[6] = xor_256(v[6], v[11]);
695 3938606 : v[7] = xor_256(v[7], v[8]);
696 3938606 : v[4] = xor_256(v[4], v[9]);
697 3938606 : v[5] = rot12_256(v[5]);
698 3938606 : v[6] = rot12_256(v[6]);
699 3938606 : v[7] = rot12_256(v[7]);
700 3938606 : v[4] = rot12_256(v[4]);
701 3938606 : v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
702 3938606 : v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
703 3938606 : v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
704 3938606 : v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
705 3938606 : v[0] = add_256(v[0], v[5]);
706 3938606 : v[1] = add_256(v[1], v[6]);
707 3938606 : v[2] = add_256(v[2], v[7]);
708 3938606 : v[3] = add_256(v[3], v[4]);
709 3938606 : v[15] = xor_256(v[15], v[0]);
710 3938606 : v[12] = xor_256(v[12], v[1]);
711 3938606 : v[13] = xor_256(v[13], v[2]);
712 3938606 : v[14] = xor_256(v[14], v[3]);
713 3938606 : v[15] = rot8_256(v[15]);
714 3938606 : v[12] = rot8_256(v[12]);
715 3938606 : v[13] = rot8_256(v[13]);
716 3938606 : v[14] = rot8_256(v[14]);
717 3938606 : v[10] = add_256(v[10], v[15]);
718 3938606 : v[11] = add_256(v[11], v[12]);
719 3938606 : v[8] = add_256(v[8], v[13]);
720 3938606 : v[9] = add_256(v[9], v[14]);
721 3938606 : v[5] = xor_256(v[5], v[10]);
722 3938606 : v[6] = xor_256(v[6], v[11]);
723 3938606 : v[7] = xor_256(v[7], v[8]);
724 3938606 : v[4] = xor_256(v[4], v[9]);
725 3938606 : v[5] = rot7_256(v[5]);
726 3938606 : v[6] = rot7_256(v[6]);
727 3938606 : v[7] = rot7_256(v[7]);
728 3938606 : v[4] = rot7_256(v[4]);
729 3938606 : }
730 :
731 1191474 : INLINE void transpose_vecs_256(__m256i vecs[8]) {
732 : // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
733 : // is 22/33/66/77.
734 1191474 : __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
735 1191474 : __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
736 1191474 : __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
737 1191474 : __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
738 1191474 : __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
739 1191474 : __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
740 1191474 : __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
741 1191474 : __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
742 :
743 : // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
744 : // 11/33.
745 1191474 : __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
746 1191474 : __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
747 1191474 : __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
748 1191474 : __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
749 1191474 : __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
750 1191474 : __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
751 1191474 : __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
752 1191474 : __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
753 :
754 : // Interleave 128-bit lanes.
755 1191474 : vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
756 1191474 : vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
757 1191474 : vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
758 1191474 : vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
759 1191474 : vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
760 1191474 : vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
761 1191474 : vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
762 1191474 : vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
763 1191474 : }
764 :
765 : INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
766 562658 : size_t block_offset, __m256i out[16]) {
767 562658 : out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
768 562658 : out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
769 562658 : out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
770 562658 : out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
771 562658 : out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
772 562658 : out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
773 562658 : out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
774 562658 : out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
775 562658 : out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
776 562658 : out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
777 562658 : out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
778 562658 : out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
779 562658 : out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
780 562658 : out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
781 562658 : out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
782 562658 : out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
783 5063922 : for (size_t i = 0; i < 8; ++i) {
784 4501264 : _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
785 4501264 : }
786 562658 : transpose_vecs_256(&out[0]);
787 562658 : transpose_vecs_256(&out[8]);
788 562658 : }
789 :
790 : INLINE void load_counters8(uint64_t counter, bool increment_counter,
791 66158 : __m256i *out_lo, __m256i *out_hi) {
792 66158 : int64_t mask = (increment_counter ? ~0 : 0);
793 66158 : __m512i mask_vec = _mm512_set1_epi64(mask);
794 66158 : __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
795 66158 : deltas = _mm512_and_si512(mask_vec, deltas);
796 66158 : __m512i counters =
797 66158 : _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
798 66158 : *out_lo = _mm512_cvtepi64_epi32(counters);
799 66158 : *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
800 66158 : }
801 :
802 : static
803 : void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
804 : const uint32_t key[8], uint64_t counter,
805 : bool increment_counter, uint8_t flags,
806 66158 : uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
807 66158 : __m256i h_vecs[8] = {
808 66158 : set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
809 66158 : set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
810 66158 : };
811 66158 : __m256i counter_low_vec, counter_high_vec;
812 66158 : load_counters8(counter, increment_counter, &counter_low_vec,
813 66158 : &counter_high_vec);
814 66158 : uint8_t block_flags = flags | flags_start;
815 :
816 628816 : for (size_t block = 0; block < blocks; block++) {
817 562658 : if (block + 1 == blocks) {
818 66158 : block_flags |= flags_end;
819 66158 : }
820 562658 : __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
821 562658 : __m256i block_flags_vec = set1_256(block_flags);
822 562658 : __m256i msg_vecs[16];
823 562658 : transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
824 :
825 562658 : __m256i v[16] = {
826 562658 : h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
827 562658 : h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
828 562658 : set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
829 562658 : counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
830 562658 : };
831 562658 : round_fn8(v, msg_vecs, 0);
832 562658 : round_fn8(v, msg_vecs, 1);
833 562658 : round_fn8(v, msg_vecs, 2);
834 562658 : round_fn8(v, msg_vecs, 3);
835 562658 : round_fn8(v, msg_vecs, 4);
836 562658 : round_fn8(v, msg_vecs, 5);
837 562658 : round_fn8(v, msg_vecs, 6);
838 562658 : h_vecs[0] = xor_256(v[0], v[8]);
839 562658 : h_vecs[1] = xor_256(v[1], v[9]);
840 562658 : h_vecs[2] = xor_256(v[2], v[10]);
841 562658 : h_vecs[3] = xor_256(v[3], v[11]);
842 562658 : h_vecs[4] = xor_256(v[4], v[12]);
843 562658 : h_vecs[5] = xor_256(v[5], v[13]);
844 562658 : h_vecs[6] = xor_256(v[6], v[14]);
845 562658 : h_vecs[7] = xor_256(v[7], v[15]);
846 :
847 562658 : block_flags = flags;
848 562658 : }
849 :
850 66158 : transpose_vecs_256(h_vecs);
851 66158 : storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
852 66158 : storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
853 66158 : storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
854 66158 : storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
855 66158 : storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
856 66158 : storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
857 66158 : storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
858 66158 : storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
859 66158 : }
860 :
861 : static
862 : void blake3_xof8_avx512(const uint32_t cv[8],
863 : const uint8_t block[BLAKE3_BLOCK_LEN],
864 : uint8_t block_len, uint64_t counter, uint8_t flags,
865 0 : uint8_t out[8 * 64]) {
866 0 : __m256i h_vecs[8] = {
867 0 : set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]),
868 0 : set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]),
869 0 : };
870 0 : uint32_t block_words[16];
871 0 : load_block_words(block, block_words);
872 0 : __m256i msg_vecs[16];
873 0 : for (size_t i = 0; i < 16; i++) {
874 0 : msg_vecs[i] = set1_256(block_words[i]);
875 0 : }
876 0 : __m256i counter_low_vec, counter_high_vec;
877 0 : load_counters8(counter, true, &counter_low_vec, &counter_high_vec);
878 0 : __m256i block_len_vec = set1_256(block_len);
879 0 : __m256i block_flags_vec = set1_256(flags);
880 0 : __m256i v[16] = {
881 0 : h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
882 0 : h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
883 0 : set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
884 0 : counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
885 0 : };
886 0 : round_fn8(v, msg_vecs, 0);
887 0 : round_fn8(v, msg_vecs, 1);
888 0 : round_fn8(v, msg_vecs, 2);
889 0 : round_fn8(v, msg_vecs, 3);
890 0 : round_fn8(v, msg_vecs, 4);
891 0 : round_fn8(v, msg_vecs, 5);
892 0 : round_fn8(v, msg_vecs, 6);
893 0 : for (size_t i = 0; i < 8; i++) {
894 0 : v[i] = xor_256(v[i], v[i+8]);
895 0 : v[i+8] = xor_256(v[i+8], h_vecs[i]);
896 0 : }
897 0 : transpose_vecs_256(&v[0]);
898 0 : transpose_vecs_256(&v[8]);
899 0 : for (size_t i = 0; i < 8; i++) {
900 0 : storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]);
901 0 : storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]);
902 0 : }
903 0 : }
904 :
905 : /*
906 : * ----------------------------------------------------------------------------
907 : * hash16_avx512
908 : * ----------------------------------------------------------------------------
909 : */
910 :
911 19415123 : INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
912 19415123 : v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
913 19415123 : v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
914 19415123 : v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
915 19415123 : v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
916 19415123 : v[0] = add_512(v[0], v[4]);
917 19415123 : v[1] = add_512(v[1], v[5]);
918 19415123 : v[2] = add_512(v[2], v[6]);
919 19415123 : v[3] = add_512(v[3], v[7]);
920 19415123 : v[12] = xor_512(v[12], v[0]);
921 19415123 : v[13] = xor_512(v[13], v[1]);
922 19415123 : v[14] = xor_512(v[14], v[2]);
923 19415123 : v[15] = xor_512(v[15], v[3]);
924 19415123 : v[12] = rot16_512(v[12]);
925 19415123 : v[13] = rot16_512(v[13]);
926 19415123 : v[14] = rot16_512(v[14]);
927 19415123 : v[15] = rot16_512(v[15]);
928 19415123 : v[8] = add_512(v[8], v[12]);
929 19415123 : v[9] = add_512(v[9], v[13]);
930 19415123 : v[10] = add_512(v[10], v[14]);
931 19415123 : v[11] = add_512(v[11], v[15]);
932 19415123 : v[4] = xor_512(v[4], v[8]);
933 19415123 : v[5] = xor_512(v[5], v[9]);
934 19415123 : v[6] = xor_512(v[6], v[10]);
935 19415123 : v[7] = xor_512(v[7], v[11]);
936 19415123 : v[4] = rot12_512(v[4]);
937 19415123 : v[5] = rot12_512(v[5]);
938 19415123 : v[6] = rot12_512(v[6]);
939 19415123 : v[7] = rot12_512(v[7]);
940 19415123 : v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
941 19415123 : v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
942 19415123 : v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
943 19415123 : v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
944 19415123 : v[0] = add_512(v[0], v[4]);
945 19415123 : v[1] = add_512(v[1], v[5]);
946 19415123 : v[2] = add_512(v[2], v[6]);
947 19415123 : v[3] = add_512(v[3], v[7]);
948 19415123 : v[12] = xor_512(v[12], v[0]);
949 19415123 : v[13] = xor_512(v[13], v[1]);
950 19415123 : v[14] = xor_512(v[14], v[2]);
951 19415123 : v[15] = xor_512(v[15], v[3]);
952 19415123 : v[12] = rot8_512(v[12]);
953 19415123 : v[13] = rot8_512(v[13]);
954 19415123 : v[14] = rot8_512(v[14]);
955 19415123 : v[15] = rot8_512(v[15]);
956 19415123 : v[8] = add_512(v[8], v[12]);
957 19415123 : v[9] = add_512(v[9], v[13]);
958 19415123 : v[10] = add_512(v[10], v[14]);
959 19415123 : v[11] = add_512(v[11], v[15]);
960 19415123 : v[4] = xor_512(v[4], v[8]);
961 19415123 : v[5] = xor_512(v[5], v[9]);
962 19415123 : v[6] = xor_512(v[6], v[10]);
963 19415123 : v[7] = xor_512(v[7], v[11]);
964 19415123 : v[4] = rot7_512(v[4]);
965 19415123 : v[5] = rot7_512(v[5]);
966 19415123 : v[6] = rot7_512(v[6]);
967 19415123 : v[7] = rot7_512(v[7]);
968 :
969 19415123 : v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
970 19415123 : v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
971 19415123 : v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
972 19415123 : v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
973 19415123 : v[0] = add_512(v[0], v[5]);
974 19415123 : v[1] = add_512(v[1], v[6]);
975 19415123 : v[2] = add_512(v[2], v[7]);
976 19415123 : v[3] = add_512(v[3], v[4]);
977 19415123 : v[15] = xor_512(v[15], v[0]);
978 19415123 : v[12] = xor_512(v[12], v[1]);
979 19415123 : v[13] = xor_512(v[13], v[2]);
980 19415123 : v[14] = xor_512(v[14], v[3]);
981 19415123 : v[15] = rot16_512(v[15]);
982 19415123 : v[12] = rot16_512(v[12]);
983 19415123 : v[13] = rot16_512(v[13]);
984 19415123 : v[14] = rot16_512(v[14]);
985 19415123 : v[10] = add_512(v[10], v[15]);
986 19415123 : v[11] = add_512(v[11], v[12]);
987 19415123 : v[8] = add_512(v[8], v[13]);
988 19415123 : v[9] = add_512(v[9], v[14]);
989 19415123 : v[5] = xor_512(v[5], v[10]);
990 19415123 : v[6] = xor_512(v[6], v[11]);
991 19415123 : v[7] = xor_512(v[7], v[8]);
992 19415123 : v[4] = xor_512(v[4], v[9]);
993 19415123 : v[5] = rot12_512(v[5]);
994 19415123 : v[6] = rot12_512(v[6]);
995 19415123 : v[7] = rot12_512(v[7]);
996 19415123 : v[4] = rot12_512(v[4]);
997 19415123 : v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
998 19415123 : v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
999 19415123 : v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
1000 19415123 : v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
1001 19415123 : v[0] = add_512(v[0], v[5]);
1002 19415123 : v[1] = add_512(v[1], v[6]);
1003 19415123 : v[2] = add_512(v[2], v[7]);
1004 19415123 : v[3] = add_512(v[3], v[4]);
1005 19415123 : v[15] = xor_512(v[15], v[0]);
1006 19415123 : v[12] = xor_512(v[12], v[1]);
1007 19415123 : v[13] = xor_512(v[13], v[2]);
1008 19415123 : v[14] = xor_512(v[14], v[3]);
1009 19415123 : v[15] = rot8_512(v[15]);
1010 19415123 : v[12] = rot8_512(v[12]);
1011 19415123 : v[13] = rot8_512(v[13]);
1012 19415123 : v[14] = rot8_512(v[14]);
1013 19415123 : v[10] = add_512(v[10], v[15]);
1014 19415123 : v[11] = add_512(v[11], v[12]);
1015 19415123 : v[8] = add_512(v[8], v[13]);
1016 19415123 : v[9] = add_512(v[9], v[14]);
1017 19415123 : v[5] = xor_512(v[5], v[10]);
1018 19415123 : v[6] = xor_512(v[6], v[11]);
1019 19415123 : v[7] = xor_512(v[7], v[8]);
1020 19415123 : v[4] = xor_512(v[4], v[9]);
1021 19415123 : v[5] = rot7_512(v[5]);
1022 19415123 : v[6] = rot7_512(v[6]);
1023 19415123 : v[7] = rot7_512(v[7]);
1024 19415123 : v[4] = rot7_512(v[4]);
1025 19415123 : }
1026 :
1027 : // 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
1028 : #define LO_IMM8 0x88
1029 :
1030 49130448 : INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
1031 49130448 : return _mm512_shuffle_i32x4(a, b, LO_IMM8);
1032 49130448 : }
1033 :
1034 : // 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
1035 : #define HI_IMM8 0xdd
1036 :
1037 49130448 : INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
1038 49130448 : return _mm512_shuffle_i32x4(a, b, HI_IMM8);
1039 49130448 : }
1040 :
1041 3070653 : INLINE void transpose_vecs_512(__m512i vecs[16]) {
1042 : // Interleave 32-bit lanes. The _0 unpack is lanes
1043 : // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
1044 : // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
1045 3070653 : __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
1046 3070653 : __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
1047 3070653 : __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
1048 3070653 : __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
1049 3070653 : __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
1050 3070653 : __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
1051 3070653 : __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
1052 3070653 : __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
1053 3070653 : __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
1054 3070653 : __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
1055 3070653 : __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
1056 3070653 : __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
1057 3070653 : __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
1058 3070653 : __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
1059 3070653 : __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
1060 3070653 : __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
1061 :
1062 : // Interleave 64-bit lanes. The _0 unpack is lanes
1063 : // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
1064 : // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
1065 : // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
1066 : // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
1067 3070653 : __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
1068 3070653 : __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
1069 3070653 : __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
1070 3070653 : __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
1071 3070653 : __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
1072 3070653 : __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
1073 3070653 : __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
1074 3070653 : __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
1075 3070653 : __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
1076 3070653 : __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
1077 3070653 : __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
1078 3070653 : __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
1079 3070653 : __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
1080 3070653 : __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
1081 3070653 : __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
1082 3070653 : __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
1083 :
1084 : // Interleave 128-bit lanes. The _0 unpack is
1085 : // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
1086 : // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
1087 3070653 : __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
1088 3070653 : __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
1089 3070653 : __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
1090 3070653 : __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
1091 3070653 : __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
1092 3070653 : __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
1093 3070653 : __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
1094 3070653 : __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
1095 3070653 : __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
1096 3070653 : __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
1097 3070653 : __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
1098 3070653 : __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
1099 3070653 : __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
1100 3070653 : __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
1101 3070653 : __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
1102 3070653 : __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
1103 :
1104 : // Interleave 128-bit lanes again for the final outputs.
1105 3070653 : vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
1106 3070653 : vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
1107 3070653 : vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
1108 3070653 : vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
1109 3070653 : vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
1110 3070653 : vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
1111 3070653 : vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
1112 3070653 : vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
1113 3070653 : vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
1114 3070653 : vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
1115 3070653 : vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
1116 3070653 : vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
1117 3070653 : vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
1118 3070653 : vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
1119 3070653 : vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
1120 3070653 : vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
1121 3070653 : }
1122 :
1123 : INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
1124 2772979 : size_t block_offset, __m512i out[16]) {
1125 2772979 : out[0] = loadu_512(&inputs[0][block_offset]);
1126 2772979 : out[1] = loadu_512(&inputs[1][block_offset]);
1127 2772979 : out[2] = loadu_512(&inputs[2][block_offset]);
1128 2772979 : out[3] = loadu_512(&inputs[3][block_offset]);
1129 2772979 : out[4] = loadu_512(&inputs[4][block_offset]);
1130 2772979 : out[5] = loadu_512(&inputs[5][block_offset]);
1131 2772979 : out[6] = loadu_512(&inputs[6][block_offset]);
1132 2772979 : out[7] = loadu_512(&inputs[7][block_offset]);
1133 2772979 : out[8] = loadu_512(&inputs[8][block_offset]);
1134 2772979 : out[9] = loadu_512(&inputs[9][block_offset]);
1135 2772979 : out[10] = loadu_512(&inputs[10][block_offset]);
1136 2772979 : out[11] = loadu_512(&inputs[11][block_offset]);
1137 2772979 : out[12] = loadu_512(&inputs[12][block_offset]);
1138 2772979 : out[13] = loadu_512(&inputs[13][block_offset]);
1139 2772979 : out[14] = loadu_512(&inputs[14][block_offset]);
1140 2772979 : out[15] = loadu_512(&inputs[15][block_offset]);
1141 47140643 : for (size_t i = 0; i < 16; ++i) {
1142 44367664 : _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1143 44367664 : }
1144 2772979 : transpose_vecs_512(out);
1145 2772979 : }
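        : // Unlike the 4x and 8x variants, each input's 64-byte block fits in a single
        : // 512-bit load here, so the sixteen loads above read one full block per
        : // input before the word-level transpose.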
1146 :
1147 : INLINE void load_counters16(uint64_t counter, bool increment_counter,
1148 297674 : __m512i *out_lo, __m512i *out_hi) {
1149 297674 : const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
1150 297674 : const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1151 297674 : const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
1152 297674 : const __m512i low_words = _mm512_add_epi32(
1153 297674 : _mm512_set1_epi32((int32_t)counter),
1154 297674 : masked_deltas);
1155 : // The carry bit is 1 if the high bit of the word was 1 before addition and is
1156 : // 0 after.
1157 : // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
1158 : // compute the carry bits here, and originally we did, but that intrinsic is
1159 : // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
1160 297674 : const __m512i carries = _mm512_srli_epi32(
1161 297674 : _mm512_andnot_si512(
1162 297674 : low_words, // 0 after (gets inverted by andnot)
1163 297674 : _mm512_set1_epi32((int32_t)counter)), // and 1 before
1164 297674 : 31);
1165 297674 : const __m512i high_words = _mm512_add_epi32(
1166 297674 : _mm512_set1_epi32((int32_t)(counter >> 32)),
1167 297674 : carries);
1168 297674 : *out_lo = low_words;
1169 297674 : *out_hi = high_words;
1170 297674 : }
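             :
             : /*
             :  * For exposition only (not part of the upstream source): a scalar sketch of
             :  * the per-lane counter computation above, under a hypothetical helper name.
             :  * Lane i receives the low and high 32 bits of (counter + i) when
             :  * increment_counter is set, and of counter itself otherwise. The andnot/srli
             :  * pair above is the SIMD form of the carry expression below, which is exact
             :  * because each per-lane delta is at most 15.
             :  */
             : static inline void load_counters16_scalar_sketch(uint64_t counter,
             :                                                   bool increment_counter,
             :                                                   uint32_t lo[16],
             :                                                   uint32_t hi[16]) {
             :   for (size_t i = 0; i < 16; i++) {
             :     uint32_t delta = increment_counter ? (uint32_t)i : 0;
             :     uint32_t old_lo = (uint32_t)counter;
             :     uint32_t new_lo = old_lo + delta;
             :     // The addition carried out of 32 bits iff the high bit was 1 before and
             :     // is 0 afterwards (valid because delta < 2^31).
             :     uint32_t carry = (~new_lo & old_lo) >> 31;
             :     lo[i] = new_lo;
             :     hi[i] = (uint32_t)(counter >> 32) + carry;
             :   }
             : }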
1171 :
1172 : static
1173 : void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
1174 : const uint32_t key[8], uint64_t counter,
1175 : bool increment_counter, uint8_t flags,
1176 : uint8_t flags_start, uint8_t flags_end,
1177 297064 : uint8_t *out) {
1178 297064 : __m512i h_vecs[8] = {
1179 297064 : set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
1180 297064 : set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
1181 297064 : };
1182 297064 : __m512i counter_low_vec, counter_high_vec;
1183 297064 : load_counters16(counter, increment_counter, &counter_low_vec,
1184 297064 : &counter_high_vec);
1185 297064 : uint8_t block_flags = flags | flags_start;
1186 :
1187 3070043 : for (size_t block = 0; block < blocks; block++) {
1188 2772979 : if (block + 1 == blocks) {
1189 297064 : block_flags |= flags_end;
1190 297064 : }
1191 2772979 : __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
1192 2772979 : __m512i block_flags_vec = set1_512(block_flags);
1193 2772979 : __m512i msg_vecs[16];
1194 2772979 : transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1195 :
1196 2772979 : __m512i v[16] = {
1197 2772979 : h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
1198 2772979 : h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
1199 2772979 : set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
1200 2772979 : counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
1201 2772979 : };
1202 2772979 : round_fn16(v, msg_vecs, 0);
1203 2772979 : round_fn16(v, msg_vecs, 1);
1204 2772979 : round_fn16(v, msg_vecs, 2);
1205 2772979 : round_fn16(v, msg_vecs, 3);
1206 2772979 : round_fn16(v, msg_vecs, 4);
1207 2772979 : round_fn16(v, msg_vecs, 5);
1208 2772979 : round_fn16(v, msg_vecs, 6);
1209 2772979 : h_vecs[0] = xor_512(v[0], v[8]);
1210 2772979 : h_vecs[1] = xor_512(v[1], v[9]);
1211 2772979 : h_vecs[2] = xor_512(v[2], v[10]);
1212 2772979 : h_vecs[3] = xor_512(v[3], v[11]);
1213 2772979 : h_vecs[4] = xor_512(v[4], v[12]);
1214 2772979 : h_vecs[5] = xor_512(v[5], v[13]);
1215 2772979 : h_vecs[6] = xor_512(v[6], v[14]);
1216 2772979 : h_vecs[7] = xor_512(v[7], v[15]);
1217 :
1218 2772979 : block_flags = flags;
1219 2772979 : }
1220 :
1221 :   // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
1222 :   // state vectors. Pad the matrix with zeros. After transposition, store the
1223 :   // lower half of each vector; an equivalent unmasked store is sketched after this function.
1224 297064 : __m512i padded[16] = {
1225 297064 : h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
1226 297064 : h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
1227 297064 : set1_512(0), set1_512(0), set1_512(0), set1_512(0),
1228 297064 : set1_512(0), set1_512(0), set1_512(0), set1_512(0),
1229 297064 : };
1230 297064 : transpose_vecs_512(padded);
1231 297064 : _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
1232 297064 : _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
1233 297064 : _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
1234 297064 : _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
1235 297064 : _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
1236 297064 : _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
1237 297064 : _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
1238 297064 : _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
1239 297064 : _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
1240 297064 : _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
1241 297064 : _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
1242 297064 : _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
1243 297064 : _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
1244 297064 : _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
1245 297064 : _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
1246 297064 : _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
1247 297064 : }
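             :
             : /*
             :  * For exposition only (not part of the upstream source): with an all-ones
             :  * mask, each _mm256_mask_storeu_epi32 above is equivalent to a plain
             :  * unaligned store of the low 256 bits of padded[i], i.e. the 8-word
             :  * (32-byte) chaining value of input i. A sketch of the same output step
             :  * using the storeu_256 helper:
             :  */
             : static inline void store_cvs16_sketch(const __m512i padded[16],
             :                                       uint8_t out[16 * 32]) {
             :   for (size_t i = 0; i < 16; i++) {
             :     // The low half of each transposed vector holds one input's chaining value.
             :     storeu_256(_mm512_castsi512_si256(padded[i]), &out[i * sizeof(__m256i)]);
             :   }
             : }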
1248 :
1249 : static
1250 : void blake3_xof16_avx512(const uint32_t cv[8],
1251 : const uint8_t block[BLAKE3_BLOCK_LEN],
1252 : uint8_t block_len, uint64_t counter, uint8_t flags,
1253 610 : uint8_t out[16 * 64]) {
1254 610 : __m512i h_vecs[8] = {
1255 610 : set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
1256 610 : set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
1257 610 : };
1258 610 : uint32_t block_words[16];
1259 610 : load_block_words(block, block_words);
1260 610 : __m512i msg_vecs[16];
1261 10370 : for (size_t i = 0; i < 16; i++) {
1262 9760 : msg_vecs[i] = set1_512(block_words[i]);
1263 9760 : }
1264 610 : __m512i counter_low_vec, counter_high_vec;
1265 610 : load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
1266 610 : __m512i block_len_vec = set1_512(block_len);
1267 610 : __m512i block_flags_vec = set1_512(flags);
1268 610 : __m512i v[16] = {
1269 610 : h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
1270 610 : h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
1271 610 : set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
1272 610 : counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
1273 610 : };
1274 610 : round_fn16(v, msg_vecs, 0);
1275 610 : round_fn16(v, msg_vecs, 1);
1276 610 : round_fn16(v, msg_vecs, 2);
1277 610 : round_fn16(v, msg_vecs, 3);
1278 610 : round_fn16(v, msg_vecs, 4);
1279 610 : round_fn16(v, msg_vecs, 5);
1280 610 : round_fn16(v, msg_vecs, 6);
1281 5490 : for (size_t i = 0; i < 8; i++) {
1282 4880 :     v[i] = xor_512(v[i], v[i+8]);        // first 32 bytes of each lane's output block
1283 4880 :     v[i+8] = xor_512(v[i+8], h_vecs[i]); // last 32 bytes: the input CV fed forward
1284 4880 : }
1285 610 : transpose_vecs_512(&v[0]);
1286 10370 : for (size_t i = 0; i < 16; i++) {
1287 9760 : storeu_512(v[i], &out[i * sizeof(__m512i)]);
1288 9760 : }
1289 610 : }
1290 :
1291 : /*
1292 : * ----------------------------------------------------------------------------
1293 : * hash_many_avx512
1294 : * ----------------------------------------------------------------------------
1295 : */
1296 :
1297 : INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
1298 : const uint32_t key[8], uint64_t counter,
1299 : uint8_t flags, uint8_t flags_start,
1300 530092 : uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
1301 530092 : uint32_t cv[8];
1302 530092 : memcpy(cv, key, BLAKE3_KEY_LEN);
1303 530092 : uint8_t block_flags = flags | flags_start;
1304 5040944 : while (blocks > 0) {
1305 4510852 : if (blocks == 1) {
1306 530092 : block_flags |= flags_end;
1307 530092 : }
1308 4510852 : blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
1309 4510852 : block_flags);
1310 4510852 : input = &input[BLAKE3_BLOCK_LEN];
1311 4510852 : blocks -= 1;
1312 4510852 : block_flags = flags;
1313 4510852 : }
1314 530092 : memcpy(out, cv, BLAKE3_OUT_LEN);
1315 530092 : }
1316 :
1317 : void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
1318 : size_t blocks, const uint32_t key[8],
1319 : uint64_t counter, bool increment_counter,
1320 : uint8_t flags, uint8_t flags_start,
1321 760622 : uint8_t flags_end, uint8_t *out) {
1322 1057686 : while (num_inputs >= 16) {
1323 297064 : blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
1324 297064 : flags_start, flags_end, out);
1325 297064 : if (increment_counter) {
1326 165061 : counter += 16;
1327 165061 : }
1328 297064 : inputs += 16;
1329 297064 : num_inputs -= 16;
1330 297064 : out = &out[16 * BLAKE3_OUT_LEN];
1331 297064 : }
1332 826780 : while (num_inputs >= 8) {
1333 66158 : blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
1334 66158 : flags_start, flags_end, out);
1335 66158 : if (increment_counter) {
1336 33100 : counter += 8;
1337 33100 : }
1338 66158 : inputs += 8;
1339 66158 : num_inputs -= 8;
1340 66158 : out = &out[8 * BLAKE3_OUT_LEN];
1341 66158 : }
1342 892976 : while (num_inputs >= 4) {
1343 132354 : blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
1344 132354 : flags_start, flags_end, out);
1345 132354 : if (increment_counter) {
1346 66196 : counter += 4;
1347 66196 : }
1348 132354 : inputs += 4;
1349 132354 : num_inputs -= 4;
1350 132354 : out = &out[4 * BLAKE3_OUT_LEN];
1351 132354 : }
1352 1290714 : while (num_inputs > 0) {
1353 530092 : hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
1354 530092 : flags_end, out);
1355 530092 : if (increment_counter) {
1356 265384 : counter += 1;
1357 265384 : }
1358 530092 : inputs += 1;
1359 530092 : num_inputs -= 1;
1360 530092 : out = &out[BLAKE3_OUT_LEN];
1361 530092 : }
1362 760622 : }
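             :
             : /*
             :  * For exposition only (not part of the upstream source): a usage sketch of
             :  * blake3_hash_many_avx512 with hypothetical buffers and 23 full-chunk
             :  * inputs. With increment_counter set, the loops above process them as one
             :  * 16-way batch (counters counter..counter+15), one 4-way batch
             :  * (counter+16..counter+19), and three single inputs (counter+20..counter+22);
             :  * the 8-way path is skipped because only 7 inputs remain after the 16-way
             :  * pass.
             :  */
             : static inline void hash_many_usage_sketch(const uint8_t *const inputs[23],
             :                                           const uint32_t key[8],
             :                                           uint64_t counter,
             :                                           uint8_t out[23 * BLAKE3_OUT_LEN]) {
             :   // Each input points to a full 1024-byte chunk (16 blocks); the first block
             :   // of each chunk gets CHUNK_START and the last gets CHUNK_END.
             :   blake3_hash_many_avx512(inputs, 23, 16, key, counter,
             :                           true, 0, CHUNK_START, CHUNK_END, out);
             : }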
1363 :
1364 : void blake3_xof_many_avx512(const uint32_t cv[8],
1365 : const uint8_t block[BLAKE3_BLOCK_LEN],
1366 : uint8_t block_len, uint64_t counter, uint8_t flags,
1367 305 : uint8_t* out, size_t outblocks) {
1368 915 : while (outblocks >= 16) {
1369 610 : blake3_xof16_avx512(cv, block, block_len, counter, flags, out);
1370 610 : counter += 16;
1371 610 : outblocks -= 16;
1372 610 : out += 16 * BLAKE3_BLOCK_LEN;
1373 610 : }
1374 305 : while (outblocks >= 8) {
1375 0 : blake3_xof8_avx512(cv, block, block_len, counter, flags, out);
1376 0 : counter += 8;
1377 0 : outblocks -= 8;
1378 0 : out += 8 * BLAKE3_BLOCK_LEN;
1379 0 : }
1380 305 : while (outblocks >= 4) {
1381 0 : blake3_xof4_avx512(cv, block, block_len, counter, flags, out);
1382 0 : counter += 4;
1383 0 : outblocks -= 4;
1384 0 : out += 4 * BLAKE3_BLOCK_LEN;
1385 0 : }
1386 305 : while (outblocks > 0) {
1387 0 : blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
1388 0 : counter += 1;
1389 0 : outblocks -= 1;
1390 0 : out += BLAKE3_BLOCK_LEN;
1391 0 : }
1392 305 : }