
// Source originally from https://github.com/BLAKE3-team/BLAKE3
// From commit: c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78

#include "fd_blake3.h"
#include "fd_blake3_private.h"
#include "../../util/simd/fd_sse.h"
#include <assert.h>

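/* Single-lane BLAKE3 compression for 128-bit SSE: the 4x4 state matrix of
   32-bit words lives in four vu_t registers, one row per register. */
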
#define _mm_shuffle_ps2(a, b, c)                                    \
  (_mm_castps_si128(                                                \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))

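/* 32-bit right-rotations used by the G function.  rot16 is a 16-bit lane
   swap and rot8 a byte shuffle, which avoid the shift/shift/xor sequence
   that rot12 and rot7 fall back to. */
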
#define vu_rot16 vb_exch_adj_pair

static inline __attribute__((always_inline)) vu_t
vu_rot12( vu_t x ) {
  return vu_xor( vu_shr( x, 12 ), vu_shl( x, 32-12 ) );
}

static inline __attribute__((always_inline)) vu_t
vu_rot8( vu_t x ) {
  vb_t const mask = vb( 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 );
  return _mm_shuffle_epi8( x, mask );
}

static inline __attribute__((always_inline)) vu_t
vu_rot7( vu_t x ) {
  return vu_xor( vu_shr( x, 7 ), vu_shl( x, 32-7 ) );
}

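/* g1 and g2 are the two halves of the BLAKE3 G function, applied to all four
   columns (or diagonals) of the state at once, one state row per vector.
   For reference, scalar G from the BLAKE3 spec is roughly:

     a += b + m0; d = rotr32(d ^ a, 16); c += d; b = rotr32(b ^ c, 12);  // g1
     a += b + m1; d = rotr32(d ^ a,  8); c += d; b = rotr32(b ^ c,  7);  // g2 */
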
static inline __attribute__((always_inline)) void
g1( vu_t * row0,
    vu_t * row1,
    vu_t * row2,
    vu_t * row3,
    vu_t   m ) {
  *row0 = vu_add(vu_add(*row0, m), *row1);
  *row3 = vu_xor(*row3, *row0);
  *row3 = vu_rot16(*row3);
  *row2 = vu_add(*row2, *row3);
  *row1 = vu_xor(*row1, *row2);
  *row1 = vu_rot12(*row1);
}

static inline __attribute__((always_inline)) void
g2( vu_t * row0,
    vu_t * row1,
    vu_t * row2,
    vu_t * row3,
    vu_t   m ) {
  *row0 = vu_add(vu_add(*row0, m), *row1);
  *row3 = vu_xor(*row3, *row0);
  *row3 = vu_rot8(*row3);
  *row2 = vu_add(*row2, *row3);
  *row1 = vu_xor(*row1, *row2);
  *row1 = vu_rot7(*row1);
}

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
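// After the column step, rotating rows 0, 2, and 3 (with row1 staying put, per
// the note above) lines each diagonal up into a column, so the same g1/g2 code
// performs the diagonal step; undiagonalize reverses the rotation afterwards.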
static inline __attribute__((always_inline)) void
diagonalize(vu_t *row0, vu_t *row2, vu_t *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}

static inline __attribute__((always_inline)) void
undiagonalize(vu_t *row0, vu_t *row2, vu_t *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}

static inline __attribute__((always_inline)) void
compress_pre( vu_t        rows[4],
              uint  const cv[ static 8 ],
              uchar const block[ static FD_BLAKE3_BLOCK_SZ ],
              uint        block_len,
              ulong       ctr,
              uint        flags ) {
  rows[0] = vu_ld( cv   );
  rows[1] = vu_ld( cv+4 );
  rows[2] = vu( FD_BLAKE3_IV[0], FD_BLAKE3_IV[1], FD_BLAKE3_IV[2], FD_BLAKE3_IV[3] );
  rows[3] = vu( (uint)(ctr&UINT_MAX), (uint)(ctr>>32),
                block_len, flags );

  vu_t m0 = vb_ldu( block    ); vu_t m1 = vb_ldu( block+16 );
  vu_t m2 = vb_ldu( block+32 ); vu_t m3 = vb_ldu( block+48 );

  vu_t t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
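  // The trailing comments on the shuffles below list the message word indices
  // held in each vector, from the highest SSE lane down to the lowest.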
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10 8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11 9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
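  // The fixed permutation is sigma = [2,6,3,10,7,0,4,13,1,11,12,5,9,14,15,8]
  // (BLAKE3 spec); the shuffle/blend/unpack sequences below realize it on the
  // four message vectors, adjusted for the row1-fixed diagonalization above.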
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}

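/* compress_pre leaves the full 16-word state in rows[].  The caller derives
   the new 8-word chaining value as rows[0..1] ^ rows[2..3] and, when extended
   output is wanted, the upper 8 output words as rows[2..3] ^ the input CV.

   fd_blake3_sse_compress1 compresses a message of up to FD_BLAKE3_CHUNK_SZ
   bytes, one 64-byte block at a time, into a 32-byte chaining value at out
   (64 bytes when extended output is produced). */
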
void
fd_blake3_sse_compress1( uchar * restrict       out,
                         uchar const * restrict msg,
                         uint                   msg_sz,
                         ulong                  counter,
                         uint const             flags,
                         uchar * restrict       out_chain,
                         uchar const * restrict in_chain ) {
  FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1(out=%p,msg=%p,sz=%u,counter=%lu,flags=%02x)",
                    (void *)out, (void *)msg, msg_sz, counter, flags ));
  assert( msg_sz<=FD_BLAKE3_CHUNK_SZ );

  uint cv[8] = { FD_BLAKE3_IV[0], FD_BLAKE3_IV[1], FD_BLAKE3_IV[2], FD_BLAKE3_IV[3],
                 FD_BLAKE3_IV[4], FD_BLAKE3_IV[5], FD_BLAKE3_IV[6], FD_BLAKE3_IV[7] };
  if( FD_UNLIKELY( in_chain ) ) {
    memcpy( cv, in_chain, FD_BLAKE3_OUTCHAIN_SZ );
  }
  vu_t rows[4];

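  /* Parent (non-leaf) compressions must not carry the CHUNK_START/CHUNK_END
     flags, so they get masked off below.  For chunk compressions, the first
     block gets CHUNK_START (unless resuming from an input chaining value) and
     only the final block keeps CHUNK_END (and ROOT, if requested). */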
  uint flag_mask = ~fd_uint_if( flags&FD_BLAKE3_FLAG_PARENT,
                                FD_BLAKE3_FLAG_CHUNK_START|FD_BLAKE3_FLAG_CHUNK_END,
                                0U );

  uint block_flags = flags | (flag_mask & FD_BLAKE3_FLAG_CHUNK_START);
  if( FD_UNLIKELY( in_chain && !(flags&FD_BLAKE3_FLAG_CHUNK_START) ) ) {
    block_flags &= ~FD_BLAKE3_FLAG_CHUNK_START;
  }
  do {
    uint block_sz = fd_uint_min( msg_sz, FD_BLAKE3_BLOCK_SZ );
    block_flags |= FD_BLAKE3_FLAG_CHUNK_END;
    block_flags &= (flag_mask & ~fd_uint_if( msg_sz<=FD_BLAKE3_BLOCK_SZ, 0, (FD_BLAKE3_FLAG_CHUNK_END|FD_BLAKE3_FLAG_ROOT) ) );

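    /* A block shorter than FD_BLAKE3_BLOCK_SZ (only possible for the final
       block) is copied into a zeroed, 16-byte aligned scratch buffer, matching
       BLAKE3's zero padding of the last block. */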
    uchar tail[ FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(16)));
    uchar const * restrict block;
    if( FD_LIKELY( msg_sz>=FD_BLAKE3_BLOCK_SZ ) ) {
      block = msg;
    } else {
      vb_st( tail,    vu_zero() );
      vb_st( tail+16, vu_zero() );
      vb_st( tail+32, vu_zero() );
      vb_st( tail+48, vu_zero() );
      fd_memcpy( tail, msg, msg_sz );
      block = tail;
    }

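    /* When the caller requests the output chaining state (out_chain!=NULL),
       the final block is not compressed here: the raw block bytes and the CV
       entering that block are handed back instead (out doubles as scratch for
       the block bytes, see the FIXMEs below), and compression of that block is
       left to the expand/XOF stage. */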
    if( FD_UNLIKELY( out_chain && (block_flags & FD_BLAKE3_FLAG_CHUNK_END) ) ) {
      /* FIXME better document and polish the transition from the compress
         part to the expand part. */
      fd_memcpy( out, block, FD_BLAKE3_BLOCK_SZ ); /* FIXME DOCUMENT OVERLOADING OF OUT ARGUMENT */
      fd_memcpy( out_chain, cv, FD_BLAKE3_OUTCHAIN_SZ );
      FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: done (XOF mode)" ));
      return;
    }

    FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: sz=%u counter=%u flags=%x", block_sz, (uint)counter, block_flags ));
    compress_pre( rows, cv, block, block_sz, counter, block_flags );
    if( FD_UNLIKELY( in_chain ) ) {
      /* FIXME UGLY */
      vu_stu( out+32, vu_xor( vu_ld( cv   ), rows[2] ) );
      vu_stu( out+48, vu_xor( vu_ld( cv+4 ), rows[3] ) );
    }
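    /* New chaining value: the low half of the state XORed with the high half. */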
    vu_st( cv,   vu_xor( rows[0], rows[2] ) );
    vu_st( cv+4, vu_xor( rows[1], rows[3] ) );
    msg    += FD_BLAKE3_BLOCK_SZ;
    msg_sz -= block_sz;
    block_flags = flags;
  } while( (int)msg_sz>0 );

  vu_stu( out,    vu_ld( cv   ) );
  vu_stu( out+16, vu_ld( cv+4 ) );

  FD_BLAKE3_TRACE(( "fd_blake3_sse_compress1: done" ));
}
|