// Source originally from https://github.com/BLAKE3-team/BLAKE3
// From commit: c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78

#include "fd_blake3_private.h"
#include "../../util/simd/fd_avx512.h"
#include "../../util/simd/fd_avx.h"

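/* round_fn16 applies one full BLAKE3 round to 16 independent
   compression states held in transposed (structure-of-arrays) form:
   v[i] holds state word i of all 16 lanes and m[i] holds message
   word i of all 16 lanes.  The first half below is the "column" G
   step, the second half the "diagonal" G step, using the standard
   rotation amounts 16, 12, 8, 7; FD_BLAKE3_MSG_SCHEDULE[r] gives the
   message word permutation for round r.  For orientation, each lane
   computes the scalar quarter round of the reference algorithm
   (a sketch only, not compiled here):

     g( v, a, b, c, d, mx, my ):
       v[a] += v[b] + mx;  v[d] = rotr32( v[d] ^ v[a], 16 );
       v[c] += v[d];       v[b] = rotr32( v[b] ^ v[c], 12 );
       v[a] += v[b] + my;  v[d] = rotr32( v[d] ^ v[a],  8 );
       v[c] += v[d];       v[b] = rotr32( v[b] ^ v[c],  7 ); */
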
static inline __attribute__((always_inline)) void
round_fn16( wwu_t v[16],
            wwu_t m[16],
            ulong r ) {
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][0]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][2]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][4]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][6]]);
  v[0x0] = wwu_add(v[0x0], v[0x4]);
  v[0x1] = wwu_add(v[0x1], v[0x5]);
  v[0x2] = wwu_add(v[0x2], v[0x6]);
  v[0x3] = wwu_add(v[0x3], v[0x7]);
  v[0xc] = wwu_xor(v[0xc], v[0x0]);
  v[0xd] = wwu_xor(v[0xd], v[0x1]);
  v[0xe] = wwu_xor(v[0xe], v[0x2]);
  v[0xf] = wwu_xor(v[0xf], v[0x3]);
  v[0xc] = wwu_ror(v[0xc], 16);
  v[0xd] = wwu_ror(v[0xd], 16);
  v[0xe] = wwu_ror(v[0xe], 16);
  v[0xf] = wwu_ror(v[0xf], 16);
  v[0x8] = wwu_add(v[0x8], v[0xc]);
  v[0x9] = wwu_add(v[0x9], v[0xd]);
  v[0xa] = wwu_add(v[0xa], v[0xe]);
  v[0xb] = wwu_add(v[0xb], v[0xf]);
  v[0x4] = wwu_xor(v[0x4], v[0x8]);
  v[0x5] = wwu_xor(v[0x5], v[0x9]);
  v[0x6] = wwu_xor(v[0x6], v[0xa]);
  v[0x7] = wwu_xor(v[0x7], v[0xb]);
  v[0x4] = wwu_ror(v[0x4], 12);
  v[0x5] = wwu_ror(v[0x5], 12);
  v[0x6] = wwu_ror(v[0x6], 12);
  v[0x7] = wwu_ror(v[0x7], 12);
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][1]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][3]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][5]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][7]]);
  v[0x0] = wwu_add(v[0x0], v[0x4]);
  v[0x1] = wwu_add(v[0x1], v[0x5]);
  v[0x2] = wwu_add(v[0x2], v[0x6]);
  v[0x3] = wwu_add(v[0x3], v[0x7]);
  v[0xc] = wwu_xor(v[0xc], v[0x0]);
  v[0xd] = wwu_xor(v[0xd], v[0x1]);
  v[0xe] = wwu_xor(v[0xe], v[0x2]);
  v[0xf] = wwu_xor(v[0xf], v[0x3]);
  v[0xc] = wwu_ror(v[0xc], 8);
  v[0xd] = wwu_ror(v[0xd], 8);
  v[0xe] = wwu_ror(v[0xe], 8);
  v[0xf] = wwu_ror(v[0xf], 8);
  v[0x8] = wwu_add(v[0x8], v[0xc]);
  v[0x9] = wwu_add(v[0x9], v[0xd]);
  v[0xa] = wwu_add(v[0xa], v[0xe]);
  v[0xb] = wwu_add(v[0xb], v[0xf]);
  v[0x4] = wwu_xor(v[0x4], v[0x8]);
  v[0x5] = wwu_xor(v[0x5], v[0x9]);
  v[0x6] = wwu_xor(v[0x6], v[0xa]);
  v[0x7] = wwu_xor(v[0x7], v[0xb]);
  v[0x4] = wwu_ror(v[0x4], 7);
  v[0x5] = wwu_ror(v[0x5], 7);
  v[0x6] = wwu_ror(v[0x6], 7);
  v[0x7] = wwu_ror(v[0x7], 7);

  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][8]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][10]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][12]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][14]]);
  v[0x0] = wwu_add(v[0x0], v[0x5]);
  v[0x1] = wwu_add(v[0x1], v[0x6]);
  v[0x2] = wwu_add(v[0x2], v[0x7]);
  v[0x3] = wwu_add(v[0x3], v[0x4]);
  v[0xf] = wwu_xor(v[0xf], v[0x0]);
  v[0xc] = wwu_xor(v[0xc], v[0x1]);
  v[0xd] = wwu_xor(v[0xd], v[0x2]);
  v[0xe] = wwu_xor(v[0xe], v[0x3]);
  v[0xf] = wwu_ror(v[0xf], 16);
  v[0xc] = wwu_ror(v[0xc], 16);
  v[0xd] = wwu_ror(v[0xd], 16);
  v[0xe] = wwu_ror(v[0xe], 16);
  v[0xa] = wwu_add(v[0xa], v[0xf]);
  v[0xb] = wwu_add(v[0xb], v[0xc]);
  v[0x8] = wwu_add(v[0x8], v[0xd]);
  v[0x9] = wwu_add(v[0x9], v[0xe]);
  v[0x5] = wwu_xor(v[0x5], v[0xa]);
  v[0x6] = wwu_xor(v[0x6], v[0xb]);
  v[0x7] = wwu_xor(v[0x7], v[0x8]);
  v[0x4] = wwu_xor(v[0x4], v[0x9]);
  v[0x5] = wwu_ror(v[0x5], 12);
  v[0x6] = wwu_ror(v[0x6], 12);
  v[0x7] = wwu_ror(v[0x7], 12);
  v[0x4] = wwu_ror(v[0x4], 12);
  v[0x0] = wwu_add(v[0x0], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][9]]);
  v[0x1] = wwu_add(v[0x1], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][11]]);
  v[0x2] = wwu_add(v[0x2], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][13]]);
  v[0x3] = wwu_add(v[0x3], m[(size_t)FD_BLAKE3_MSG_SCHEDULE[r][15]]);
  v[0x0] = wwu_add(v[0x0], v[0x5]);
  v[0x1] = wwu_add(v[0x1], v[0x6]);
  v[0x2] = wwu_add(v[0x2], v[0x7]);
  v[0x3] = wwu_add(v[0x3], v[0x4]);
  v[0xf] = wwu_xor(v[0xf], v[0x0]);
  v[0xc] = wwu_xor(v[0xc], v[0x1]);
  v[0xd] = wwu_xor(v[0xd], v[0x2]);
  v[0xe] = wwu_xor(v[0xe], v[0x3]);
  v[0xf] = wwu_ror(v[0xf], 8);
  v[0xc] = wwu_ror(v[0xc], 8);
  v[0xd] = wwu_ror(v[0xd], 8);
  v[0xe] = wwu_ror(v[0xe], 8);
  v[0xa] = wwu_add(v[0xa], v[0xf]);
  v[0xb] = wwu_add(v[0xb], v[0xc]);
  v[0x8] = wwu_add(v[0x8], v[0xd]);
  v[0x9] = wwu_add(v[0x9], v[0xe]);
  v[0x5] = wwu_xor(v[0x5], v[0xa]);
  v[0x6] = wwu_xor(v[0x6], v[0xb]);
  v[0x7] = wwu_xor(v[0x7], v[0x8]);
  v[0x4] = wwu_xor(v[0x4], v[0x9]);
  v[0x5] = wwu_ror(v[0x5], 7);
  v[0x6] = wwu_ror(v[0x6], 7);
  v[0x7] = wwu_ror(v[0x7], 7);
  v[0x4] = wwu_ror(v[0x4], 7);
}

void
fd_blake3_avx512_compress16( ulong batch_cnt,
                             void const * restrict _batch_data,
                             uint const * restrict batch_sz,
                             ulong const * restrict ctr_vec,
                             uint const * restrict batch_flags,
                             void * const * restrict _batch_hash,
                             ushort * restrict lthash,
                             uint out_sz,
                             void const * restrict batch_cv ) {
  if( FD_UNLIKELY( lthash && batch_cnt!=16  ) ) FD_LOG_ERR(( "Lane masking not supported for fd_blake3_avx512_compress16 in LtHash mode" ));
  if( FD_UNLIKELY( batch_cnt==0 || batch_cnt>16 ) ) FD_LOG_ERR(( "Invalid batch_cnt %lu", batch_cnt ));

  /* We can only process input blocks of 64 bytes, but message data
     size is not necessarily a multiple of 64.  We compute the tail
     block of each message here.  We then process complete blocks of
     the original message in place, switching to processing these
     tail blocks in the same pass toward the end. */

  ulong const * batch_data = (ulong const *)_batch_data;

  if( FD_UNLIKELY( batch_cnt==1 ) ) {
    fd_blake3_sse_compress1( (uchar *)(_batch_hash[0]),
                             (uchar const *)(batch_data[0]),
                             batch_sz[0],
                             ctr_vec[0],
                             batch_flags[0],
                             NULL,
                             NULL );
    return;
  }

#if FD_BLAKE3_TRACING
  /* This log_line buffer is oversized by a fair bit (due to all the
     NULL terminators) but that's fine */
  char log_line[
      sizeof( "fd_blake3_avx512_compress16" )+
      sizeof( "(batch_cnt=" )+21+
      sizeof( ",sz=["      )+(16*11)+sizeof( "]" )+
      sizeof( ",counter=[" )+(16*21)+sizeof( "]" )+
      sizeof( ",flags=["   )+(16* 2)+sizeof( "]" )+
      sizeof( ",custom_cv" )+
      sizeof( ",lthash"    )+
      sizeof( ")" ) ];

  char * p = fd_cstr_init( log_line );
  p = fd_cstr_append_text( p, "fd_blake3_avx512_compress16(batch_cnt=", 38UL );
  p = fd_cstr_append_ulong_as_text( p, 0, 0, batch_cnt, fd_uchar_base10_dig_cnt( (uchar)batch_cnt ) );
  p = fd_cstr_append_text( p, ",sz=[", 5UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    p = fd_cstr_append_uint_as_text( p, ' ', 0, batch_sz[ i ], fd_uint_base10_dig_cnt( batch_sz[ i ] ) );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_text( p, "],counter=[", 11UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    p = fd_cstr_append_ulong_as_text( p, ' ', 0, ctr_vec[ i ], fd_ulong_base10_dig_cnt( ctr_vec[ i ] ) );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_text( p, "],flags=[", 9UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    static char const hex_lut[ 16 ] = {
      '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
    };
    p = fd_cstr_append_char( p, hex_lut[ batch_flags[ i ]&0xf ] );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_char( p, ']' );
  if( batch_cv ) p = fd_cstr_append_text( p, ",custom_cv", 10UL );
  if( lthash   ) p = fd_cstr_append_text( p, ",lthash",    7UL );
  p = fd_cstr_append_char( p, ')' );
  ulong line_len = (ulong)( p-log_line );
  fd_cstr_fini( p );

  FD_BLAKE3_TRACE(( "%.*s", (int)line_len, log_line ));
#endif

  ulong batch_tail_data[ 16 ] __attribute__((aligned(64)));
  ulong batch_tail_rem [ 16 ] __attribute__((aligned(64)));

  uchar scratch[ 16*FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(128)));
  do {
    ulong scratch_free = (ulong)scratch;

    wwv_t zero = wwv_zero();

    for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {

      /* Allocate the tail blocks for this message */

      ulong data = batch_data[ batch_idx ];
      ulong sz   = batch_sz  [ batch_idx ];

      ulong tail_data     = scratch_free;
      ulong tail_data_sz  = sz & (FD_BLAKE3_BLOCK_SZ-1UL);
      ulong tail_data_off = fd_ulong_align_dn( sz, FD_BLAKE3_BLOCK_SZ );

      batch_tail_data[ batch_idx ] = tail_data;
      batch_tail_rem [ batch_idx ] = (!!tail_data_sz) ^ (!sz); /* (hash 1 tail block if 0 sz) */

      scratch_free += FD_BLAKE3_BLOCK_SZ;

      /* Populate the tail blocks.  We first clear the blocks.  Then
         we copy any straggler data bytes into the tail. */

      wwv_st( (ulong *) tail_data, zero );

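      /* Copy the (at most 63) straggler bytes in descending
         power-of-two chunks instead of a variable-size memcpy.  Bytes
         past tail_data_sz were zeroed above, which provides the
         implicit zero padding of the final block. */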
# if 1
      /* See fd_sha256_private_batch_avx */
      ulong src = (ulong)data + tail_data_off;
      ulong dst = tail_data;
      ulong rem = tail_data_sz;
      while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
      while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src ); dst += 8UL; src += 8UL; rem -= 8UL; }
      if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src ); dst += 4UL; src += 4UL; rem -= 4UL; }
      if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src ); dst += 2UL; src += 2UL; rem -= 2UL; }
      if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src ); dst++; }
# else
      fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
# endif
    }
  } while(0);

  wwu_t const iv0 = wwu_bcast( FD_BLAKE3_IV[0] );
  wwu_t const iv1 = wwu_bcast( FD_BLAKE3_IV[1] );
  wwu_t const iv2 = wwu_bcast( FD_BLAKE3_IV[2] );
  wwu_t const iv3 = wwu_bcast( FD_BLAKE3_IV[3] );
  wwu_t const iv4 = wwu_bcast( FD_BLAKE3_IV[4] );
  wwu_t const iv5 = wwu_bcast( FD_BLAKE3_IV[5] );
  wwu_t const iv6 = wwu_bcast( FD_BLAKE3_IV[6] );
  wwu_t const iv7 = wwu_bcast( FD_BLAKE3_IV[7] );

  wwu_t h0=iv0; wwu_t h1=iv1; wwu_t h2=iv2; wwu_t h3=iv3;
  wwu_t h4=iv4; wwu_t h5=iv5; wwu_t h6=iv6; wwu_t h7=iv7;
  if( FD_UNLIKELY( batch_cv ) ) {
    /* If the input chaining value is overridden, transpose the input
       to AVX512 representation.  (wwu 16x8 transpose)  FIXME There's
       probably a way to do this using AVX512 instead of AVX. */
    __m256i const ** cv_vec = (__m256i const **)batch_cv;
    wu_t cv_lo[8]; wu_t cv_hi[8];
    cv_lo[ 0 ] = _mm256_loadu_si256( cv_vec[  0 ] );
    cv_lo[ 1 ] = _mm256_loadu_si256( cv_vec[  1 ] );
    cv_lo[ 2 ] = _mm256_loadu_si256( cv_vec[  2 ] );
    cv_lo[ 3 ] = _mm256_loadu_si256( cv_vec[  3 ] );
    cv_lo[ 4 ] = _mm256_loadu_si256( cv_vec[  4 ] );
    cv_lo[ 5 ] = _mm256_loadu_si256( cv_vec[  5 ] );
    cv_lo[ 6 ] = _mm256_loadu_si256( cv_vec[  6 ] );
    cv_lo[ 7 ] = _mm256_loadu_si256( cv_vec[  7 ] );
    cv_hi[ 0 ] = _mm256_loadu_si256( cv_vec[  8 ] );
    cv_hi[ 1 ] = _mm256_loadu_si256( cv_vec[  9 ] );
    cv_hi[ 2 ] = _mm256_loadu_si256( cv_vec[ 10 ] );
    cv_hi[ 3 ] = _mm256_loadu_si256( cv_vec[ 11 ] );
    cv_hi[ 4 ] = _mm256_loadu_si256( cv_vec[ 12 ] );
    cv_hi[ 5 ] = _mm256_loadu_si256( cv_vec[ 13 ] );
    cv_hi[ 6 ] = _mm256_loadu_si256( cv_vec[ 14 ] );
    cv_hi[ 7 ] = _mm256_loadu_si256( cv_vec[ 15 ] );
    wu_transpose_8x8( cv_lo[0], cv_lo[1], cv_lo[2], cv_lo[3], cv_lo[4], cv_lo[5], cv_lo[6], cv_lo[7],
                      cv_lo[0], cv_lo[1], cv_lo[2], cv_lo[3], cv_lo[4], cv_lo[5], cv_lo[6], cv_lo[7] );
    wu_transpose_8x8( cv_hi[0], cv_hi[1], cv_hi[2], cv_hi[3], cv_hi[4], cv_hi[5], cv_hi[6], cv_hi[7],
                      cv_hi[0], cv_hi[1], cv_hi[2], cv_hi[3], cv_hi[4], cv_hi[5], cv_hi[6], cv_hi[7] );
    h0 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 0 ] ), cv_hi[ 0 ], 1 );
    h1 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 1 ] ), cv_hi[ 1 ], 1 );
    h2 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 2 ] ), cv_hi[ 2 ], 1 );
    h3 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 3 ] ), cv_hi[ 3 ], 1 );
    h4 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 4 ] ), cv_hi[ 4 ], 1 );
    h5 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 5 ] ), cv_hi[ 5 ], 1 );
    h6 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 6 ] ), cv_hi[ 6 ], 1 );
    h7 = _mm512_inserti64x4( _mm512_castsi256_si512( cv_lo[ 7 ] ), cv_hi[ 7 ], 1 );
  }
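
  /* BLAKE3 carries a 64 bit block counter in two 32 bit state words,
     so split each lane's counter into lane-wise lo and hi vectors. */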
297 :
298 30966907 : wwu_t ctr_lo = wwu( ctr_vec[ 0], ctr_vec[ 1], ctr_vec[ 2], ctr_vec[ 3],
299 30966907 : ctr_vec[ 4], ctr_vec[ 5], ctr_vec[ 6], ctr_vec[ 7],
300 30966907 : ctr_vec[ 8], ctr_vec[ 9], ctr_vec[10], ctr_vec[11],
301 30966907 : ctr_vec[12], ctr_vec[13], ctr_vec[14], ctr_vec[15] );
302 30966907 : wwu_t ctr_hi = wwu( ctr_vec[ 0]>>32, ctr_vec[ 1]>>32, ctr_vec[ 2]>>32, ctr_vec[ 3]>>32,
303 30966907 : ctr_vec[ 4]>>32, ctr_vec[ 5]>>32, ctr_vec[ 6]>>32, ctr_vec[ 7]>>32,
304 30966907 : ctr_vec[ 8]>>32, ctr_vec[ 9]>>32, ctr_vec[10]>>32, ctr_vec[11]>>32,
305 30966907 : ctr_vec[12]>>32, ctr_vec[13]>>32, ctr_vec[14]>>32, ctr_vec[15]>>32 );
306 30966907 : wwu_t flags = wwu_ldu( batch_flags );
307 30966907 : wwu_t off = wwu_zero();
308 30966907 : wwu_t sz = wwu_ldu( batch_sz );
309 :
310 30966907 : wwv_t zero = wwv_zero();
311 30966907 : wwv_t one = wwv_one();
312 30966907 : wwu_t wwu_64 = wwu_bcast( FD_BLAKE3_BLOCK_SZ );
313 30966907 : wwv_t wwv_64 = wwv_bcast( FD_BLAKE3_BLOCK_SZ );
314 30966907 : wwv_t W_sentinel = wwv_bcast( (ulong)scratch );
315 : //wwc_t batch_lane = wc_unpack( (1<<batch_cnt)-1 );

  wwv_t tail_lo = wwv_ld( batch_tail_data   );
  wwv_t tail_hi = wwv_ld( batch_tail_data+8 );

  wwv_t tail_rem_lo = wwv_ld( batch_tail_rem   );
  wwv_t tail_rem_hi = wwv_ld( batch_tail_rem+8 );

  wwv_t W_lo = wwv_ld( batch_data   );
  wwv_t W_hi = wwv_ld( batch_data+8 );

  wwv_t batch_sz_lo = _mm512_cvtepi32_epi64( _mm512_extracti32x8_epi32( sz, 0 ) );
  wwv_t batch_sz_hi = _mm512_cvtepi32_epi64( _mm512_extracti32x8_epi32( sz, 1 ) );

  wwv_t block_rem_lo = wwv_if( ((1<<batch_cnt)-1) & 0xff,
                               wwv_add( wwv_shr( batch_sz_lo, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_lo ), zero );
  wwv_t block_rem_hi = wwv_if( ((1<<batch_cnt)-1) >> 8,
                               wwv_add( wwv_shr( batch_sz_hi, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_hi ), zero );
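
  /* block_rem counts the 64 byte blocks left per lane: floor(sz/64)
     full in-place blocks plus the out-of-place tail block if any.
     Lanes at or beyond batch_cnt are masked to a zero block count, so
     they start (and stay) inactive. */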

  /* Upper half of the compression function output.  Usually thrown
     away, but kept in the final compression round if out_sz==64. */
  wwu_t hu[8] = {0};

  ulong lthash_rem    = lthash ? 32 : 0; /* Number of LtHash (XOF) blocks remaining */
  int   compress_done = 0;
  for(;;) {

    /* Switch lanes that have hit the end of their in-place bulk
       processing to their out-of-place scratch tail regions as
       necessary. */

    W_lo = wwv_if( wwv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
    W_hi = wwv_if( wwv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );

    /* Derive per-block flags and block sizes */

    int block_first = wwu_eq( off, wwu_zero() );
    int block_last  = wwi_le( sz, wwu_add( off, wwu_bcast( FD_BLAKE3_BLOCK_SZ ) ) );

    /* Suppress root flag unless last block */

    wwu_t root_mask   = wwu_if( block_last, wwu_bcast( UINT_MAX ), wwu_bcast( ~FD_BLAKE3_FLAG_ROOT ) );
    wwu_t block_flags = wwu_and( flags, root_mask );

    /* Mask lanes that completed */

    int active_lane_lo;
    int active_lane_hi;
    if( FD_UNLIKELY( lthash ) ) {
      /* Compress until root block */
      wwu_t all_root = wwu_bcast( FD_BLAKE3_FLAG_ROOT );
      int   not_root = wwu_ne( wwu_and( block_flags, all_root ), all_root );
      active_lane_lo = (int)(__mmask8)not_root;
      active_lane_hi = (int)(__mmask8)(not_root>>8);
    } else {
      /* Complete when there is no more input data */
      active_lane_lo = wwv_ne( block_rem_lo, zero );
      active_lane_hi = wwv_ne( block_rem_hi, zero );
    }

    /* Suppress CHUNK_{START,END} flags unless leaf node */

    int   is_parent   = wwu_ne( wwu_and( flags, wwu_bcast( FD_BLAKE3_FLAG_PARENT ) ), wwu_zero() );
    wwu_t chunk_flags = wwu_if( block_last, wwu_bcast( FD_BLAKE3_FLAG_CHUNK_END ), wwu_zero() );
    if( out_sz==32 ) {
      /* Hacky: out_sz==64 is only used for post-compress XOF hashing,
         so use that as a hint for when to suppress the 'CHUNK_START'
         flag. */
      chunk_flags = wwu_or( chunk_flags, wwu_if( block_first, wwu_bcast( FD_BLAKE3_FLAG_CHUNK_START ), wwu_zero() ) );
    }
    wwu_t block_sz = wwu_min( wwu_sub( sz, off ), wwu_64 );
    block_flags = wwu_or( block_flags, wwu_if( is_parent, wwu_zero(), chunk_flags ) );

    /* Check if we are done compressing */

    compress_done |= !(active_lane_lo | active_lane_hi);
    if( FD_UNLIKELY( compress_done ) ) {
      if( FD_UNLIKELY( !lthash_rem ) ) break;
      active_lane_lo = 0xff;
      active_lane_hi = 0xff;
      /* Load the next message block and fall through to XOF expansion */
    }

    /* At this point, we have at least 1 block in this message segment
       pass that has not been processed.  Load the next 64 bytes of
       each unprocessed block.  Inactive lanes (e.g. message segments
       in this pass for which we've already processed all the blocks)
       will load garbage from a sentinel location (and the result of
       the state computations for the inactive lane will be ignored). */

    ulong _W0; ulong _W1; ulong _W2; ulong _W3; ulong _W4; ulong _W5; ulong _W6; ulong _W7;
    ulong _W8; ulong _W9; ulong _Wa; ulong _Wb; ulong _Wc; ulong _Wd; ulong _We; ulong _Wf;
    wwv_unpack( wwv_if( active_lane_lo, W_lo, W_sentinel ), _W0, _W1, _W2, _W3, _W4, _W5, _W6, _W7 );
    wwv_unpack( wwv_if( active_lane_hi, W_hi, W_sentinel ), _W8, _W9, _Wa, _Wb, _Wc, _Wd, _We, _Wf );
    uchar const * W0 = (uchar const *)_W0; uchar const * W1 = (uchar const *)_W1;
    uchar const * W2 = (uchar const *)_W2; uchar const * W3 = (uchar const *)_W3;
    uchar const * W4 = (uchar const *)_W4; uchar const * W5 = (uchar const *)_W5;
    uchar const * W6 = (uchar const *)_W6; uchar const * W7 = (uchar const *)_W7;
    uchar const * W8 = (uchar const *)_W8; uchar const * W9 = (uchar const *)_W9;
    uchar const * Wa = (uchar const *)_Wa; uchar const * Wb = (uchar const *)_Wb;
    uchar const * Wc = (uchar const *)_Wc; uchar const * Wd = (uchar const *)_Wd;
    uchar const * We = (uchar const *)_We; uchar const * Wf = (uchar const *)_Wf;

    wwu_t m[16];
    m[0x0] = wwu_ldu( W0 ); m[0x1] = wwu_ldu( W1 );
    m[0x2] = wwu_ldu( W2 ); m[0x3] = wwu_ldu( W3 );
    m[0x4] = wwu_ldu( W4 ); m[0x5] = wwu_ldu( W5 );
    m[0x6] = wwu_ldu( W6 ); m[0x7] = wwu_ldu( W7 );
    m[0x8] = wwu_ldu( W8 ); m[0x9] = wwu_ldu( W9 );
    m[0xa] = wwu_ldu( Wa ); m[0xb] = wwu_ldu( Wb );
    m[0xc] = wwu_ldu( Wc ); m[0xd] = wwu_ldu( Wd );
    m[0xe] = wwu_ldu( We ); m[0xf] = wwu_ldu( Wf );

    wwu_transpose_16x16( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
                         m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );
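
    /* After the transpose, m[i] holds 32 bit message word i of the
       current block from each of the 16 lanes (the structure-of-arrays
       form expected by round_fn16). */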

    /* Compute the BLAKE3 compression function updates */

    compress: (void)0;
    wwu_t v[16] = {
      h0,     h1,     h2,       h3,
      h4,     h5,     h6,       h7,
      iv0,    iv1,    iv2,      iv3,
      ctr_lo, ctr_hi, block_sz, block_flags,
    };
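    /* Rows 0-7 seed the state with the chaining value, rows 8-11 with
       the first four IV words, and rows 12-15 with the counter
       (lo,hi), block size, and flags, per the BLAKE3 compression
       function input layout. */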

    /* Debug utility */
#define STATE_FMT "[%u] =\n %08x %08x %08x %08x\n %08x %08x %08x %08x\n %08x %08x %08x %08x\n %08x %08x %08x %08x"
#define STATE_FMT_ARGS(v,i) (uint)i,\
    fd_uint_bswap(wwu_extract(v[0x0],i)),fd_uint_bswap(wwu_extract(v[0x1],i)),fd_uint_bswap(wwu_extract(v[0x2],i)),fd_uint_bswap(wwu_extract(v[0x3],i)),\
    fd_uint_bswap(wwu_extract(v[0x4],i)),fd_uint_bswap(wwu_extract(v[0x5],i)),fd_uint_bswap(wwu_extract(v[0x6],i)),fd_uint_bswap(wwu_extract(v[0x7],i)),\
    fd_uint_bswap(wwu_extract(v[0x8],i)),fd_uint_bswap(wwu_extract(v[0x9],i)),fd_uint_bswap(wwu_extract(v[0xa],i)),fd_uint_bswap(wwu_extract(v[0xb],i)),\
    fd_uint_bswap(wwu_extract(v[0xc],i)),fd_uint_bswap(wwu_extract(v[0xd],i)),fd_uint_bswap(wwu_extract(v[0xe],i)),fd_uint_bswap(wwu_extract(v[0xf],i))

    // FD_LOG_NOTICE(( "pre " STATE_FMT, STATE_FMT_ARGS(v,0) ));
    round_fn16( v, m, 0 );
    round_fn16( v, m, 1 );
    round_fn16( v, m, 2 );
    round_fn16( v, m, 3 );
    round_fn16( v, m, 4 );
    round_fn16( v, m, 5 );
    round_fn16( v, m, 6 );
    // FD_LOG_NOTICE(( "post" STATE_FMT, STATE_FMT_ARGS(v,0) ));

    if( FD_LIKELY( !compress_done ) ) {

      /* Apply the state updates to the active lanes */

      int active_lane = active_lane_lo | (active_lane_hi<<8);
      FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: compress lanes %02x%02x", active_lane_hi, active_lane_lo ));
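
      /* Feed-forward: the new chaining value is v[i]^v[i+8] for
         i=0..7.  When a 64 byte output is requested, words 8..15 of
         the BLAKE3 output are v[i+8]^h_in[i], captured in hu below. */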

      if( FD_UNLIKELY( out_sz==64 ) ) {
        /* FIXME only export in the last iteration */
        hu[0] = wwu_xor_if( active_lane, h0, v[ 8], hu[0] );
        hu[1] = wwu_xor_if( active_lane, h1, v[ 9], hu[1] );
        hu[2] = wwu_xor_if( active_lane, h2, v[10], hu[2] );
        hu[3] = wwu_xor_if( active_lane, h3, v[11], hu[3] );
        hu[4] = wwu_xor_if( active_lane, h4, v[12], hu[4] );
        hu[5] = wwu_xor_if( active_lane, h5, v[13], hu[5] );
        hu[6] = wwu_xor_if( active_lane, h6, v[14], hu[6] );
        hu[7] = wwu_xor_if( active_lane, h7, v[15], hu[7] );
      }
      h0 = wwu_xor_if( active_lane, v[ 0], v[ 8], h0 );
      h1 = wwu_xor_if( active_lane, v[ 1], v[ 9], h1 );
      h2 = wwu_xor_if( active_lane, v[ 2], v[10], h2 );
      h3 = wwu_xor_if( active_lane, v[ 3], v[11], h3 );
      h4 = wwu_xor_if( active_lane, v[ 4], v[12], h4 );
      h5 = wwu_xor_if( active_lane, v[ 5], v[13], h5 );
      h6 = wwu_xor_if( active_lane, v[ 6], v[14], h6 );
      h7 = wwu_xor_if( active_lane, v[ 7], v[15], h7 );

      /* Advance to the next message segment blocks.  In pseudo code,
         the below is:

           W += 64; if( block_rem ) block_rem--;

         Since we do not load anything at W(lane) above unless
         block_rem(lane) is non-zero, we can omit vector conditional
         operations for W(lane) below. */

      W_lo = wwv_add_if( active_lane_lo, W_lo, wwv_64, W_lo );
      W_hi = wwv_add_if( active_lane_hi, W_hi, wwv_64, W_hi );
      off  = wwu_add_if( active_lane,    off,  wwu_64, off  );

      block_rem_lo = wwv_sub_if( active_lane_lo, block_rem_lo, one, block_rem_lo );
      block_rem_hi = wwv_sub_if( active_lane_hi, block_rem_hi, one, block_rem_hi );

    } else { /* LtHash mode */

      /* d[i] contains output_off+(i*4) 32-bit words across output[0..8] */
      FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: expand lanes" ));
      wwu_t d[ 16 ] = {
        wwu_xor( v[0x0], v[0x8] ),
        wwu_xor( v[0x1], v[0x9] ),
        wwu_xor( v[0x2], v[0xa] ),
        wwu_xor( v[0x3], v[0xb] ),
        wwu_xor( v[0x4], v[0xc] ),
        wwu_xor( v[0x5], v[0xd] ),
        wwu_xor( v[0x6], v[0xe] ),
        wwu_xor( v[0x7], v[0xf] ),
        wwu_xor( h0,     v[0x8] ),
        wwu_xor( h1,     v[0x9] ),
        wwu_xor( h2,     v[0xa] ),
        wwu_xor( h3,     v[0xb] ),
        wwu_xor( h4,     v[0xc] ),
        wwu_xor( h5,     v[0xd] ),
        wwu_xor( h6,     v[0xe] ),
        wwu_xor( h7,     v[0xf] )
      };

      /* Transpose each 8x8 block */
      wwu_transpose_16x16( d[0x0], d[0x1], d[0x2], d[0x3], d[0x4], d[0x5], d[0x6], d[0x7],
                           d[0x8], d[0x9], d[0xa], d[0xb], d[0xc], d[0xd], d[0xe], d[0xf],
                           d[0x0], d[0x1], d[0x2], d[0x3], d[0x4], d[0x5], d[0x6], d[0x7],
                           d[0x8], d[0x9], d[0xa], d[0xb], d[0xc], d[0xd], d[0xe], d[0xf] );

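      /* LtHash accumulates unsigned 16 bit lanes modulo 2^16, so the
         sixteen lanes' 64 byte XOF blocks can be folded together with
         plain lane-wise 16 bit adds (wwh_add) in any order. */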
      /* Reduce-add into d[0] */
      d[0x0] = wwh_add( d[0x0], d[0x1] ); /* sum(l[0 1]) */
      d[0x2] = wwh_add( d[0x2], d[0x3] ); /* sum(l[2 3]) */
      d[0x4] = wwh_add( d[0x4], d[0x5] ); /* sum(l[4 5]) */
      d[0x6] = wwh_add( d[0x6], d[0x7] ); /* sum(l[6 7]) */
      d[0x8] = wwh_add( d[0x8], d[0x9] ); /* sum(l[8 9]) */
      d[0xa] = wwh_add( d[0xa], d[0xb] ); /* sum(l[a b]) */
      d[0xc] = wwh_add( d[0xc], d[0xd] ); /* sum(l[c d]) */
      d[0xe] = wwh_add( d[0xe], d[0xf] ); /* sum(l[e f]) */
      d[0x0] = wwh_add( d[0x0], d[0x2] ); /* sum(l[0 1 2 3]) */
      d[0x4] = wwh_add( d[0x4], d[0x6] ); /* sum(l[4 5 6 7]) */
      d[0x8] = wwh_add( d[0x8], d[0xa] ); /* sum(l[8 9 a b]) */
      d[0xc] = wwh_add( d[0xc], d[0xe] ); /* sum(l[c d e f]) */
      d[0x0] = wwh_add( d[0x0], d[0x4] ); /* sum(l[0 1 2 3 4 5 6 7]) */
      d[0x8] = wwh_add( d[0x8], d[0xc] ); /* sum(l[8 9 a b c d e f]) */
      d[0x0] = wwh_add( d[0x0], d[0x8] ); /* sum(l[0 1 2 3 4 5 6 7 8 9 a b c d e f]) */
      wwh_st( lthash, d[0x0] );

      /* Wind up for next iteration */
      lthash += 32; /* 64 byte stride */
      lthash_rem--;
      wwu_t ctr_add = wwu_bcast( 1 );
      /**/  ctr_lo  = wwu_add( ctr_lo, ctr_add );
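      /* Detect carry out of the low 32 counter bits: the add wrapped
         iff ctr_lo < ctr_add (unsigned), evaluated here as a signed
         compare with both sides biased by 2^31. */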
      int   ctr_carry = wwi_gt ( wwu_xor( ctr_add, wwu_bcast( 0x80000000 ) ),
                                 wwu_xor( ctr_lo,  wwu_bcast( 0x80000000 ) ) );
      /**/  ctr_hi    = wwu_add_if( ctr_carry, ctr_hi, wwu_one(), ctr_hi );
      if( FD_UNLIKELY( !lthash_rem ) ) {
        FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done (lthash para)" ));
        return;
      }
      goto compress;

# undef STATE_FMT
# undef STATE_FMT_ARGS
    }
  }

  /* Store the results */

  wwu_t o0; wwu_t o1; wwu_t o2; wwu_t o3; wwu_t o4; wwu_t o5; wwu_t o6; wwu_t o7;
  wwu_t o8; wwu_t o9; wwu_t oA; wwu_t oB; wwu_t oC; wwu_t oD; wwu_t oE; wwu_t oF;

  wwu_transpose_16x16( h0,    h1,    h2,    h3,    h4,    h5,    h6,    h7,
                       hu[0], hu[1], hu[2], hu[3], hu[4], hu[5], hu[6], hu[7],
                       o0,    o1,    o2,    o3,    o4,    o5,    o6,    o7,
                       o8,    o9,    oA,    oB,    oC,    oD,    oE,    oF );
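
  /* oN now holds lane N's result words contiguously: the 32 byte
     chaining value, followed by the upper 32 bytes of output when
     out_sz==64 (hu is zero otherwise). */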
  uint * const * batch_hash = (uint * const *)_batch_hash;
  if( FD_LIKELY( out_sz==32 ) ) {
    switch( batch_cnt ) { /* application dependent prob */
    case 16UL: wu_stu( batch_hash[15], _mm512_castsi512_si256( oF ) ); __attribute__((fallthrough));
    case 15UL: wu_stu( batch_hash[14], _mm512_castsi512_si256( oE ) ); __attribute__((fallthrough));
    case 14UL: wu_stu( batch_hash[13], _mm512_castsi512_si256( oD ) ); __attribute__((fallthrough));
    case 13UL: wu_stu( batch_hash[12], _mm512_castsi512_si256( oC ) ); __attribute__((fallthrough));
    case 12UL: wu_stu( batch_hash[11], _mm512_castsi512_si256( oB ) ); __attribute__((fallthrough));
    case 11UL: wu_stu( batch_hash[10], _mm512_castsi512_si256( oA ) ); __attribute__((fallthrough));
    case 10UL: wu_stu( batch_hash[ 9], _mm512_castsi512_si256( o9 ) ); __attribute__((fallthrough));
    case  9UL: wu_stu( batch_hash[ 8], _mm512_castsi512_si256( o8 ) ); __attribute__((fallthrough));
    case  8UL: wu_stu( batch_hash[ 7], _mm512_castsi512_si256( o7 ) ); __attribute__((fallthrough));
    case  7UL: wu_stu( batch_hash[ 6], _mm512_castsi512_si256( o6 ) ); __attribute__((fallthrough));
    case  6UL: wu_stu( batch_hash[ 5], _mm512_castsi512_si256( o5 ) ); __attribute__((fallthrough));
    case  5UL: wu_stu( batch_hash[ 4], _mm512_castsi512_si256( o4 ) ); __attribute__((fallthrough));
    case  4UL: wu_stu( batch_hash[ 3], _mm512_castsi512_si256( o3 ) ); __attribute__((fallthrough));
    case  3UL: wu_stu( batch_hash[ 2], _mm512_castsi512_si256( o2 ) ); __attribute__((fallthrough));
    case  2UL: wu_stu( batch_hash[ 1], _mm512_castsi512_si256( o1 ) ); __attribute__((fallthrough));
    case  1UL: wu_stu( batch_hash[ 0], _mm512_castsi512_si256( o0 ) ); __attribute__((fallthrough));
    default: break;
    }
    FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done" ));
  } else if( out_sz==64 ) {
    switch( batch_cnt ) { /* application dependent prob */
    case 16UL: wwu_stu( batch_hash[15], oF ); __attribute__((fallthrough));
    case 15UL: wwu_stu( batch_hash[14], oE ); __attribute__((fallthrough));
    case 14UL: wwu_stu( batch_hash[13], oD ); __attribute__((fallthrough));
    case 13UL: wwu_stu( batch_hash[12], oC ); __attribute__((fallthrough));
    case 12UL: wwu_stu( batch_hash[11], oB ); __attribute__((fallthrough));
    case 11UL: wwu_stu( batch_hash[10], oA ); __attribute__((fallthrough));
    case 10UL: wwu_stu( batch_hash[ 9], o9 ); __attribute__((fallthrough));
    case  9UL: wwu_stu( batch_hash[ 8], o8 ); __attribute__((fallthrough));
    case  8UL: wwu_stu( batch_hash[ 7], o7 ); __attribute__((fallthrough));
    case  7UL: wwu_stu( batch_hash[ 6], o6 ); __attribute__((fallthrough));
    case  6UL: wwu_stu( batch_hash[ 5], o5 ); __attribute__((fallthrough));
    case  5UL: wwu_stu( batch_hash[ 4], o4 ); __attribute__((fallthrough));
    case  4UL: wwu_stu( batch_hash[ 3], o3 ); __attribute__((fallthrough));
    case  3UL: wwu_stu( batch_hash[ 2], o2 ); __attribute__((fallthrough));
    case  2UL: wwu_stu( batch_hash[ 1], o1 ); __attribute__((fallthrough));
    case  1UL: wwu_stu( batch_hash[ 0], o0 ); __attribute__((fallthrough));
    default: break;
    }
    FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16: done (out_sz=64)" ));
  } else {
    FD_LOG_ERR(( "Invalid out_sz %u", out_sz ));
  }
}

void
fd_blake3_avx512_compress16_fast( uchar const * restrict msg,
                                  uchar *       restrict out,
                                  ulong                  counter,
                                  uchar                  flags ) {
  FD_BLAKE3_TRACE(( "fd_blake3_avx512_compress16_fast(msg=%p,out=%p,counter=%lu,flags=%02x)", (void *)msg, (void *)out, counter, flags ));

  int   parent = flags & FD_BLAKE3_FLAG_PARENT;
  int   lg_sz  = fd_int_if( parent, FD_BLAKE3_OUTCHAIN_LG_SZ+1, FD_BLAKE3_CHUNK_LG_SZ );
  ulong sz     = 1UL<<lg_sz;
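
  /* The fast path hashes 16 equal-size inputs laid out contiguously
     at stride sz: 16 whole 1024 byte chunks at the leaf level, or 16
     parent blocks each holding two 32 byte child chaining values. */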

  /* Counters stay the same for each block.  Across chunks, they
     increment if we are hashing leaves.  Otherwise, they are zero. */

  wwu_t ctr_add = wwu_and( wwu_bcast( parent ? 0 : UINT_MAX ),
                           wwu( 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
                                0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf ) );
  wwu_t ctr_lo  = wwu_add( wwu_bcast( counter ), ctr_add );
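  /* Detect carry out of the low 32 counter bits: the add wrapped iff
     ctr_lo < ctr_add (unsigned), evaluated here as a signed compare
     with both sides biased by 2^31. */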
  int   ctr_carry = wwi_gt ( wwu_xor( ctr_add, wwu_bcast( 0x80000000 ) ),
                             wwu_xor( ctr_lo,  wwu_bcast( 0x80000000 ) ) );
  wwu_t ctr_hi    = wwu_add_if( ctr_carry, wwu_bcast( counter>>32 ), wwu_one(), wwu_bcast( counter>>32 ) );
  wwu_t sz_vec    = wwu_bcast( FD_BLAKE3_BLOCK_SZ );

  wwu_t const iv0 = wwu_bcast( FD_BLAKE3_IV[0] );
  wwu_t const iv1 = wwu_bcast( FD_BLAKE3_IV[1] );
  wwu_t const iv2 = wwu_bcast( FD_BLAKE3_IV[2] );
  wwu_t const iv3 = wwu_bcast( FD_BLAKE3_IV[3] );
  wwu_t const iv4 = wwu_bcast( FD_BLAKE3_IV[4] );
  wwu_t const iv5 = wwu_bcast( FD_BLAKE3_IV[5] );
  wwu_t const iv6 = wwu_bcast( FD_BLAKE3_IV[6] );
  wwu_t const iv7 = wwu_bcast( FD_BLAKE3_IV[7] );

  wwu_t h0=iv0; wwu_t h1=iv1; wwu_t h2=iv2; wwu_t h3=iv3;
  wwu_t h4=iv4; wwu_t h5=iv5; wwu_t h6=iv6; wwu_t h7=iv7;

  ulong off = 0UL;
  do {
    ulong const off_next = off+FD_BLAKE3_BLOCK_SZ;
    int chunk_flags =
        ( off     ==0UL ? FD_BLAKE3_FLAG_CHUNK_START : 0 ) |
        ( off_next==sz  ? FD_BLAKE3_FLAG_CHUNK_END   : 0 );
    int flags_ = flags | fd_int_if( parent, 0, chunk_flags );
    wwu_t flags_vec = wwu_bcast( flags_ );

    wwu_t m[16];
    m[0x0] = wwu_ldu( msg + (0x0<<lg_sz) + off );
    m[0x1] = wwu_ldu( msg + (0x1<<lg_sz) + off );
    m[0x2] = wwu_ldu( msg + (0x2<<lg_sz) + off );
    m[0x3] = wwu_ldu( msg + (0x3<<lg_sz) + off );
    m[0x4] = wwu_ldu( msg + (0x4<<lg_sz) + off );
    m[0x5] = wwu_ldu( msg + (0x5<<lg_sz) + off );
    m[0x6] = wwu_ldu( msg + (0x6<<lg_sz) + off );
    m[0x7] = wwu_ldu( msg + (0x7<<lg_sz) + off );
    m[0x8] = wwu_ldu( msg + (0x8<<lg_sz) + off );
    m[0x9] = wwu_ldu( msg + (0x9<<lg_sz) + off );
    m[0xa] = wwu_ldu( msg + (0xa<<lg_sz) + off );
    m[0xb] = wwu_ldu( msg + (0xb<<lg_sz) + off );
    m[0xc] = wwu_ldu( msg + (0xc<<lg_sz) + off );
    m[0xd] = wwu_ldu( msg + (0xd<<lg_sz) + off );
    m[0xe] = wwu_ldu( msg + (0xe<<lg_sz) + off );
    m[0xf] = wwu_ldu( msg + (0xf<<lg_sz) + off );

    wwu_transpose_16x16( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
                         m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                         m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );

    wwu_t v[16] = {
      h0,     h1,     h2,     h3,
      h4,     h5,     h6,     h7,
      iv0,    iv1,    iv2,    iv3,
      ctr_lo, ctr_hi, sz_vec, flags_vec,
    };

    round_fn16( v, m, 0 );
    round_fn16( v, m, 1 );
    round_fn16( v, m, 2 );
    round_fn16( v, m, 3 );
    round_fn16( v, m, 4 );
    round_fn16( v, m, 5 );
    round_fn16( v, m, 6 );

    h0 = wwu_xor( v[ 0], v[ 8] );
    h1 = wwu_xor( v[ 1], v[ 9] );
    h2 = wwu_xor( v[ 2], v[10] );
    h3 = wwu_xor( v[ 3], v[11] );
    h4 = wwu_xor( v[ 4], v[12] );
    h5 = wwu_xor( v[ 5], v[13] );
    h6 = wwu_xor( v[ 6], v[14] );
    h7 = wwu_xor( v[ 7], v[15] );

    off = off_next;
  } while( off!=sz );

  wwu_t o0; wwu_t o1; wwu_t o2; wwu_t o3; wwu_t o4; wwu_t o5; wwu_t o6; wwu_t o7;
  wwu_t o8; wwu_t o9; wwu_t oA; wwu_t oB; wwu_t oC; wwu_t oD; wwu_t oE; wwu_t oF;

  wwu_t zero = wwu_zero();
  wwu_transpose_16x16( h0,   h1,   h2,   h3,   h4,   h5,   h6,   h7,
                       zero, zero, zero, zero, zero, zero, zero, zero,
                       o0,   o1,   o2,   o3,   o4,   o5,   o6,   o7,
                       o8,   o9,   oA,   oB,   oC,   oD,   oE,   oF );
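
  /* Store each lane's 32 byte chaining value at a 32 byte stride
     (1<<FD_BLAKE3_OUTCHAIN_LG_SZ); only the low 256 bits of each oN
     carry data since the upper transpose inputs were zero. */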
729 :
730 1987987 : wb_st( out + (0x0UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o0 ) );
731 1987987 : wb_st( out + (0x1UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o1 ) );
732 1987987 : wb_st( out + (0x2UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o2 ) );
733 1987987 : wb_st( out + (0x3UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o3 ) );
734 1987987 : wb_st( out + (0x4UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o4 ) );
735 1987987 : wb_st( out + (0x5UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o5 ) );
736 1987987 : wb_st( out + (0x6UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o6 ) );
737 1987987 : wb_st( out + (0x7UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o7 ) );
738 1987987 : wb_st( out + (0x8UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o8 ) );
739 1987987 : wb_st( out + (0x9UL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( o9 ) );
740 1987987 : wb_st( out + (0xaUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oA ) );
741 1987987 : wb_st( out + (0xbUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oB ) );
742 1987987 : wb_st( out + (0xcUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oC ) );
743 1987987 : wb_st( out + (0xdUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oD ) );
744 1987987 : wb_st( out + (0xeUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oE ) );
745 1987987 : wb_st( out + (0xfUL<<FD_BLAKE3_OUTCHAIN_LG_SZ), _mm512_castsi512_si256( oF ) );
746 1987987 : }
|