// Source originally from https://github.com/BLAKE3-team/BLAKE3
// From commit: 64747d48ffe9d1fbf4b71e94cabeb8a211461081

#include "fd_blake3.h"
#include "fd_blake3_private.h"
#include "../../util/simd/fd_avx.h"
#include <assert.h>

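/* The BLAKE3 G function rotates 32-bit words right by 16, 12, 8 and
   7.  The byte-aligned rotates (16 and 8) can each be done with a
   single byte shuffle, which is typically cheaper than the
   shift/shift/or sequence a generic rotate expands to:
   wb_exch_adj_pair swaps the 16-bit halves of every 32-bit lane
   (exactly a rotate by 16), and wu_rot8 shuffles each lane's bytes
   one position down. */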
#define wu_rot16 wb_exch_adj_pair

static inline __attribute__((always_inline)) wu_t
wu_rot12( wu_t x ) {
  return wu_ror( x, 12 );
}

static inline __attribute__((always_inline)) wu_t
wu_rot8( wu_t x ) {
  wb_t const mask =
    wb( 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12,
        1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 );
  return _mm256_shuffle_epi8( x, mask );
}

static inline __attribute__((always_inline)) wu_t
wu_rot7( wu_t x ) {
  return wu_ror( x, 7 );
}

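/* round_fn8 applies one full BLAKE3 round to 8 compression states
   held in struct-of-arrays form: lane k of v[i] is state word i of
   message k, and m[] holds the 8 transposed message blocks.  The
   first half below is the column step and the second half the
   diagonal step; each quarter-round is the standard BLAKE3 G
   function,

     a += b + mx;  d = ror32(d ^ a, 16);
     c += d;       b = ror32(b ^ c, 12);
     a += b + my;  d = ror32(d ^ a,  8);
     c += d;       b = ror32(b ^ c,  7);

   with message words picked by FD_BLAKE3_MSG_SCHEDULE[r]. */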
static inline __attribute__((always_inline)) void
round_fn8( wu_t v[16],
           wu_t m[16],
           ulong r ) {
  v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][0]]);
  v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][2]]);
  v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][4]]);
  v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][6]]);
  v[ 0] = wu_add(v[0], v[4]);
  v[ 1] = wu_add(v[1], v[5]);
  v[ 2] = wu_add(v[2], v[6]);
  v[ 3] = wu_add(v[3], v[7]);
  v[12] = wu_xor(v[12], v[0]);
  v[13] = wu_xor(v[13], v[1]);
  v[14] = wu_xor(v[14], v[2]);
  v[15] = wu_xor(v[15], v[3]);
  v[12] = wu_rot16(v[12]);
  v[13] = wu_rot16(v[13]);
  v[14] = wu_rot16(v[14]);
  v[15] = wu_rot16(v[15]);
  v[ 8] = wu_add(v[8], v[12]);
  v[ 9] = wu_add(v[9], v[13]);
  v[10] = wu_add(v[10], v[14]);
  v[11] = wu_add(v[11], v[15]);
  v[ 4] = wu_xor(v[4], v[8]);
  v[ 5] = wu_xor(v[5], v[9]);
  v[ 6] = wu_xor(v[6], v[10]);
  v[ 7] = wu_xor(v[7], v[11]);
  v[ 4] = wu_rot12(v[4]);
  v[ 5] = wu_rot12(v[5]);
  v[ 6] = wu_rot12(v[6]);
  v[ 7] = wu_rot12(v[7]);
  v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][1]]);
  v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][3]]);
  v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][5]]);
  v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][7]]);
  v[ 0] = wu_add(v[0], v[4]);
  v[ 1] = wu_add(v[1], v[5]);
  v[ 2] = wu_add(v[2], v[6]);
  v[ 3] = wu_add(v[3], v[7]);
  v[12] = wu_xor(v[12], v[0]);
  v[13] = wu_xor(v[13], v[1]);
  v[14] = wu_xor(v[14], v[2]);
  v[15] = wu_xor(v[15], v[3]);
  v[12] = wu_rot8(v[12]);
  v[13] = wu_rot8(v[13]);
  v[14] = wu_rot8(v[14]);
  v[15] = wu_rot8(v[15]);
  v[ 8] = wu_add(v[8], v[12]);
  v[ 9] = wu_add(v[9], v[13]);
  v[10] = wu_add(v[10], v[14]);
  v[11] = wu_add(v[11], v[15]);
  v[ 4] = wu_xor(v[4], v[8]);
  v[ 5] = wu_xor(v[5], v[9]);
  v[ 6] = wu_xor(v[6], v[10]);
  v[ 7] = wu_xor(v[7], v[11]);
  v[ 4] = wu_rot7(v[4]);
  v[ 5] = wu_rot7(v[5]);
  v[ 6] = wu_rot7(v[6]);
  v[ 7] = wu_rot7(v[7]);

  v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][8]]);
  v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][10]]);
  v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][12]]);
  v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][14]]);
  v[ 0] = wu_add(v[0], v[5]);
  v[ 1] = wu_add(v[1], v[6]);
  v[ 2] = wu_add(v[2], v[7]);
  v[ 3] = wu_add(v[3], v[4]);
  v[15] = wu_xor(v[15], v[0]);
  v[12] = wu_xor(v[12], v[1]);
  v[13] = wu_xor(v[13], v[2]);
  v[14] = wu_xor(v[14], v[3]);
  v[15] = wu_rot16(v[15]);
  v[12] = wu_rot16(v[12]);
  v[13] = wu_rot16(v[13]);
  v[14] = wu_rot16(v[14]);
  v[10] = wu_add(v[10], v[15]);
  v[11] = wu_add(v[11], v[12]);
  v[ 8] = wu_add(v[8], v[13]);
  v[ 9] = wu_add(v[9], v[14]);
  v[ 5] = wu_xor(v[5], v[10]);
  v[ 6] = wu_xor(v[6], v[11]);
  v[ 7] = wu_xor(v[7], v[8]);
  v[ 4] = wu_xor(v[4], v[9]);
  v[ 5] = wu_rot12(v[5]);
  v[ 6] = wu_rot12(v[6]);
  v[ 7] = wu_rot12(v[7]);
  v[ 4] = wu_rot12(v[4]);
  v[ 0] = wu_add(v[0], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][9]]);
  v[ 1] = wu_add(v[1], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][11]]);
  v[ 2] = wu_add(v[2], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][13]]);
  v[ 3] = wu_add(v[3], m[(ulong)FD_BLAKE3_MSG_SCHEDULE[r][15]]);
  v[ 0] = wu_add(v[0], v[5]);
  v[ 1] = wu_add(v[1], v[6]);
  v[ 2] = wu_add(v[2], v[7]);
  v[ 3] = wu_add(v[3], v[4]);
  v[15] = wu_xor(v[15], v[0]);
  v[12] = wu_xor(v[12], v[1]);
  v[13] = wu_xor(v[13], v[2]);
  v[14] = wu_xor(v[14], v[3]);
  v[15] = wu_rot8(v[15]);
  v[12] = wu_rot8(v[12]);
  v[13] = wu_rot8(v[13]);
  v[14] = wu_rot8(v[14]);
  v[10] = wu_add(v[10], v[15]);
  v[11] = wu_add(v[11], v[12]);
  v[ 8] = wu_add(v[8], v[13]);
  v[ 9] = wu_add(v[9], v[14]);
  v[ 5] = wu_xor(v[5], v[10]);
  v[ 6] = wu_xor(v[6], v[11]);
  v[ 7] = wu_xor(v[7], v[8]);
  v[ 4] = wu_xor(v[4], v[9]);
  v[ 5] = wu_rot7(v[5]);
  v[ 6] = wu_rot7(v[6]);
  v[ 7] = wu_rot7(v[7]);
  v[ 4] = wu_rot7(v[4]);
}

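/* fd_blake3_avx_compress8 runs the BLAKE3 compression function over
   up to 8 independent messages in parallel, one message per 32-bit
   AVX lane.  batch_data, batch_sz, ctr_vec and batch_flags give the
   per-message input pointer, byte size, chunk counter and flag byte;
   the resulting chaining values (out_sz is 32 or 64 bytes each) are
   stored to _batch_hash.  A non-NULL batch_cv overrides the IV as
   the input chaining value.  A non-NULL lthash switches to LtHash
   mode: instead of storing chaining values, each lane's root block
   is expanded into 2 KiB of XOF output and the 8 lanes are summed
   16-bit-lane-wise into lthash. */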
void
fd_blake3_avx_compress8( ulong batch_cnt,
                         void const * restrict _batch_data,
                         uint const * restrict batch_sz,
                         ulong const * restrict ctr_vec,
                         uint const * restrict batch_flags,
                         void * const * restrict _batch_hash,
                         ushort * restrict lthash,
                         uint out_sz,
                         void const * restrict batch_cv ) {
  if( FD_UNLIKELY( lthash && batch_cnt!=8 ) ) FD_LOG_ERR(( "Lane masking not supported for fd_blake3_avx_compress8 in LtHash mode" ));
  if( FD_UNLIKELY( batch_cnt==0 || batch_cnt>8 ) ) FD_LOG_ERR(( "Invalid batch_cnt %lu", batch_cnt ));

  ulong const * batch_data = (ulong const *)_batch_data;

  if( FD_UNLIKELY( batch_cnt==1 ) ) {
    fd_blake3_sse_compress1( (uchar *)(_batch_hash[0]),
                             (uchar const *)(batch_data[0]),
                             batch_sz[0],
                             ctr_vec[0],
                             batch_flags[0],
                             NULL,
                             NULL );
    return;
  }

#if FD_BLAKE3_TRACING
  /* This log_line buffer is oversized by a fair bit (due to all the
     NUL terminators) but that's fine */
  char log_line[
      sizeof( "fd_blake3_avx_compress8" )+
      sizeof( "(batch_cnt=" )+21+
      sizeof( ",sz=[" )+(8*11)+sizeof( "]" )+
      sizeof( ",counter=[" )+(8*21)+sizeof( "]" )+
      sizeof( ",flags=[" )+(8* 2)+sizeof( "]" )+
      sizeof( ",custom_cv" )+
      sizeof( ",lthash" )+
      sizeof( ")" ) ];

  char * p = fd_cstr_init( log_line );
  p = fd_cstr_append_text( p, "fd_blake3_avx_compress8(batch_cnt=", 34UL );
  p = fd_cstr_append_ulong_as_text( p, 0, 0, batch_cnt, fd_uchar_base10_dig_cnt( (uchar)batch_cnt ) );
  p = fd_cstr_append_text( p, ",sz=[", 5UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    p = fd_cstr_append_uint_as_text( p, ' ', 0, batch_sz[ i ], fd_uint_base10_dig_cnt( batch_sz[ i ] ) );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_text( p, "],counter=[", 11UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    p = fd_cstr_append_ulong_as_text( p, ' ', 0, ctr_vec[ i ], fd_ulong_base10_dig_cnt( ctr_vec[ i ] ) );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_text( p, "],flags=[", 9UL );
  for( ulong i=0UL; i<batch_cnt; i++ ) {
    static char const hex_lut[ 16 ] = {
      '0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'
    };
    p = fd_cstr_append_char( p, hex_lut[ batch_flags[ i ]&0xf ] );
    if( i+1<batch_cnt ) p = fd_cstr_append_char( p, ',' );
  }
  p = fd_cstr_append_char( p, ']' );
  if( batch_cv ) p = fd_cstr_append_text( p, ",custom_cv", 10UL );
  if( lthash   ) p = fd_cstr_append_text( p, ",lthash",    7UL );
  p = fd_cstr_append_char( p, ')' );
  ulong line_len = (ulong)( p-log_line );
  fd_cstr_fini( p );

  FD_BLAKE3_TRACE(( "%.*s", (int)line_len, log_line ));
#endif

  /* We can only process input blocks of 64 bytes, but message data
     size is not necessarily a multiple of 64.  We compute the tail
     block of each message here.  We then process complete blocks of
     the original message in place, switching to these tail blocks in
     the same pass toward the end.  (E.g. a 150 byte message has two
     full blocks processed in place and a 22 byte tail copied into
     scratch.) */

  ulong batch_tail_data[ 8 ] __attribute__((aligned(32)));
  ulong batch_tail_rem [ 8 ] __attribute__((aligned(32)));

  uchar scratch[ 8*FD_BLAKE3_BLOCK_SZ ] __attribute__((aligned(128)));
  do {
    ulong scratch_free = (ulong)scratch;

    wv_t zero = wv_zero();

    for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {

      /* Allocate the tail blocks for this message */

      ulong data = batch_data[ batch_idx ];
      ulong sz   = batch_sz  [ batch_idx ];

      ulong tail_data     = scratch_free;
      ulong tail_data_sz  = sz & (FD_BLAKE3_BLOCK_SZ-1UL);
      ulong tail_data_off = fd_ulong_align_dn( sz, FD_BLAKE3_BLOCK_SZ );

      batch_tail_data[ batch_idx ] = tail_data;
      batch_tail_rem [ batch_idx ] = (ulong)( (!!tail_data_sz) ^ (!sz) ); /* (hash 1 tail block if 0 sz) */
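      /* The expression above covers three cases: sz==0 gives 1 (an
         empty message still hashes one all-zero block), a non-zero
         multiple of 64 gives 0 (no partial tail), and everything
         else gives 1 (one partial tail block). */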

      scratch_free += FD_BLAKE3_BLOCK_SZ;

      /* Populate the tail blocks.  We first clear the blocks.  Then
         we copy any straggler data bytes into the tail. */

      wv_st( (ulong *) tail_data,     zero );
      wv_st( (ulong *)(tail_data+32), zero );

#     if 1
      /* See fd_sha256_private_batch_avx */
      ulong src = (ulong)data + tail_data_off;
      ulong dst = tail_data;
      ulong rem = tail_data_sz;
      while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
      while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src ); dst += 8UL; src += 8UL; rem -= 8UL; }
      if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src ); dst += 4UL; src += 4UL; rem -= 4UL; }
      if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src ); dst += 2UL; src += 2UL; rem -= 2UL; }
      if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src ); dst++; }
#     else
      fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
#     endif
    }
  } while(0);


  wu_t const iv0 = wu_bcast( FD_BLAKE3_IV[0] );
  wu_t const iv1 = wu_bcast( FD_BLAKE3_IV[1] );
  wu_t const iv2 = wu_bcast( FD_BLAKE3_IV[2] );
  wu_t const iv3 = wu_bcast( FD_BLAKE3_IV[3] );
  wu_t const iv4 = wu_bcast( FD_BLAKE3_IV[4] );
  wu_t const iv5 = wu_bcast( FD_BLAKE3_IV[5] );
  wu_t const iv6 = wu_bcast( FD_BLAKE3_IV[6] );
  wu_t const iv7 = wu_bcast( FD_BLAKE3_IV[7] );

  wu_t h0=iv0; wu_t h1=iv1; wu_t h2=iv2; wu_t h3=iv3;
  wu_t h4=iv4; wu_t h5=iv5; wu_t h6=iv6; wu_t h7=iv7;
  if( FD_UNLIKELY( batch_cv ) ) {
    /* If the input chaining value is overridden, transpose the input
       to AVX representation (8x8 transpose). */
    __m256i const ** cv_vec = (__m256i const **)batch_cv;
    wu_t cv[8];
    for( ulong i=0UL; i<8UL; i++ ) cv[i] = _mm256_loadu_si256( cv_vec[ i ] );
    wu_transpose_8x8( cv[0], cv[1], cv[2], cv[3], cv[4], cv[5], cv[6], cv[7],
                      h0, h1, h2, h3, h4, h5, h6, h7 );
  }

  wu_t ctr_lo = wu( ctr_vec[0], ctr_vec[1], ctr_vec[2], ctr_vec[3],
                    ctr_vec[4], ctr_vec[5], ctr_vec[6], ctr_vec[7] );
  wu_t ctr_hi = wu( ctr_vec[0]>>32, ctr_vec[1]>>32, ctr_vec[2]>>32, ctr_vec[3]>>32,
                    ctr_vec[4]>>32, ctr_vec[5]>>32, ctr_vec[6]>>32, ctr_vec[7]>>32 );
  wu_t flags = wu_ldu( batch_flags );
  wu_t off   = wu_zero();
  wu_t sz    = wu_ldu( batch_sz );

  wv_t wv_64      = wv_bcast( FD_BLAKE3_BLOCK_SZ );
  wv_t W_sentinel = wv_bcast( (ulong)scratch );
  wc_t batch_lane = wc_unpack( (1<<batch_cnt)-1 );
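  /* batch_lane is a vector conditional with lanes [0,batch_cnt)
     true, e.g. batch_cnt==3 unpacks the bitmask 0b00000111.  It
     (directly or via the block counts derived below) masks all
     per-lane state updates so lanes past batch_cnt stay inert. */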

  wv_t tail_lo = wv_ld( batch_tail_data   );
  wv_t tail_hi = wv_ld( batch_tail_data+4 );

  wv_t tail_rem_lo = wv_ld( batch_tail_rem   );
  wv_t tail_rem_hi = wv_ld( batch_tail_rem+4 );

  wv_t W_lo = wv_ld( batch_data   );
  wv_t W_hi = wv_ld( batch_data+4 );

  wv_t batch_sz_lo = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( sz, 0 ) );
  wv_t batch_sz_hi = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( sz, 1 ) );
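  /* Pointers and block counts are 64-bit quantities, so per-lane
     bookkeeping involving them is split across two 4-lane long
     vectors ("lo" covers lanes 0-3, "hi" lanes 4-7); the 32-bit
     sizes are widened to match. */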

  wv_t block_rem_lo = wv_notczero( wc_expand( batch_lane, 0 ),
                                   wv_add( wv_shr( batch_sz_lo, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_lo ) );
  wv_t block_rem_hi = wv_notczero( wc_expand( batch_lane, 1 ),
                                   wv_add( wv_shr( batch_sz_hi, FD_BLAKE3_BLOCK_LG_SZ ), tail_rem_hi ) );

  /* Upper half of the compression function output.
     Usually thrown away, but kept in the final compression round if
     out_sz==64. */
  wu_t hu[8] = {0};

  ulong lthash_rem = lthash ? 32 : 0;  /* Number of LtHash (XOF) blocks remaining (32 blocks x 64 B = 2 KiB of XOF output) */
  int compress_done = 0;
  for(;;) {
    /* Switch lanes that have hit the end of their in-place bulk
       processing to their out-of-place scratch tail regions as
       necessary. */

    W_lo = wv_if( wv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
    W_hi = wv_if( wv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );

    /* Derive per-block flags and block sizes */

    wc_t block_first = wu_eq( off, wu_zero() );
    wc_t block_last  = wi_lt( sz, wu_add( off, wu_bcast( FD_BLAKE3_BLOCK_SZ+1 ) ) );
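    /* block_last(lane) is true iff sz <= off+64, i.e. this 64 byte
       block is the final block of the message (the signed compare is
       safe since sizes are far below 2^31). */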

    /* Suppress root flag unless last block */

    wu_t root_mask   = wu_or( block_last, wu_bcast( ~FD_BLAKE3_FLAG_ROOT ) );
    wu_t block_flags = wu_and( flags, root_mask );

    /* LtHash mode ends compression one block early (the root block
       is handled by the XOF expansion below) */

    wc_t active_lane_lo;
    wc_t active_lane_hi;
    if( FD_UNLIKELY( lthash ) ) {
      /* Compress until root block */
      wu_t all_root = wu_bcast( FD_BLAKE3_FLAG_ROOT );
      wu_t not_root = wu_ne( wu_and( block_flags, all_root ), all_root );
      active_lane_lo = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( not_root, 0 ) );
      active_lane_hi = _mm256_cvtepi32_epi64( _mm256_extractf128_si256( not_root, 1 ) );
    } else {
      /* Complete when there is no more input data */
      active_lane_lo = wv_to_wc( block_rem_lo );
      active_lane_hi = wv_to_wc( block_rem_hi );
    }

    /* Suppress CHUNK_{START,END} flags unless leaf node */

    wc_t is_parent   = wu_shl( flags, 5 ); /* shift FLAG_PARENT into AVX condition bit */
    wu_t chunk_flags = wu_if( block_last, wu_bcast( FD_BLAKE3_FLAG_CHUNK_END ), wu_zero() );
    if( out_sz==32 ) {
      /* Hacky: out_sz==64 is only used for post-compress XOF
         hashing, so use that as a hint for when to suppress the
         CHUNK_START flag. */
      chunk_flags = wu_or( chunk_flags, wu_if( block_first, wu_bcast( FD_BLAKE3_FLAG_CHUNK_START ), wu_zero() ) );
    }
    wu_t block_sz = wu_min( wu_sub( sz, off ), wu_bcast( FD_BLAKE3_BLOCK_SZ ) );
    block_flags = wu_or( block_flags, wu_if( is_parent, wu_zero(), chunk_flags ) );

    /* Check if we are done compressing */

    compress_done |= !wc_any( wc_or( active_lane_lo, active_lane_hi ) );
    if( FD_UNLIKELY( compress_done ) ) {
      if( FD_UNLIKELY( !lthash_rem ) ) break;
      active_lane_lo = wc_bcast( INT_MAX );
      active_lane_hi = wc_bcast( INT_MAX );
      /* Load the next message block and fall through to XOF expansion */
    }

    /* At this point, we have at least 1 block in this message segment
       pass that has not been processed.  Load the next 64 bytes of
       each unprocessed block.  Inactive lanes (e.g. message segments
       in this pass for which we've already processed all the blocks)
       will load garbage from a sentinel location (and the result of
       the state computations for the inactive lane will be ignored). */

    wv_t W03 = wv_if( active_lane_lo, W_lo, W_sentinel );
    uchar const * W0 = (uchar const *)wv_extract( W03, 0 );
    uchar const * W1 = (uchar const *)wv_extract( W03, 1 );
    uchar const * W2 = (uchar const *)wv_extract( W03, 2 );
    uchar const * W3 = (uchar const *)wv_extract( W03, 3 );

    wv_t W47 = wv_if( active_lane_hi, W_hi, W_sentinel );
    uchar const * W4 = (uchar const *)wv_extract( W47, 0 );
    uchar const * W5 = (uchar const *)wv_extract( W47, 1 );
    uchar const * W6 = (uchar const *)wv_extract( W47, 2 );
    uchar const * W7 = (uchar const *)wv_extract( W47, 3 );

    wu_t m[16] = { wu_ldu( W0    ), wu_ldu( W1    ), wu_ldu( W2    ), wu_ldu( W3    ),
                   wu_ldu( W4    ), wu_ldu( W5    ), wu_ldu( W6    ), wu_ldu( W7    ),
                   wu_ldu( W0+32 ), wu_ldu( W1+32 ), wu_ldu( W2+32 ), wu_ldu( W3+32 ),
                   wu_ldu( W4+32 ), wu_ldu( W5+32 ), wu_ldu( W6+32 ), wu_ldu( W7+32 ) };

    wu_transpose_8x8( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                      m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7] );
    wu_transpose_8x8( m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
                      m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );

    /* Compute the BLAKE3 compression function updates */

    compress: (void)0;
    wu_t v[16] = {
      h0,     h1,     h2,       h3,
      h4,     h5,     h6,       h7,
      iv0,    iv1,    iv2,      iv3,
      ctr_lo, ctr_hi, block_sz, block_flags,
    };

    /* Debug utility */
#define STATE_FMT "state[%u] =\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x\n  %08x %08x %08x %08x"
#define STATE_FMT_ARGS(v,i) (uint)i,\
    fd_uint_bswap(wu_extract(v[0x0],i)),fd_uint_bswap(wu_extract(v[0x1],i)),fd_uint_bswap(wu_extract(v[0x2],i)),fd_uint_bswap(wu_extract(v[0x3],i)),\
    fd_uint_bswap(wu_extract(v[0x4],i)),fd_uint_bswap(wu_extract(v[0x5],i)),fd_uint_bswap(wu_extract(v[0x6],i)),fd_uint_bswap(wu_extract(v[0x7],i)),\
    fd_uint_bswap(wu_extract(v[0x8],i)),fd_uint_bswap(wu_extract(v[0x9],i)),fd_uint_bswap(wu_extract(v[0xa],i)),fd_uint_bswap(wu_extract(v[0xb],i)),\
    fd_uint_bswap(wu_extract(v[0xc],i)),fd_uint_bswap(wu_extract(v[0xd],i)),fd_uint_bswap(wu_extract(v[0xe],i)),fd_uint_bswap(wu_extract(v[0xf],i))

    // FD_LOG_NOTICE(( STATE_FMT, STATE_FMT_ARGS(v,0) ));
    round_fn8( v, m, 0 );
    round_fn8( v, m, 1 );
    round_fn8( v, m, 2 );
    round_fn8( v, m, 3 );
    round_fn8( v, m, 4 );
    round_fn8( v, m, 5 );
    round_fn8( v, m, 6 );
    // FD_LOG_NOTICE(( STATE_FMT, STATE_FMT_ARGS(v,0) ));

    wu_t d[8] = {
      wu_xor( v[ 0], v[ 8] ), wu_xor( v[ 1], v[ 9] ),
      wu_xor( v[ 2], v[10] ), wu_xor( v[ 3], v[11] ),
      wu_xor( v[ 4], v[12] ), wu_xor( v[ 5], v[13] ),
      wu_xor( v[ 6], v[14] ), wu_xor( v[ 7], v[15] )
    };

    if( FD_LIKELY( !compress_done ) ) {

      /* Apply the state updates to the active lanes */

      wc_t active_lane = wc_narrow( active_lane_lo, active_lane_hi );
      if( FD_UNLIKELY( out_sz==64 ) ) {
        /* FIXME only export in the last iteration */
        hu[0] = wu_if( active_lane, wu_xor( h0, v[ 8] ), hu[0] );
        hu[1] = wu_if( active_lane, wu_xor( h1, v[ 9] ), hu[1] );
        hu[2] = wu_if( active_lane, wu_xor( h2, v[10] ), hu[2] );
        hu[3] = wu_if( active_lane, wu_xor( h3, v[11] ), hu[3] );
        hu[4] = wu_if( active_lane, wu_xor( h4, v[12] ), hu[4] );
        hu[5] = wu_if( active_lane, wu_xor( h5, v[13] ), hu[5] );
        hu[6] = wu_if( active_lane, wu_xor( h6, v[14] ), hu[6] );
        hu[7] = wu_if( active_lane, wu_xor( h7, v[15] ), hu[7] );
      }
      h0 = wu_if( active_lane, d[0], h0 );
      h1 = wu_if( active_lane, d[1], h1 );
      h2 = wu_if( active_lane, d[2], h2 );
      h3 = wu_if( active_lane, d[3], h3 );
      h4 = wu_if( active_lane, d[4], h4 );
      h5 = wu_if( active_lane, d[5], h5 );
      h6 = wu_if( active_lane, d[6], h6 );
      h7 = wu_if( active_lane, d[7], h7 );

      /* Advance to the next message segment blocks.  In pseudo code,
         the below is:

           W += 64; if( block_rem ) block_rem--;

         Since wc_to_wv_raw(false/true) is 0UL/~0UL, we can use wv_add /
         wc_to_wv_raw instead of wv_sub / wc_to_wv to save some ops.
         (Consider conditional increment / decrement operations?)

         Also since we do not load anything at W(lane) above unless
         block_rem(lane) is non-zero, we can omit vector conditional
         operations for W(lane) below to save some additional ops. */

      W_lo = wv_add( W_lo, wv_if( active_lane_lo, wv_64, wv_zero() ) );
      W_hi = wv_add( W_hi, wv_if( active_lane_hi, wv_64, wv_zero() ) );
      off  = wu_add( off, wu_if( active_lane, wu_bcast( FD_BLAKE3_BLOCK_SZ ), wv_zero() ) );

      block_rem_lo = wv_add( block_rem_lo, wv_if( active_lane_lo, wc_to_wv_raw( active_lane_lo ), wv_zero() ) );
      block_rem_hi = wv_add( block_rem_hi, wv_if( active_lane_hi, wc_to_wv_raw( active_lane_hi ), wv_zero() ) );

    } else { /* LtHash mode */

      /* Lane j of d[i] (and dh[i]) holds the 32-bit word at byte
         offset i*4 (resp. 32+i*4) of the current XOF block, across
         outputs 0..7 */
      wu_t dh[ 8 ] = {
        wu_xor( h0, v[0x8] ),
        wu_xor( h1, v[0x9] ),
        wu_xor( h2, v[0xa] ),
        wu_xor( h3, v[0xb] ),
        wu_xor( h4, v[0xc] ),
        wu_xor( h5, v[0xd] ),
        wu_xor( h6, v[0xe] ),
        wu_xor( h7, v[0xf] )
      };

      /* Transpose outer 8x8 blocks */
      wu_transpose_8x8( d [0],d [1],d [2],d [3],d [4],d [5],d [6],d [7],
                        d [0],d [1],d [2],d [3],d [4],d [5],d [6],d [7] );
      wu_transpose_8x8( dh[0],dh[1],dh[2],dh[3],dh[4],dh[5],dh[6],dh[7],
                        dh[0],dh[1],dh[2],dh[3],dh[4],dh[5],dh[6],dh[7] );

      /* Now d[i] and dh[i] hold the low and high 32 bytes of the
         current 64-byte XOF block of output[i] */

      /* Reduce-add into d[0] */
      d [0] = wh_add( d [0], d [1] ); /* sum(l[0 1]) */
      dh[0] = wh_add( dh[0], dh[1] ); /* sum(h[0 1]) */
      d [2] = wh_add( d [2], d [3] ); /* sum(l[2 3]) */
      dh[2] = wh_add( dh[2], dh[3] ); /* sum(h[2 3]) */
      d [4] = wh_add( d [4], d [5] ); /* sum(l[4 5]) */
      dh[4] = wh_add( dh[4], dh[5] ); /* sum(h[4 5]) */
      d [6] = wh_add( d [6], d [7] ); /* sum(l[6 7]) */
      dh[6] = wh_add( dh[6], dh[7] ); /* sum(h[6 7]) */
      d [0] = wh_add( d [0], d [2] ); /* sum(l[0 1 2 3]) */
      dh[0] = wh_add( dh[0], dh[2] ); /* sum(h[0 1 2 3]) */
      d [4] = wh_add( d [4], d [6] ); /* sum(l[4 5 6 7]) */
      dh[4] = wh_add( dh[4], dh[6] ); /* sum(h[4 5 6 7]) */
      d [0] = wh_add( d [0], d [4] ); /* sum(l[0 1 2 3 4 5 6 7]) */
      dh[0] = wh_add( dh[0], dh[4] ); /* sum(h[0 1 2 3 4 5 6 7]) */
      wh_st( lthash,    d [0] );
      wh_st( lthash+16, dh[0] );

      /* Wind up for next iteration */
      lthash += 32;
      lthash_rem--;
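      /* The 64-bit counter is incremented in 32-bit halves: a lane
         carried out of ctr_lo iff the new low word is (unsigned)
         below the addend.  wi_gt is a signed compare, so both sides
         are biased by 0x80000000 to get the unsigned ordering; the
         resulting 0/-1 mask is subtracted from ctr_hi, bumping it by
         one exactly in the carry lanes. */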
      wu_t ctr_add   = wu_bcast( 1 );
      /**/ ctr_lo    = wu_add( ctr_lo, ctr_add );
      wu_t ctr_carry = wi_gt ( wu_xor( ctr_add, wu_bcast( 0x80000000 ) ),
                               wu_xor( ctr_lo,  wu_bcast( 0x80000000 ) ) );
      /**/ ctr_hi    = wu_sub( ctr_hi, ctr_carry );
      if( FD_UNLIKELY( !lthash_rem ) ) {
        FD_BLAKE3_TRACE(( "fd_blake3_avx_compress8: done (lthash para)" ));
        return;
      }
      goto compress;

#     undef STATE_FMT
#     undef STATE_FMT_ARGS
    }
  }

  /* Store the results */

  wu_transpose_8x8( h0, h1, h2, h3, h4, h5, h6, h7,
                    h0, h1, h2, h3, h4, h5, h6, h7 );

  uint * const * batch_hash = (uint * const *)__builtin_assume_aligned( _batch_hash, 32 );
  if( FD_LIKELY( out_sz==32 ) ) {
    switch( batch_cnt ) { /* application dependent prob */
    case 8UL: wu_st( batch_hash[7], h7 ); __attribute__((fallthrough));
    case 7UL: wu_st( batch_hash[6], h6 ); __attribute__((fallthrough));
    case 6UL: wu_st( batch_hash[5], h5 ); __attribute__((fallthrough));
    case 5UL: wu_st( batch_hash[4], h4 ); __attribute__((fallthrough));
    case 4UL: wu_st( batch_hash[3], h3 ); __attribute__((fallthrough));
    case 3UL: wu_st( batch_hash[2], h2 ); __attribute__((fallthrough));
    case 2UL: wu_st( batch_hash[1], h1 ); __attribute__((fallthrough));
    case 1UL: wu_st( batch_hash[0], h0 ); __attribute__((fallthrough));
    default: break;
    }
  } else if( out_sz==64 ) {
    wu_transpose_8x8( hu[0], hu[1], hu[2], hu[3], hu[4], hu[5], hu[6], hu[7],
                      hu[0], hu[1], hu[2], hu[3], hu[4], hu[5], hu[6], hu[7] );
    switch( batch_cnt ) { /* application dependent prob */
    case 8UL: wu_st( batch_hash[7],   h7    );
              wu_st( batch_hash[7]+8, hu[7] ); __attribute__((fallthrough));
    case 7UL: wu_st( batch_hash[6],   h6    );
              wu_st( batch_hash[6]+8, hu[6] ); __attribute__((fallthrough));
    case 6UL: wu_st( batch_hash[5],   h5    );
              wu_st( batch_hash[5]+8, hu[5] ); __attribute__((fallthrough));
    case 5UL: wu_st( batch_hash[4],   h4    );
              wu_st( batch_hash[4]+8, hu[4] ); __attribute__((fallthrough));
    case 4UL: wu_st( batch_hash[3],   h3    );
              wu_st( batch_hash[3]+8, hu[3] ); __attribute__((fallthrough));
    case 3UL: wu_st( batch_hash[2],   h2    );
              wu_st( batch_hash[2]+8, hu[2] ); __attribute__((fallthrough));
    case 2UL: wu_st( batch_hash[1],   h1    );
              wu_st( batch_hash[1]+8, hu[1] ); __attribute__((fallthrough));
    case 1UL: wu_st( batch_hash[0],   h0    );
              wu_st( batch_hash[0]+8, hu[0] ); __attribute__((fallthrough));
    default: break;
    }
  } else {
    FD_LOG_ERR(( "Invalid out_sz %u", out_sz ));
  }
}

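/* fd_blake3_avx_compress8_fast is the fixed-shape fast path: all 8
   lanes are always active and hash equal-sized, contiguous inputs
   (8 full leaf chunks, or 8 pairs of 32-byte child chaining values
   when FD_BLAKE3_FLAG_PARENT is set), with no tail handling and no
   custom input chaining value.  The 8 output chaining values are
   written contiguously to out. */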
void
fd_blake3_avx_compress8_fast( uchar const * restrict msg,
                              uchar * restrict _out,
                              ulong counter,
                              uchar flags ) {
  FD_BLAKE3_TRACE(( "fd_blake3_avx_compress8_fast(msg=%p,out=%p,counter=%lu,flags=%02x)", (void *)msg, (void *)_out, counter, flags ));

  uchar * restrict out = __builtin_assume_aligned( _out, 32 );

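  /* Each lane consumes 2^lg_sz contiguous input bytes: one pair of
     32-byte child chaining values (64 bytes) for a parent node, else
     one full chunk for a leaf. */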
  int parent = flags & FD_BLAKE3_FLAG_PARENT;
  int lg_sz  = fd_int_if( parent, FD_BLAKE3_OUTCHAIN_LG_SZ+1, FD_BLAKE3_CHUNK_LG_SZ );
  ulong sz = 1UL<<lg_sz;

  /* Counters stay the same for each block within an input.  Across
     chunks, they increment if we are hashing leaves; otherwise they
     are zero. */

  wu_t ctr_add   = wu_and( wu_bcast( parent ? 0 : UINT_MAX ),
                           wu( 0, 1, 2, 3, 4, 5, 6, 7 ) );
  wu_t ctr_lo    = wu_add( wu_bcast( counter ), ctr_add );
  wu_t ctr_carry = wi_gt ( wu_xor( ctr_add, wu_bcast( 0x80000000 ) ),
                           wu_xor( ctr_lo,  wu_bcast( 0x80000000 ) ) );
  wu_t ctr_hi    = wu_sub( wu_bcast( counter>>32 ), ctr_carry );
  wu_t sz_vec    = wu_bcast( FD_BLAKE3_BLOCK_SZ );

  wu_t const iv0 = wu_bcast( FD_BLAKE3_IV[0] );
  wu_t const iv1 = wu_bcast( FD_BLAKE3_IV[1] );
  wu_t const iv2 = wu_bcast( FD_BLAKE3_IV[2] );
  wu_t const iv3 = wu_bcast( FD_BLAKE3_IV[3] );
  wu_t const iv4 = wu_bcast( FD_BLAKE3_IV[4] );
  wu_t const iv5 = wu_bcast( FD_BLAKE3_IV[5] );
  wu_t const iv6 = wu_bcast( FD_BLAKE3_IV[6] );
  wu_t const iv7 = wu_bcast( FD_BLAKE3_IV[7] );

  wu_t h0=iv0; wu_t h1=iv1; wu_t h2=iv2; wu_t h3=iv3;
  wu_t h4=iv4; wu_t h5=iv5; wu_t h6=iv6; wu_t h7=iv7;

  ulong off = 0UL;
  do {
    ulong const off_next = off+FD_BLAKE3_BLOCK_SZ;
    uint chunk_flags =
      ( off     ==0UL ? FD_BLAKE3_FLAG_CHUNK_START : 0u ) |
      ( off_next==sz  ? FD_BLAKE3_FLAG_CHUNK_END   : 0u );
    uint flags_ = flags | fd_uint_if( parent, 0, chunk_flags );
    wu_t flags_vec = wu_bcast( flags_ );

    wu_t m[16];
    m[ 0] = wu_ldu( msg + (0<<lg_sz) + off );
    m[ 1] = wu_ldu( msg + (1<<lg_sz) + off );
    m[ 2] = wu_ldu( msg + (2<<lg_sz) + off );
    m[ 3] = wu_ldu( msg + (3<<lg_sz) + off );
    m[ 4] = wu_ldu( msg + (4<<lg_sz) + off );
    m[ 5] = wu_ldu( msg + (5<<lg_sz) + off );
    m[ 6] = wu_ldu( msg + (6<<lg_sz) + off );
    m[ 7] = wu_ldu( msg + (7<<lg_sz) + off );
    m[ 8] = wu_ldu( msg + (0<<lg_sz) + off + 32 );
    m[ 9] = wu_ldu( msg + (1<<lg_sz) + off + 32 );
    m[10] = wu_ldu( msg + (2<<lg_sz) + off + 32 );
    m[11] = wu_ldu( msg + (3<<lg_sz) + off + 32 );
    m[12] = wu_ldu( msg + (4<<lg_sz) + off + 32 );
    m[13] = wu_ldu( msg + (5<<lg_sz) + off + 32 );
    m[14] = wu_ldu( msg + (6<<lg_sz) + off + 32 );
    m[15] = wu_ldu( msg + (7<<lg_sz) + off + 32 );

    wu_transpose_8x8( m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7],
                      m[0x0], m[0x1], m[0x2], m[0x3], m[0x4], m[0x5], m[0x6], m[0x7] );
    wu_transpose_8x8( m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf],
                      m[0x8], m[0x9], m[0xa], m[0xb], m[0xc], m[0xd], m[0xe], m[0xf] );

    wu_t v[16] = {
      h0,     h1,     h2,     h3,
      h4,     h5,     h6,     h7,
      iv0,    iv1,    iv2,    iv3,
      ctr_lo, ctr_hi, sz_vec, flags_vec,
    };

    round_fn8( v, m, 0 );
    round_fn8( v, m, 1 );
    round_fn8( v, m, 2 );
    round_fn8( v, m, 3 );
    round_fn8( v, m, 4 );
    round_fn8( v, m, 5 );
    round_fn8( v, m, 6 );

    h0 = wu_xor( v[ 0], v[ 8] );
    h1 = wu_xor( v[ 1], v[ 9] );
    h2 = wu_xor( v[ 2], v[10] );
    h3 = wu_xor( v[ 3], v[11] );
    h4 = wu_xor( v[ 4], v[12] );
    h5 = wu_xor( v[ 5], v[13] );
    h6 = wu_xor( v[ 6], v[14] );
    h7 = wu_xor( v[ 7], v[15] );

    off = off_next;
  } while( off!=sz );

  wu_transpose_8x8( h0, h1, h2, h3, h4, h5, h6, h7,
                    h0, h1, h2, h3, h4, h5, h6, h7 );

  wu_st( (uint *)( out + (0UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h0 );
  wu_st( (uint *)( out + (1UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h1 );
  wu_st( (uint *)( out + (2UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h2 );
  wu_st( (uint *)( out + (3UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h3 );
  wu_st( (uint *)( out + (4UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h4 );
  wu_st( (uint *)( out + (5UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h5 );
  wu_st( (uint *)( out + (6UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h6 );
  wu_st( (uint *)( out + (7UL<<FD_BLAKE3_OUTCHAIN_LG_SZ) ), h7 );
}