Line data Source code
1 : #include "fd_blake3.h"
2 : #include "fd_blake3_private.h"
3 : #include <assert.h>
4 :
5 : /* Hash state machine *************************************************/
6 :
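/* Informal overview of the scheduler state used below (the
   authoritative layout is in fd_blake3_private.h): fd_blake3_pos_t
   keeps a (head, tail) cursor pair per hash tree layer indexing into
   fd_blake3_buf_t, which holds 32 byte output chaining values.
   leaf_idx counts input chunks consumed, live_cnt counts chaining
   values not yet merged into a parent, and next_tick sequences ops
   that depend on the results of earlier ops. */
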
7 : static FD_FN_UNUSED fd_blake3_pos_t *
8 : fd_blake3_pos_init( fd_blake3_pos_t * s,
9 : uchar const * data,
10 88117259 : ulong sz ) {
11 88117259 : *s = (fd_blake3_pos_t) {
12 88117259 : .input = data,
13 88117259 : .input_sz = sz,
14 88117259 : .magic = FD_BLAKE3_MAGIC,
15 88117259 : };
16 88117259 : return s;
17 88117259 : }
18 :
19 : /* fd_blake3_l0_complete returns 1 if all leaf nodes have been hashed, 0
20 : otherwise.  The max with 64 ensures a leaf is hashed even for empty input. */
21 :
22 : FD_FN_PURE static inline int
23 665827251 : fd_blake3_l0_complete( fd_blake3_pos_t const * s ) {
24 665827251 : return ( s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ ) >= fd_ulong_max( s->input_sz, 64 );
25 665827251 : }
26 :
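/* fd_blake3_is_finished returns 1 if all leaf chunks have been hashed,
   only the root chaining value remains live, and no op is still in
   flight at the given tick, 0 otherwise. */
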
27 : FD_FN_PURE static inline int
28 : fd_blake3_is_finished( fd_blake3_pos_t const * s,
29 354966304 : ulong tick ) {
30 354966304 : int l0_complete = fd_blake3_l0_complete( s );
31 354966304 : int ln_complete = s->live_cnt == 1UL;
32 354966304 : int idle = tick >= s->next_tick;
33 354966304 : return l0_complete & ln_complete & idle;
34 354966304 : }
35 :
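/* fd_blake3_prepare_leaf fills op with a compression of the next input
   chunk (up to FD_BLAKE3_CHUNK_SZ bytes) into the layer-0 chaining
   value buffer, then advances the leaf cursor and bookkeeping.  If the
   whole input fits in one chunk, that chunk is also the root. */
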
36 : static fd_blake3_op_t *
37 : fd_blake3_prepare_leaf( fd_blake3_pos_t * restrict s,
38 : fd_blake3_buf_t * restrict buf,
39 : fd_blake3_op_t * restrict op,
40 111629180 : ulong tick ) {
41 :
42 111629180 : ulong msg_off = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
43 111629180 : ulong msg_sz = fd_ulong_min( s->input_sz - msg_off, 1024UL );
44 111629180 : uchar const * msg = s->input + msg_off;
45 111629180 : uchar * out = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];
46 :
47 111629180 : int flags = fd_int_if( s->input_sz <= FD_BLAKE3_CHUNK_SZ, FD_BLAKE3_FLAG_ROOT, 0 );
48 :
49 111629180 : *op = (fd_blake3_op_t) {
50 111629180 : .msg = msg,
51 111629180 : .out = out,
52 111629180 : .counter = s->leaf_idx,
53 111629180 : .sz = (ushort)msg_sz,
54 111629180 : .flags = (uchar)flags
55 111629180 : };
56 :
57 111629180 : s->head.uc[ 0 ] = (uchar)( s->head.uc[ 0 ]+1 );
58 111629180 : s->leaf_idx++;
59 111629180 : s->live_cnt++;
60 111629180 : s->next_tick = tick+1;
61 :
62 111629180 : return op;
63 :
64 111629180 : }
65 :
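/* fd_blake3_seek_branch decides whether a parent (branch) compression
   can be issued and, if so, selects the target layer.  While leaf
   hashing is still in progress it just checks whether the layer below
   the current one holds at least two unconsumed chaining values.  Once
   all leaves are hashed it looks for the lowest layer with two or more
   pending values to merge; failing that, a lone pending value is moved
   up to pair with the next higher pending value so the tree can
   collapse.  Returns 1 if a merge can proceed (with s->layer set to the
   destination layer), 0 if we must wait (e.g. for an op still in flight
   this tick). */
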
66 : static int
67 : fd_blake3_seek_branch( fd_blake3_pos_t * restrict s,
68 : fd_blake3_buf_t * restrict buf,
69 85432477 : ulong tick ) {
70 :
71 85432477 : if( s->live_cnt == 1UL )
72 6725649 : return 0;
73 :
74 78706828 : if( !fd_blake3_l0_complete( s ) )
75 1255872 : return ( s->tail.uc[ s->layer - 1 ] + 1 ) <
76 1255872 : ( s->head.uc[ s->layer - 1 ] );
77 :
78 77450956 : # if FD_HAS_AVX
79 :
80 77450956 : wb_t diff = wb_sub( s->head.wb, s->tail.wb );
81 :
82 77450956 : uint mergeable_layers = (uint)_mm256_movemask_epi8( wb_gt( diff, wb_bcast( 1 ) ) );
83 77450956 : int merge_layer = fd_uint_find_lsb_w_default( mergeable_layers, -1 );
84 77450956 : if( merge_layer>=0 ) {
85 66010141 : if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
86 12212666 : return 0; /* still waiting for previous merge */
87 53797475 : s->layer = (uint)merge_layer+1U;
88 53797475 : return 1;
89 66010141 : }
90 :
91 11440815 : uint single_layers = (uint)_mm256_movemask_epi8( wb_eq( diff, wb_bcast( 1 ) ) );
92 11440815 : uint single_lo = (uint)fd_uint_find_lsb( single_layers );
93 11440815 : uint single_hi = (uint)fd_uint_find_lsb( single_layers & ( ~fd_uint_mask_lsb( (int)(single_lo+1U) ) ) );
94 :
95 11440815 : wb_t node = wb_ld( buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ] );
96 11440815 : wb_st( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ], node );
97 :
98 : # else /* FD_HAS_AVX */
99 :
100 : uchar diff[ 32 ];
101 : for( ulong j=0UL; j<32UL; j++ ) diff[j] = (uchar)( s->head.uc[j] - s->tail.uc[j] );
102 :
103 : int merge_layer = -1;
104 : for( uint j=0U; j<32U; j++ ) {
105 : if( diff[j]>1 ) {
106 : merge_layer = (int)j;
107 : break;
108 : }
109 : }
110 : if( merge_layer>=0 ) {
111 : if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
112 : return 0; /* still waiting for previous merge */
113 : s->layer = (uint)(merge_layer+1);
114 : return 1;
115 : }
116 :
117 : uint j=0U;
118 : uint single_lo = 0UL;
119 : uint single_hi = 0UL;
120 : for( ; j<32U; j++ ) {
121 : if( diff[j] ) {
122 : single_lo = j;
123 : break;
124 : }
125 : }
126 : j++;
127 : for( ; j<32U; j++ ) {
128 : if( diff[j] ) {
129 : single_hi = j;
130 : break;
131 : }
132 : }
133 :
134 : memcpy( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ],
135 : buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ],
136 : 32UL );
137 :
138 : # endif /* FD_HAS_AVX */
139 :
140 11440815 : FD_BLAKE3_TRACE(( "fd_blake3_seek_branch: moving up %u/%u to %u/%u",
141 11440815 : single_lo, s->tail.uc[ single_lo ],
142 11440815 : single_hi, s->head.uc[ single_hi ] ));
143 :
144 11440815 : if( ((uint)single_hi >= s->layer) & (tick < s->next_tick) )
145 4706778 : return 0; /* still waiting for previous merge */
146 :
147 6734037 : s->head.uc[ single_lo ] = (uchar)( s->head.uc[ single_lo ]-1 );
148 6734037 : s->head.uc[ single_hi ] = (uchar)( s->head.uc[ single_hi ]+1 );
149 :
150 6734037 : s->layer = (uint)single_hi+1U;
151 6734037 : return 1;
152 11440815 : }
153 :
154 : static fd_blake3_op_t *
155 : fd_blake3_prepare_branch( fd_blake3_pos_t * restrict s,
156 : fd_blake3_buf_t * restrict buf,
157 : fd_blake3_op_t * restrict op,
158 85432477 : ulong tick ) {
159 :
160 85432477 : if( !fd_blake3_seek_branch( s, buf, tick ) )
161 23645093 : return NULL;
162 :
163 85432477 : assert( s->layer < FD_BLAKE3_ROW_CNT );
164 :
165 61787384 : uchar const * msg = buf->slots[ s->layer-1U ][ s->tail.uc[ s->layer-1U ] ];
166 61787384 : uchar * out = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];
167 :
168 61787384 : s->head.uc[ s->layer ] = (uchar)( s->head.uc[ s->layer ]+1 );
169 61787384 : s->tail.uc[ s->layer-1 ] = (uchar)( s->tail.uc[ s->layer-1 ]+2 );
170 61787384 : s->live_cnt--;
171 61787384 : s->next_tick = tick+1;
172 :
173 61787384 : uint flags = FD_BLAKE3_FLAG_PARENT |
174 61787384 : fd_uint_if( s->live_cnt==1UL, FD_BLAKE3_FLAG_ROOT, 0u );
175 :
176 61787384 : *op = (fd_blake3_op_t) {
177 61787384 : .msg = msg,
178 61787384 : .out = out,
179 61787384 : .counter = 0UL,
180 61787384 : .sz = 64U,
181 61787384 : .flags = (uchar)flags
182 61787384 : };
183 61787384 : return op;
184 :
185 61787384 : }
186 :
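/* fd_blake3_advance resets the cursor pair of any layer whose pending
   chaining values have all been consumed (tail==head), then picks the
   next layer to work on: move up if the current layer's row is full;
   otherwise stay if the layer below still has unconsumed chaining
   values; otherwise move up once all leaves are hashed; otherwise fall
   back to the leaf layer. */
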
187 : static void
188 85741963 : fd_blake3_advance( fd_blake3_pos_t * restrict s ) {
189 :
190 85741963 : # if FD_HAS_AVX
191 :
192 85741963 : wb_t mask = wb_eq( s->tail.wb, s->head.wb );
193 85741963 : s->tail.wb = wb_andnot( mask, s->tail.wb );
194 85741963 : s->head.wb = wb_andnot( mask, s->head.wb );
195 :
196 : # else /* FD_HAS_AVX */
197 :
198 : for( ulong j=0UL; j<32UL; j++ ) {
199 : if( s->tail.uc[j] == s->head.uc[j] ) {
200 : s->tail.uc[j] = 0;
201 : s->head.uc[j] = 0;
202 : }
203 : }
204 :
205 : # endif /* FD_HAS_AVX */
206 :
207 85741963 : if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT ) {
208 275418 : s->layer++;
209 275418 : }
210 85466545 : else if( ( s->layer > 0UL ) &&
211 85466545 : ( s->tail.uc[ s->layer-1 ] < s->head.uc[ s->layer-1 ] ) ) {
212 : /* pass */
213 7407838 : }
214 78058707 : else if( fd_blake3_l0_complete( s ) ) {
215 19975764 : s->layer++;
216 19975764 : }
217 58082943 : else if( s->layer > 0UL ) {
218 90281 : s->layer = 0UL;
219 90281 : }
220 :
221 85741963 : }
222 :
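/* fd_blake3_prepare is the scheduler's top-level step: it returns the
   next compression op to execute at the given tick (a leaf chunk if we
   are on layer 0 and leaves remain, otherwise a branch merge), or NULL
   if the hash is finished or no op can be issued yet. */
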
223 : static fd_blake3_op_t *
224 : fd_blake3_prepare( fd_blake3_pos_t * restrict s,
225 : fd_blake3_buf_t * restrict buf,
226 : fd_blake3_op_t * restrict op,
227 226389913 : ulong tick ) {
228 :
229 226389913 : assert( s->layer < FD_BLAKE3_ROW_CNT );
230 :
231 226389913 : if( fd_blake3_is_finished( s, tick ) )
232 0 : return NULL;
233 :
234 226389913 : if( tick >= s->next_tick )
235 85741963 : fd_blake3_advance( s );
236 :
237 226389913 : if( s->layer != 0 )
238 85432477 : return fd_blake3_prepare_branch( s, buf, op, tick );
239 :
240 140957436 : if( ( s->head.uc[0] >= FD_BLAKE3_COL_CNT ) |
241 140957436 : ( fd_blake3_l0_complete( s ) ) ) {
242 57954621 : return NULL;
243 57954621 : }
244 :
245 83002815 : return fd_blake3_prepare_leaf( s, buf, op, tick );
246 :
247 140957436 : }
248 :
249 : #if FD_BLAKE3_PARA_MAX>1
250 :
251 : /* fd_blake3_prepare_fast does streamlined hashing of full chunks or full
252 : branches, preparing one wide op that bypasses the general scheduler. */
253 :
254 : static fd_blake3_op_t *
255 : fd_blake3_prepare_fast( fd_blake3_pos_t * restrict s,
256 : fd_blake3_buf_t * restrict buf,
257 : fd_blake3_op_t * restrict op,
258 : ulong n,
259 38072253 : ulong min ) {
260 :
261 38072253 : if( s->layer && s->head.uc[ s->layer-1 ]==FD_BLAKE3_COL_CNT ) {
262 2718646 : op->msg = buf->rows[ s->layer-1 ];
263 2718646 : op->out = buf->rows[ s->layer ] + (s->head.uc[ s->layer ]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
264 2718646 : op->counter = 0UL;
265 2718646 : op->flags = FD_BLAKE3_FLAG_PARENT;
266 :
267 : /* Assume that branch layer is fully hashed (up to col cnt) */
268 2718646 : s->head.uc[ s->layer-1 ] = 0;
269 2718646 : s->head.uc[ s->layer ] = (uchar)( (ulong)s->head.uc[ s->layer ]+n );
270 2718646 : s->live_cnt -= n;
271 2718646 : s->layer = fd_uint_if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT,
272 2718646 : s->layer+1U, 0U );
273 :
274 2718646 : return op;
275 2718646 : }
276 :
277 35353607 : ulong pos = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
278 35353607 : ulong avail = fd_ulong_align_dn( s->input_sz - pos, FD_BLAKE3_CHUNK_SZ ) >> FD_BLAKE3_CHUNK_LG_SZ;
279 35353607 : n = fd_ulong_min( n, avail );
280 :
281 : /* This constant controls the threshold for when to use the (slow)
282 : scheduler instead of fast single-message hashing.  It is carefully
283 : tuned for best overall performance. */
284 35353607 : if( n<min ) return NULL;
285 :
286 7380416 : op->msg = s->input + (s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ);
287 7380416 : op->out = buf->rows[0] + (s->head.uc[0]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
288 7380416 : op->counter = s->leaf_idx;
289 7380416 : op->flags = 0;
290 :
291 7380416 : s->head.uc[0] = (uchar)( (ulong)s->head.uc[0]+n );
292 7380416 : s->leaf_idx += n;
293 7380416 : s->live_cnt += n;
294 7380416 : s->layer = fd_uint_if( s->head.uc[0]==FD_BLAKE3_COL_CNT, 1U, 0U );
295 :
296 7380416 : return op;
297 35353607 : }
298 :
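/* fd_blake3_batch_hash scatters up to FD_BLAKE3_PARA_MAX prepared ops
   into parallel arrays and dispatches them to the widest available SIMD
   compression backend (AVX-512 or AVX). */
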
299 : static void
300 : fd_blake3_batch_hash( fd_blake3_op_t const * ops,
301 82223473 : ulong op_cnt ) {
302 82223473 : uchar const * batch_data [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
303 82223473 : uint batch_data_sz[ FD_BLAKE3_PARA_MAX ] = {0};
304 82223473 : uchar * batch_hash [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
305 82223473 : ulong batch_ctr [ FD_BLAKE3_PARA_MAX ];
306 82223473 : uint batch_flags [ FD_BLAKE3_PARA_MAX ];
307 223495182 : for( ulong j=0UL; j<op_cnt; j++ ) {
308 141271709 : batch_data [ j ] = ops[ j ].msg;
309 141271709 : batch_hash [ j ] = ops[ j ].out;
310 141271709 : batch_data_sz[ j ] = ops[ j ].sz;
311 141271709 : batch_ctr [ j ] = ops[ j ].counter;
312 141271709 : batch_flags [ j ] = ops[ j ].flags;
313 141271709 : }
314 27413539 : #if FD_HAS_AVX512
315 27413539 : fd_blake3_avx512_compress16( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
316 : #elif FD_HAS_AVX
317 54809934 : fd_blake3_avx_compress8 ( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
318 : #else
319 : #error "FIXME missing para support"
320 : #endif
321 82223473 : }
322 :
323 : #endif
324 :
325 : /* Simple API *********************************************************/
326 :
327 : ulong
328 75 : fd_blake3_align( void ) {
329 75 : return FD_BLAKE3_ALIGN;
330 75 : }
331 :
332 : ulong
333 24 : fd_blake3_footprint( void ) {
334 24 : return FD_BLAKE3_FOOTPRINT;
335 24 : }
336 :
337 : void *
338 27 : fd_blake3_new( void * shmem ) {
339 27 : fd_blake3_t * sha = (fd_blake3_t *)shmem;
340 :
341 27 : if( FD_UNLIKELY( !shmem ) ) {
342 3 : FD_LOG_WARNING(( "NULL shmem" ));
343 3 : return NULL;
344 3 : }
345 :
346 24 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_blake3_align() ) ) ) {
347 3 : FD_LOG_WARNING(( "misaligned shmem" ));
348 3 : return NULL;
349 3 : }
350 :
351 21 : ulong footprint = fd_blake3_footprint();
352 :
353 21 : fd_memset( sha, 0, footprint );
354 :
355 21 : FD_COMPILER_MFENCE();
356 21 : FD_VOLATILE( sha->pos.magic ) = FD_BLAKE3_MAGIC;
357 21 : FD_COMPILER_MFENCE();
358 :
359 21 : return (void *)sha;
360 24 : }
361 :
362 : fd_blake3_t *
363 27 : fd_blake3_join( void * shsha ) {
364 :
365 27 : if( FD_UNLIKELY( !shsha ) ) {
366 3 : FD_LOG_WARNING(( "NULL shsha" ));
367 3 : return NULL;
368 3 : }
369 :
370 24 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
371 3 : FD_LOG_WARNING(( "misaligned shsha" ));
372 3 : return NULL;
373 3 : }
374 :
375 21 : fd_blake3_t * sha = (fd_blake3_t *)shsha;
376 :
377 21 : if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
378 0 : FD_LOG_WARNING(( "bad magic" ));
379 0 : return NULL;
380 0 : }
381 :
382 21 : return sha;
383 21 : }
384 :
385 : void *
386 24 : fd_blake3_leave( fd_blake3_t * sha ) {
387 :
388 24 : if( FD_UNLIKELY( !sha ) ) {
389 3 : FD_LOG_WARNING(( "NULL sha" ));
390 3 : return NULL;
391 3 : }
392 :
393 21 : return (void *)sha;
394 24 : }
395 :
396 : void *
397 27 : fd_blake3_delete( void * shsha ) {
398 :
399 27 : if( FD_UNLIKELY( !shsha ) ) {
400 3 : FD_LOG_WARNING(( "NULL shsha" ));
401 3 : return NULL;
402 3 : }
403 :
404 24 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
405 3 : FD_LOG_WARNING(( "misaligned shsha" ));
406 3 : return NULL;
407 3 : }
408 :
409 21 : fd_blake3_t * sha = (fd_blake3_t *)shsha;
410 :
411 21 : if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
412 0 : FD_LOG_WARNING(( "bad magic" ));
413 0 : return NULL;
414 0 : }
415 :
416 21 : FD_COMPILER_MFENCE();
417 21 : FD_VOLATILE( sha->pos.magic ) = 0UL;
418 21 : FD_COMPILER_MFENCE();
419 :
420 21 : return (void *)sha;
421 21 : }
422 :
423 :
424 : fd_blake3_t *
425 60144068 : fd_blake3_init( fd_blake3_t * sha ) {
426 60144068 : FD_BLAKE3_TRACE(( "fd_blake3_init(sha=%p)", (void *)sha ));
427 60144068 : fd_blake3_pos_init( &sha->pos, NULL, 0UL );
428 60144068 : sha->block_sz = 0UL;
429 60144068 : return sha;
430 60144068 : }
431 :
432 : #if FD_BLAKE3_PARA_MAX>1
433 :
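/* fd_blake3_append_blocks consumes buf_cnt complete input buffers via
   the fast path: for each buffer it first flushes any pending full-row
   parent merges and then issues one wide leaf compression over
   FD_BLAKE3_PARA_MAX chunks. */
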
434 : static void
435 : fd_blake3_append_blocks( fd_blake3_pos_t * s,
436 : fd_blake3_buf_t * tbl,
437 : uchar const * data,
438 1957190 : ulong buf_cnt ) {
439 1957190 : s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
440 6193811 : for( ulong i=0UL; i<buf_cnt; i++ ) {
441 4236621 : fd_blake3_op_t op[1];
442 6024808 : do {
443 6024808 : if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, FD_BLAKE3_PARA_MAX ) )
444 0 : return;
445 822624 : #if FD_HAS_AVX512
446 822624 : fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
447 : #elif FD_HAS_AVX
448 5202184 : fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
449 : #else
450 : #error "missing para support"
451 : #endif
452 6024808 : } while( op->flags & FD_BLAKE3_FLAG_PARENT );
453 4236621 : }
454 1957190 : }
455 :
456 : #else
457 :
458 : static void
459 : fd_blake3_append_blocks( fd_blake3_pos_t * s,
460 : fd_blake3_buf_t * tbl,
461 : uchar const * data,
462 : ulong buf_cnt ) {
463 : (void)buf_cnt;
464 : s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
465 : fd_blake3_op_t op[1];
466 : while( buf_cnt ) {
467 : if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) ) {
468 : FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: no more ops to prepare" ));
469 : break;
470 : }
471 : if( op->flags & FD_BLAKE3_FLAG_PARENT ) {
472 : FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing output chaining values (layer %u)", s->layer ));
473 : fd_blake3_ref_compress1( op->out, op->msg, 64UL, op->counter, op->flags, NULL, NULL );
474 : } else {
475 : FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing %lu leaf chunks", FD_BLAKE3_COL_CNT ));
476 : fd_blake3_ref_compress1( op->out, op->msg, FD_BLAKE3_CHUNK_SZ, op->counter, op->flags, NULL, NULL );
477 : buf_cnt--;
478 : }
479 : s->next_tick++;
480 : }
481 : }
482 :
483 : #endif
484 :
485 : fd_blake3_t *
486 : fd_blake3_append( fd_blake3_t * sha,
487 : void const * _data,
488 60227120 : ulong sz ) {
489 :
490 : /* If no data to append, we are done */
491 :
492 60227120 : if( FD_UNLIKELY( !sz ) ) return sha;
493 60187996 : FD_BLAKE3_TRACE(( "fd_blake3_append(sha=%p,data=%p,sz=%lu)", (void *)sha, _data, sz ));
494 :
495 : /* Unpack inputs */
496 :
497 60187996 : fd_blake3_pos_t * s = &sha->pos;
498 60187996 : fd_blake3_buf_t * tbl = &sha->buf;
499 60187996 : uchar * buf = sha->block;
500 60187996 : ulong buf_used = sha->block_sz;
501 :
502 60187996 : uchar const * data = (uchar const *)_data;
503 :
504 : /* Update input_sz */
505 :
506 60187996 : s->input_sz += sz;
507 :
508 : /* Edge case: For the first completed 1024 bytes of input, don't hash
509 : immediately, since it is not yet known whether this chunk will carry
510 : the root flag. */
511 60187996 : if( FD_UNLIKELY( FD_BLAKE3_PARA_MAX==1 && s->input_sz==1024UL ) ) {
512 0 : fd_memcpy( buf + buf_used, data, sz );
513 0 : sha->block_sz = FD_BLAKE3_CHUNK_SZ;
514 0 : return sha;
515 0 : }
516 :
517 : /* Handle buffered bytes from previous appends */
518 :
519 60187996 : if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
520 :
521 : /* If the append isn't large enough to complete the current block,
522 : buffer these bytes too and return */
523 :
524 47942 : ulong buf_rem = FD_BLAKE3_PRIVATE_BUF_MAX - buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
525 47942 : if( FD_UNLIKELY( sz < buf_rem ) ) { /* optimize for large append */
526 8649 : fd_memcpy( buf + buf_used, data, sz );
527 8649 : sha->block_sz = buf_used + sz;
528 8649 : return sha;
529 8649 : }
530 :
531 : /* Otherwise, buffer enough leading bytes of data to complete the
532 : block, update the hash and then continue processing any remaining
533 : bytes of data. */
534 :
535 39293 : fd_memcpy( buf + buf_used, data, buf_rem );
536 39293 : data += buf_rem;
537 39293 : sz -= buf_rem;
538 :
539 39293 : fd_blake3_append_blocks( s, tbl, buf, 1UL );
540 39293 : sha->block_sz = 0UL;
541 39293 : }
542 :
543 : /* Append the bulk of the data */
544 :
545 60179347 : ulong buf_cnt = sz >> FD_BLAKE3_PRIVATE_LG_BUF_MAX;
546 60179347 : if( FD_LIKELY( buf_cnt ) ) fd_blake3_append_blocks( s, tbl, data, buf_cnt ); /* optimized for large append */
547 :
548 : /* Buffer any leftover bytes */
549 :
550 60179347 : buf_used = sz & (FD_BLAKE3_PRIVATE_BUF_MAX-1UL); /* In [0,FD_BLAKE3_PRIVATE_BUF_MAX) */
551 60179347 : if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
552 59876262 : fd_memcpy( buf, data + (buf_cnt << FD_BLAKE3_PRIVATE_LG_BUF_MAX), buf_used );
553 59876262 : sha->block_sz = buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
554 59876262 : }
555 :
556 60179347 : FD_BLAKE3_TRACE(( "fd_blake3_append: done" ));
557 60179347 : return sha;
558 60187996 : }
559 :
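/* fd_blake3_single_hash drives the scheduler to completion (batching
   ops FD_BLAKE3_PARA_MAX at a time when parallel backends are
   available) and returns a pointer to the 32 byte root hash inside the
   chaining value buffer. */
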
560 : static void const *
561 : fd_blake3_single_hash( fd_blake3_pos_t * s,
562 55972404 : fd_blake3_buf_t * tbl ) {
563 55972404 : #if FD_BLAKE3_PARA_MAX>1
564 55972404 : ulong tick = 0UL;
565 128576391 : while( !fd_blake3_is_finished( s, tick ) ) {
566 72603987 : fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
567 72603987 : ulong op_cnt = 0UL;
568 179647468 : while( op_cnt<FD_BLAKE3_PARA_MAX ) {
569 179349375 : fd_blake3_op_t * op = &ops[ op_cnt ];
570 179349375 : if( !fd_blake3_prepare( s, tbl, op, tick ) )
571 72305894 : break;
572 107043481 : op_cnt++;
573 107043481 : }
574 :
575 72603987 : fd_blake3_batch_hash( ops, op_cnt );
576 72603987 : tick++;
577 72603987 : }
578 : #else
579 : while( !fd_blake3_is_finished( s, s->next_tick ) ) {
580 : fd_blake3_op_t op[1] = {0};
581 : if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) )
582 : break;
583 : s->next_tick++;
584 : FD_BLAKE3_TRACE(( "fd_blake3_single_hash: compressing %hu bytes at layer %u, counter %lu, flags 0x%x",
585 : op->sz, s->layer, op->counter, op->flags ));
586 : # if FD_HAS_SSE
587 : fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
588 : # else
589 : fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
590 : # endif
591 : }
592 : #endif
593 55972404 : return tbl->slots[ s->layer ][0];
594 55972404 : }
595 :
596 : void *
597 : fd_blake3_fini( fd_blake3_t * sha,
598 27999213 : void * hash ) {
599 :
600 : /* Unpack inputs */
601 :
602 27999213 : fd_blake3_pos_t * s = &sha->pos;
603 27999213 : fd_blake3_buf_t * tbl = &sha->buf;
604 27999213 : uchar * buf = sha->block;
605 27999213 : ulong buf_used = sha->block_sz;
606 27999213 : FD_BLAKE3_TRACE(( "fd_blake3_fini(sha=%p,sz=%lu)", (void *)sha, s->input_sz ));
607 :
608 : /* TODO HACKY!! */
609 27999213 : s->input = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
610 27999213 : s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
611 :
612 27999213 : void const * hash_ = fd_blake3_single_hash( s, tbl );
613 27999213 : memcpy( hash, hash_, 32UL );
614 27999213 : return hash;
615 27999213 : }
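
/* A minimal usage sketch of the incremental API above (msg0/msg1 and
   their sizes are placeholders; fd_blake3_hash further below is the
   equivalent one-shot API for a single contiguous buffer):

     uchar       out[ 32 ];
     fd_blake3_t b3 [ 1  ];
     fd_blake3_init  ( b3 );
     fd_blake3_append( b3, msg0, msg0_sz );
     fd_blake3_append( b3, msg1, msg1_sz );
     fd_blake3_fini  ( b3, out );

   On return, out holds the 32 byte BLAKE3 hash of msg0 || msg1. */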
616 :
617 : /* fd_blake3_fini_xof_compress performs BLAKE3 compression (input
618 : hashing) for all blocks in the hash tree except for the root block.
619 : Root compression inputs are returned via the function's out pointers:
620 : On return, root_msg[0..64] contains the padded message input for the
621 : root block and root_cv_pre[0..32] contains the output chaining value
622 : of the previous block (or the BLAKE3 IV if the root block is the only
623 : block in the hash operation, i.e. a <=64 byte hash input).
624 : Other values (counter, flags, size) are re-derived by the XOF
625 : implementation using the blake3 state object. */
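
/* Roughly, output block k of the XOF is then obtained by compressing
   root_msg once more with root_cv_pre as the incoming chaining value
   and output block counter k, mirroring the scalar fallback used in
   fd_blake3_fini_2048 below (out/out_sz are placeholders for the
   caller's XOF output buffer; last_block_sz, last_block_flags and ctr0
   are re-derived from the state as done there):

     for( ulong k=0UL; k<out_sz/64UL; k++ )
       fd_blake3_ref_compress1( (uchar *)out + k*64UL, root_msg,
                                last_block_sz, ctr0+k, last_block_flags,
                                NULL, root_cv_pre ); */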
626 :
627 : void
628 : fd_blake3_fini_xof_compress( fd_blake3_t * sha,
629 : uchar * root_msg,
630 32144855 : uchar * root_cv_pre ) {
631 32144855 : fd_blake3_pos_t * s = &sha->pos;
632 32144855 : fd_blake3_buf_t * tbl = &sha->buf;
633 32144855 : uchar * buf = sha->block;
634 32144855 : ulong buf_used = sha->block_sz;
635 :
636 : /* TODO HACKY!! */
637 32144855 : s->input = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
638 32144855 : s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
639 :
640 : /* The root block is contained in a leaf. Process all but the last
641 : block of the chunk. (The last block is the "root" block.) */
642 32144855 : if( s->input_sz<=FD_BLAKE3_CHUNK_SZ ) {
643 28626365 : fd_blake3_op_t op[1];
644 28626365 : if( !fd_blake3_prepare_leaf( s, tbl, op, s->next_tick ) )
645 0 : FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare compression of <=1024 byte message (duplicate call to fini?)" ));
646 28626365 : #if FD_HAS_SSE
647 28626365 : fd_blake3_sse_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
648 : #else
649 : fd_blake3_ref_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
650 : #endif
651 28626365 : return;
652 28626365 : }
653 :
654 : /* The root block is a branch node. Continue compressing until only the
655 : root's two child chaining values remain. */
656 3518490 : ulong tick = sha->pos.next_tick+1;
657 13137976 : for(;;) {
658 13137976 : int l0_complete = fd_blake3_l0_complete( s );
659 13137976 : int ln_complete = s->live_cnt == 2UL;
660 13137976 : if( l0_complete & ln_complete ) break;
661 :
662 9619486 : #if FD_BLAKE3_PARA_MAX>1
663 9619486 : fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
664 9619486 : ulong op_cnt = 0UL;
665 43847714 : while( op_cnt<FD_BLAKE3_PARA_MAX ) {
666 43522048 : fd_blake3_op_t * op = &ops[ op_cnt ];
667 43522048 : if( !fd_blake3_prepare( s, tbl, op, tick ) )
668 9293820 : break;
669 34228228 : op_cnt++;
670 34228228 : }
671 9619486 : if( FD_UNLIKELY( !op_cnt ) ) {
672 0 : FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare branch compression with live_cnt=%lu (duplicate call to fini?)", s->live_cnt ));
673 0 : }
674 :
675 9619486 : fd_blake3_batch_hash( ops, op_cnt );
676 : #else
677 : fd_blake3_op_t op[1] = {0};
678 : if( !fd_blake3_prepare( s, tbl, op, tick ) )
679 : break;
680 : # if FD_HAS_SSE
681 : fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
682 : # else
683 : fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
684 : # endif
685 : #endif
686 9619486 : tick++;
687 9619486 : }
688 3518490 : }
689 :
690 : void *
691 : fd_blake3_fini_2048( fd_blake3_t * sha,
692 32144791 : void * hash ) {
693 32144791 : FD_BLAKE3_TRACE(( "fd_blake3_fini_2048(sha=%p,hash=%p)", (void *)sha, hash ));
694 :
695 : /* Compress input until the last remaining piece of work is the BLAKE3
696 : root block. This root block is put through the compression
697 : function repeatedly to "expand" the hash output (XOF hashing).
698 : Solana uses this to generate a 2048 byte 'LtHash' value.
699 : fd_blake3 does this in SIMD parallel for better performance. */
700 32144791 : uchar root_msg [ 64 ] __attribute__((aligned(64)));
701 32144791 : uchar root_cv_pre[ 32 ] __attribute__((aligned(32)));
702 32144791 : fd_blake3_fini_xof_compress( sha, root_msg, root_cv_pre );
703 :
704 : /* Restore root block details */
705 32144791 : uint last_block_sz = 64u;
706 32144791 : uint last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_PARENT;
707 32144791 : ulong ctr0 = 0UL;
708 32144791 : if( sha->pos.input_sz<=FD_BLAKE3_CHUNK_SZ ) {
709 28626301 : last_block_sz = (uint)sha->pos.input_sz & 63u;
710 28626301 : if( fd_ulong_is_aligned( sha->pos.input_sz, 64 ) ) last_block_sz = 64;
711 28626301 : if( FD_UNLIKELY( sha->pos.input_sz==0UL ) ) last_block_sz = 0u;
712 28626301 : last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_CHUNK_END;
713 28626301 : if( sha->pos.input_sz<=FD_BLAKE3_BLOCK_SZ ) last_block_flags |= FD_BLAKE3_FLAG_CHUNK_START;
714 28626301 : ctr0 = sha->pos.leaf_idx-1UL;
715 28626301 : } else {
716 3518490 : fd_blake3_op_t op[1];
717 3518490 : if( FD_UNLIKELY( !fd_blake3_prepare( &sha->pos, &sha->buf, op, sha->pos.next_tick+1UL ) ) ) {
718 0 : FD_LOG_ERR(( "fd_blake3_fini_2048 invariant violation: failed to prepare branch root compression (duplicate call to fini?)" ));
719 0 : }
720 3518490 : memcpy( root_msg, op->msg, 64UL );
721 3518490 : memcpy( root_cv_pre, FD_BLAKE3_IV, 32UL );
722 3518490 : }
723 32144791 : FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: sz=%lu ctr0=%lu flags=%x",
724 32144791 : sha->pos.input_sz, ctr0, last_block_flags ));
725 :
726 : /* Expand LtHash.
727 : For now, this uses the generic AVX2/AVX512 compress backend.  A more
728 : specialized version could save some of the matrix transpose work in
729 : the future. */
730 137160761 : for( ulong i=0UL; i<32UL; i+=FD_BLAKE3_PARA_MAX ) {
731 23563194 : #if FD_HAS_AVX512
732 23563194 : ulong batch_data [ 16 ] __attribute__((aligned(64)));
733 400574298 : /* */ for( ulong j=0; j<16; j++ ) batch_data [ j ] = (ulong)root_msg;
734 400574298 : uint batch_sz [ 16 ]; for( ulong j=0; j<16; j++ ) batch_sz [ j ] = last_block_sz;
735 400574298 : ulong batch_ctr [ 16 ]; for( ulong j=0; j<16; j++ ) batch_ctr [ j ] = ctr0+i+j;
736 400574298 : uint batch_flags[ 16 ]; for( ulong j=0; j<16; j++ ) batch_flags[ j ] = last_block_flags;
737 400574298 : void * batch_hash [ 16 ]; for( ulong j=0; j<16; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
738 400574298 : void * batch_cv [ 16 ]; for( ulong j=0; j<16; j++ ) batch_cv [ j ] = root_cv_pre;
739 23563194 : fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
740 : #elif FD_HAS_AVX
741 733074984 : ulong batch_data [ 8 ]; for( ulong j=0; j<8; j++ ) batch_data [ j ] = (ulong)root_msg;
742 733074984 : uint batch_sz [ 8 ]; for( ulong j=0; j<8; j++ ) batch_sz [ j ] = last_block_sz;
743 733074984 : ulong batch_ctr [ 8 ]; for( ulong j=0; j<8; j++ ) batch_ctr [ j ] = ctr0+i+j;
744 733074984 : uint batch_flags[ 8 ]; for( ulong j=0; j<8; j++ ) batch_flags[ j ] = last_block_flags;
745 733074984 : void * batch_hash [ 8 ]; for( ulong j=0; j<8; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
746 733074984 : void * batch_cv [ 8 ]; for( ulong j=0; j<8; j++ ) batch_cv [ j ] = root_cv_pre;
747 81452776 : fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
748 : #elif FD_HAS_SSE
749 : fd_blake3_sse_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
750 : #else
751 : fd_blake3_ref_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
752 : #endif
753 105015970 : }
754 :
755 32144791 : FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: done" ));
756 32144791 : return hash;
757 32144791 : }
758 :
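/* fd_blake3_hash is the one-shot variant: it hashes sz bytes at data
   into a 32 byte hash without requiring a persistent fd_blake3_t,
   taking the fast path for sufficiently large inputs. */
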
759 : void *
760 : fd_blake3_hash( void const * data,
761 : ulong sz,
762 27973191 : void * hash ) {
763 :
764 27973191 : fd_blake3_buf_t tbl[1];
765 27973191 : fd_blake3_pos_t s[1];
766 27973191 : fd_blake3_pos_init( s, data, sz );
767 :
768 27973191 : #if FD_BLAKE3_PARA_MAX>1
769 32047445 : for(;;) {
770 32047445 : fd_blake3_op_t op[1];
771 32047445 : if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, 4 ) )
772 27973191 : break;
773 965162 : #if FD_HAS_AVX512
774 965162 : fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
775 : #elif FD_HAS_AVX
776 3109092 : fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
777 : #else
778 : #error "missing para support"
779 : #endif
780 4074254 : }
781 27973191 : #endif
782 :
783 27973191 : void const * hash_ = fd_blake3_single_hash( s, tbl );
784 27973191 : memcpy( hash, hash_, 32UL );
785 27973191 : return hash;
786 27973191 : }
787 :
788 : #if FD_HAS_AVX
789 :
790 : void
791 : fd_blake3_lthash_batch8(
792 : void const * batch_data[8], /* align=32 ele_align=1 */
793 : uint const batch_sz [8], /* align=32 */
794 : void * out_lthash /* align=32 */
795 300006 : ) {
796 300006 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 32 ) ) ) {
797 0 : FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
798 0 : }
799 300006 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 32 ) ) ) {
800 0 : FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
801 0 : }
802 300006 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 32 ) ) ) {
803 0 : FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
804 0 : }
805 :
806 300006 : ulong batch_ctr [ 8 ] = {0};
807 2700054 : uint batch_flags[ 8 ]; for( uint i=0; i<8; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
808 300006 : fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
809 300006 : }
810 :
811 : #endif
812 :
813 : #if FD_HAS_AVX512
814 :
815 : void
816 : fd_blake3_lthash_batch16(
817 : void const * batch_data[16], /* align=32 ele_align=1 */
818 : uint const batch_sz [16], /* align=32 */
819 : void * out_lthash /* align=32 */
820 100001 : ) {
821 100001 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 64 ) ) ) {
822 0 : FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
823 0 : }
824 100001 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 64 ) ) ) {
825 0 : FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
826 0 : }
827 100001 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 64 ) ) ) {
828 0 : FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
829 0 : }
830 :
831 100001 : ulong batch_ctr [ 16 ] = {0};
832 1700017 : uint batch_flags[ 16 ]; for( uint i=0; i<16; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
833 100001 : fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
834 100001 : }
835 :
836 : #endif
|