Line data Source code
1 : #include "fd_blake3.h"
2 : #include "fd_blake3_private.h"
3 :
4 : /* Hash state machine *************************************************/
5 :
6 : static FD_FN_UNUSED fd_blake3_pos_t *
7 : fd_blake3_pos_init( fd_blake3_pos_t * s,
8 : uchar const * data,
9 10973006 : ulong sz ) {
10 10973006 : *s = (fd_blake3_pos_t) {
11 10973006 : .input = data,
12 10973006 : .input_sz = sz,
13 10973006 : .magic = FD_BLAKE3_MAGIC,
14 10973006 : };
15 10973006 : return s;
16 10973006 : }
17 :
18 : /* fd_blake3_l0_complete returns 1 if all leaf nodes have been hashed,
19 : 0 otherwise. */
20 :
21 : FD_FN_PURE static inline int
22 31792715 : fd_blake3_l0_complete( fd_blake3_pos_t const * s ) {
23 31792715 : return ( s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ ) >= fd_ulong_max( s->input_sz, 64 );
24 31792715 : }
25 :
26 : FD_FN_PURE static inline int
27 : fd_blake3_is_finished( fd_blake3_pos_t const * s,
28 14519532 : ulong tick ) {
29 14519532 : int l0_complete = fd_blake3_l0_complete( s );
30 14519532 : int ln_complete = s->live_cnt == 1UL;
31 14519532 : int idle = tick >= s->next_tick;
32 14519532 : return l0_complete & ln_complete & idle;
33 14519532 : }
34 :
35 : static fd_blake3_op_t *
36 : fd_blake3_prepare_leaf( fd_blake3_pos_t * restrict s,
37 : fd_blake3_buf_t * restrict buf,
38 : fd_blake3_op_t * restrict op,
39 12419288 : ulong tick ) {
40 :
41 12419288 : ulong msg_off = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
42 12419288 : ulong msg_sz = fd_ulong_min( s->input_sz - msg_off, 1024UL );
43 12419288 : uchar const * msg = s->input + msg_off;
44 12419288 : uchar * out = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];
45 :
46 12419288 : int flags = fd_int_if( s->input_sz <= FD_BLAKE3_CHUNK_SZ, FD_BLAKE3_FLAG_ROOT, 0 );
47 :
48 12419288 : *op = (fd_blake3_op_t) {
49 12419288 : .msg = msg,
50 12419288 : .out = out,
51 12419288 : .counter = s->leaf_idx,
52 12419288 : .sz = (ushort)msg_sz,
53 12419288 : .flags = (uchar)flags
54 12419288 : };
55 :
56 12419288 : s->head.uc[ 0 ] = (uchar)( s->head.uc[ 0 ]+1 );
57 12419288 : s->leaf_idx++;
58 12419288 : s->live_cnt++;
59 12419288 : s->next_tick = tick+1;
60 :
61 12419288 : return op;
62 :
63 12419288 : }
64 :
65 : static int
66 : fd_blake3_seek_branch( fd_blake3_pos_t * restrict s,
67 : fd_blake3_buf_t * restrict buf,
68 11537475 : ulong tick ) {
69 :
70 11537475 : if( s->live_cnt == 1UL )
71 77571 : return 0;
72 :
73 11459904 : if( !fd_blake3_l0_complete( s ) )
74 1691520 : return ( s->tail.uc[ s->layer - 1 ] + 1 ) <
75 1691520 : ( s->head.uc[ s->layer - 1 ] );
76 :
77 9768384 : # if FD_HAS_AVX
78 :
79 9768384 : wb_t diff = wb_sub( s->head.wb, s->tail.wb );
80 :
81 9768384 : uint mergeable_layers = (uint)_mm256_movemask_epi8( wb_gt( diff, wb_bcast( 1 ) ) );
82 9768384 : int merge_layer = fd_uint_find_lsb_w_default( mergeable_layers, -1 );
83 9768384 : if( merge_layer>=0 ) {
84 8996601 : if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
85 1633421 : return 0; /* still waiting for previous merge */
86 7363180 : s->layer = (uint)merge_layer+1U;
87 7363180 : return 1;
88 8996601 : }
89 :
90 771783 : uint single_layers = (uint)_mm256_movemask_epi8( wb_eq( diff, wb_bcast( 1 ) ) );
91 771783 : uint single_lo = (uint)fd_uint_find_lsb( single_layers );
92 771783 : uint single_hi = (uint)fd_uint_find_lsb( single_layers & ( ~fd_uint_mask_lsb( (int)(single_lo+1U) ) ) );
93 :
94 771783 : wb_t node = wb_ld( buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ] );
95 771783 : wb_st( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ], node );
96 :
97 : # else /* FD_HAS_AVX */
98 :
99 : uchar diff[ 32 ];
100 : for( ulong j=0UL; j<32UL; j++ ) diff[j] = (uchar)( s->head.uc[j] - s->tail.uc[j] );
101 :
102 : int merge_layer = -1;
103 : for( uint j=0U; j<32U; j++ ) {
104 : if( diff[j]>1 ) {
105 : merge_layer = (int)j;
106 : break;
107 : }
108 : }
109 : if( merge_layer>=0 ) {
110 : if( ((uint)merge_layer >= s->layer) & (tick < s->next_tick) )
111 : return 0; /* still waiting for previous merge */
112 : s->layer = (uint)(merge_layer+1);
113 : return 1;
114 : }
115 :
116 : uint j=0U;
117 : uint single_lo = 0UL;
118 : uint single_hi = 0UL;
119 : for( ; j<32U; j++ ) {
120 : if( diff[j] ) {
121 : single_lo = j;
122 : break;
123 : }
124 : }
125 : j++;
126 : for( ; j<32U; j++ ) {
127 : if( diff[j] ) {
128 : single_hi = j;
129 : break;
130 : }
131 : }
132 :
133 : memcpy( buf->slots[ single_hi ][ s->head.uc[ single_hi ] ],
134 : buf->slots[ single_lo ][ s->tail.uc[ single_lo ] ],
135 : 32UL );
136 :
137 : # endif /* FD_HAS_AVX */
138 :
139 771783 : FD_BLAKE3_TRACE(( "fd_blake3_seek_branch: moving up %u/%u to %u/%u",
140 771783 : single_lo, s->tail.uc[ single_lo ],
141 771783 : single_hi, s->head.uc[ single_hi ] ));
142 :
143 771783 : if( ((uint)single_hi >= s->layer) & (tick < s->next_tick) )
144 263007 : return 0; /* still waiting for previous merge */
145 :
146 508776 : s->head.uc[ single_lo ] = (uchar)( s->head.uc[ single_lo ]-1 );
147 508776 : s->head.uc[ single_hi ] = (uchar)( s->head.uc[ single_hi ]+1 );
148 :
149 508776 : s->layer = (uint)single_hi+1U;
150 508776 : return 1;
151 771783 : }
152 :
153 : static fd_blake3_op_t *
154 : fd_blake3_prepare_branch( fd_blake3_pos_t * restrict s,
155 : fd_blake3_buf_t * restrict buf,
156 : fd_blake3_op_t * restrict op,
157 11537475 : ulong tick ) {
158 :
159 11537475 : if( !fd_blake3_seek_branch( s, buf, tick ) )
160 1973999 : return NULL;
161 :
162 9563476 : FD_DCHECK_CRIT( s->layer < FD_BLAKE3_ROW_CNT, "invariant violation" );
163 :
164 9563476 : uchar const * msg = buf->slots[ s->layer-1U ][ s->tail.uc[ s->layer-1U ] ];
165 9563476 : uchar * out = buf->slots[ s->layer ][ s->head.uc[ s->layer ] ];
166 :
167 9563476 : s->head.uc[ s->layer ] = (uchar)( s->head.uc[ s->layer ]+1 );
168 9563476 : s->tail.uc[ s->layer-1 ] = (uchar)( s->tail.uc[ s->layer-1 ]+2 );
169 9563476 : s->live_cnt--;
170 9563476 : s->next_tick = tick+1;
171 :
172 9563476 : uint flags = FD_BLAKE3_FLAG_PARENT |
173 9563476 : fd_uint_if( s->live_cnt==1UL, FD_BLAKE3_FLAG_ROOT, 0u );
174 :
175 9563476 : *op = (fd_blake3_op_t) {
176 9563476 : .msg = msg,
177 9563476 : .out = out,
178 9563476 : .counter = 0UL,
179 9563476 : .sz = 64U,
180 9563476 : .flags = (uchar)flags
181 9563476 : };
182 9563476 : return op;
183 :
184 11537475 : }
185 :
186 : static void
187 2738049 : fd_blake3_advance( fd_blake3_pos_t * restrict s ) {
188 :
189 2738049 : # if FD_HAS_AVX
190 :
191 2738049 : wb_t mask = wb_eq( s->tail.wb, s->head.wb );
192 2738049 : s->tail.wb = wb_andnot( mask, s->tail.wb );
193 2738049 : s->head.wb = wb_andnot( mask, s->head.wb );
194 :
195 : # else /* FD_HAS_AVX */
196 :
197 : for( ulong j=0UL; j<32UL; j++ ) {
198 : if( s->tail.uc[j] == s->head.uc[j] ) {
199 : s->tail.uc[j] = 0;
200 : s->head.uc[j] = 0;
201 : }
202 : }
203 :
204 : # endif /* FD_HAS_AVX */
205 :
206 2738049 : if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT ) {
207 93938 : s->layer++;
208 93938 : }
209 2644111 : else if( ( s->layer > 0UL ) &&
210 2644111 : ( s->tail.uc[ s->layer-1 ] < s->head.uc[ s->layer-1 ] ) ) {
211 : /* pass */
212 785099 : }
213 1859012 : else if( fd_blake3_l0_complete( s ) ) {
214 1537307 : s->layer++;
215 1537307 : }
216 321705 : else if( s->layer > 0UL ) {
217 116270 : s->layer = 0UL;
218 116270 : }
219 :
220 2738049 : }
221 :
222 : static fd_blake3_op_t *
223 : fd_blake3_prepare( fd_blake3_pos_t * restrict s,
224 : fd_blake3_buf_t * restrict buf,
225 : fd_blake3_op_t * restrict op,
226 13597518 : ulong tick ) {
227 :
228 13597518 : FD_DCHECK_CRIT( s->layer < FD_BLAKE3_ROW_CNT, "invariant violation" );
229 :
230 13597518 : if( fd_blake3_is_finished( s, tick ) )
231 0 : return NULL;
232 :
233 13597518 : if( tick >= s->next_tick )
234 2738049 : fd_blake3_advance( s );
235 :
236 13597518 : if( s->layer != 0 )
237 11537475 : return fd_blake3_prepare_branch( s, buf, op, tick );
238 :
239 2060043 : if( ( s->head.uc[0] >= FD_BLAKE3_COL_CNT ) |
240 2060043 : ( fd_blake3_l0_complete( s ) ) ) {
241 291927 : return NULL;
242 291927 : }
243 :
244 1768116 : return fd_blake3_prepare_leaf( s, buf, op, tick );
245 :
246 2060043 : }
247 :
248 : #if FD_BLAKE3_PARA_MAX>1
249 :
250 : /* fd_blake3_prepare_fast does streamlined hashing of full chunks or
251 : full branches. */
252 :
253 : static fd_blake3_op_t *
254 : fd_blake3_prepare_fast( fd_blake3_pos_t * restrict s,
255 : fd_blake3_buf_t * restrict buf,
256 : fd_blake3_op_t * restrict op,
257 : ulong n,
258 8436075 : ulong min ) {
259 :
260 8436075 : if( s->layer && s->head.uc[ s->layer-1 ]==FD_BLAKE3_COL_CNT ) {
261 3803638 : op->msg = buf->rows[ s->layer-1 ];
262 3803638 : op->out = buf->rows[ s->layer ] + (s->head.uc[ s->layer ]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
263 3803638 : op->counter = 0UL;
264 3803638 : op->flags = FD_BLAKE3_FLAG_PARENT;
265 :
266 : /* Assume that branch layer is fully hashed (up to col cnt) */
267 3803638 : s->head.uc[ s->layer-1 ] = 0;
268 3803638 : s->head.uc[ s->layer ] = (uchar)( (ulong)s->head.uc[ s->layer ]+n );
269 3803638 : s->live_cnt -= n;
270 3803638 : s->layer = fd_uint_if( s->head.uc[ s->layer ]==FD_BLAKE3_COL_CNT,
271 3803638 : s->layer+1U, 0U );
272 :
273 3803638 : return op;
274 3803638 : }
275 :
276 4632437 : ulong pos = s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ;
277 4632437 : ulong avail = fd_ulong_align_dn( s->input_sz - pos, FD_BLAKE3_CHUNK_SZ ) >> FD_BLAKE3_CHUNK_LG_SZ;
278 4632437 : n = fd_ulong_min( n, avail );
279 :
280 : /* This constants controls the threshold when to use the (slow)
281 : scheduler instead of fast single-message hashing. Carefully tuned
282 : for best overall performance. */
283 4632437 : if( n<min ) return NULL;
284 :
285 4606460 : op->msg = s->input + (s->leaf_idx<<FD_BLAKE3_CHUNK_LG_SZ);
286 4606460 : op->out = buf->rows[0] + (s->head.uc[0]<<FD_BLAKE3_OUTCHAIN_LG_SZ);
287 4606460 : op->counter = s->leaf_idx;
288 4606460 : op->flags = 0;
289 :
290 4606460 : s->head.uc[0] = (uchar)( (ulong)s->head.uc[0]+n );
291 4606460 : s->leaf_idx += n;
292 4606460 : s->live_cnt += n;
293 4606460 : s->layer = fd_uint_if( s->head.uc[0]==FD_BLAKE3_COL_CNT, 1U, 0U );
294 :
295 4606460 : return op;
296 4632437 : }
297 :
298 : static void
299 : fd_blake3_batch_hash( fd_blake3_op_t const * ops,
300 2494404 : ulong op_cnt ) {
301 2494404 : uchar const * batch_data [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
302 2494404 : uint batch_data_sz[ FD_BLAKE3_PARA_MAX ] = {0};
303 2494404 : uchar * batch_hash [ FD_BLAKE3_PARA_MAX ] __attribute__((aligned(64)));
304 2494404 : ulong batch_ctr [ FD_BLAKE3_PARA_MAX ];
305 2494404 : uint batch_flags [ FD_BLAKE3_PARA_MAX ];
306 13582351 : for( ulong j=0UL; j<op_cnt; j++ ) {
307 11087947 : batch_data [ j ] = ops[ j ].msg;
308 11087947 : batch_hash [ j ] = ops[ j ].out;
309 11087947 : batch_data_sz[ j ] = ops[ j ].sz;
310 11087947 : batch_ctr [ j ] = ops[ j ].counter;
311 11087947 : batch_flags [ j ] = ops[ j ].flags;
312 11087947 : }
313 827402 : #if FD_HAS_AVX512
314 827402 : fd_blake3_avx512_compress16( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
315 : #elif FD_HAS_AVX
316 1667002 : fd_blake3_avx_compress8 ( op_cnt, batch_data, batch_data_sz, batch_ctr, batch_flags, fd_type_pun( batch_hash ), NULL, 32U, NULL );
317 : #else
318 : #error "FIXME missing para support"
319 : #endif
320 2494404 : }
321 :
322 : #endif
323 :
324 : /* Simple API *********************************************************/
325 :
326 : ulong
327 66 : fd_blake3_align( void ) {
328 66 : return FD_BLAKE3_ALIGN;
329 66 : }
330 :
331 : ulong
332 21 : fd_blake3_footprint( void ) {
333 21 : return FD_BLAKE3_FOOTPRINT;
334 21 : }
335 :
336 : void *
337 24 : fd_blake3_new( void * shmem ) {
338 24 : fd_blake3_t * sha = (fd_blake3_t *)shmem;
339 :
340 24 : if( FD_UNLIKELY( !shmem ) ) {
341 3 : FD_LOG_WARNING(( "NULL shmem" ));
342 3 : return NULL;
343 3 : }
344 :
345 21 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, fd_blake3_align() ) ) ) {
346 3 : FD_LOG_WARNING(( "misaligned shmem" ));
347 3 : return NULL;
348 3 : }
349 :
350 18 : ulong footprint = fd_blake3_footprint();
351 :
352 18 : fd_memset( sha, 0, footprint );
353 :
354 18 : FD_COMPILER_MFENCE();
355 18 : FD_VOLATILE( sha->pos.magic ) = FD_BLAKE3_MAGIC;
356 18 : FD_COMPILER_MFENCE();
357 :
358 18 : return (void *)sha;
359 21 : }
360 :
361 : fd_blake3_t *
362 24 : fd_blake3_join( void * shsha ) {
363 :
364 24 : if( FD_UNLIKELY( !shsha ) ) {
365 3 : FD_LOG_WARNING(( "NULL shsha" ));
366 3 : return NULL;
367 3 : }
368 :
369 21 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
370 3 : FD_LOG_WARNING(( "misaligned shsha" ));
371 3 : return NULL;
372 3 : }
373 :
374 18 : fd_blake3_t * sha = (fd_blake3_t *)shsha;
375 :
376 18 : if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
377 0 : FD_LOG_WARNING(( "bad magic" ));
378 0 : return NULL;
379 0 : }
380 :
381 18 : return sha;
382 18 : }
383 :
384 : void *
385 21 : fd_blake3_leave( fd_blake3_t * sha ) {
386 :
387 21 : if( FD_UNLIKELY( !sha ) ) {
388 3 : FD_LOG_WARNING(( "NULL sha" ));
389 3 : return NULL;
390 3 : }
391 :
392 18 : return (void *)sha;
393 21 : }
394 :
395 : void *
396 24 : fd_blake3_delete( void * shsha ) {
397 :
398 24 : if( FD_UNLIKELY( !shsha ) ) {
399 3 : FD_LOG_WARNING(( "NULL shsha" ));
400 3 : return NULL;
401 3 : }
402 :
403 21 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shsha, fd_blake3_align() ) ) ) {
404 3 : FD_LOG_WARNING(( "misaligned shsha" ));
405 3 : return NULL;
406 3 : }
407 :
408 18 : fd_blake3_t * sha = (fd_blake3_t *)shsha;
409 :
410 18 : if( FD_UNLIKELY( sha->pos.magic!=FD_BLAKE3_MAGIC ) ) {
411 0 : FD_LOG_WARNING(( "bad magic" ));
412 0 : return NULL;
413 0 : }
414 :
415 18 : FD_COMPILER_MFENCE();
416 18 : FD_VOLATILE( sha->pos.magic ) = 0UL;
417 18 : FD_COMPILER_MFENCE();
418 :
419 18 : return (void *)sha;
420 18 : }
421 :
422 :
423 : fd_blake3_t *
424 10947029 : fd_blake3_init( fd_blake3_t * sha ) {
425 10947029 : FD_BLAKE3_TRACE(( "fd_blake3_init(sha=%p)", (void *)sha ));
426 10947029 : fd_blake3_pos_init( &sha->pos, NULL, 0UL );
427 10947029 : sha->block_sz = 0UL;
428 10947029 : return sha;
429 10947029 : }
430 :
431 : #if FD_BLAKE3_PARA_MAX>1
432 :
433 : static void
434 : fd_blake3_append_blocks( fd_blake3_pos_t * s,
435 : fd_blake3_buf_t * tbl,
436 : uchar const * data,
437 350331 : ulong buf_cnt ) {
438 350331 : s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
439 4266433 : for( ulong i=0UL; i<buf_cnt; i++ ) {
440 3916102 : fd_blake3_op_t op[1];
441 7117589 : do {
442 7117589 : if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, FD_BLAKE3_PARA_MAX ) )
443 0 : return;
444 1336483 : #if FD_HAS_AVX512
445 1336483 : fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
446 : #elif FD_HAS_AVX
447 5781106 : fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
448 : #else
449 : #error "missing para support"
450 : #endif
451 7117589 : } while( op->flags & FD_BLAKE3_FLAG_PARENT );
452 3916102 : }
453 350331 : }
454 :
455 : #else
456 :
457 : static void
458 : fd_blake3_append_blocks( fd_blake3_pos_t * s,
459 : fd_blake3_buf_t * tbl,
460 : uchar const * data,
461 : ulong buf_cnt ) {
462 : (void)buf_cnt;
463 : s->input = data - (s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ); /* TODO HACKY!! */
464 : fd_blake3_op_t op[1];
465 : while( buf_cnt ) {
466 : if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) ) {
467 : FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: no more ops to prepare" ));
468 : break;
469 : }
470 : if( op->flags & FD_BLAKE3_FLAG_PARENT ) {
471 : FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing output chaining values (layer %u)", s->layer ));
472 : fd_blake3_ref_compress1( op->out, op->msg, 64UL, op->counter, op->flags, NULL, NULL );
473 : } else {
474 : FD_BLAKE3_TRACE(( "fd_blake3_append_blocks: compressing %lu leaf chunks", FD_BLAKE3_COL_CNT ));
475 : fd_blake3_ref_compress1( op->out, op->msg, FD_BLAKE3_CHUNK_SZ, op->counter, op->flags, NULL, NULL );
476 : buf_cnt--;
477 : }
478 : s->next_tick++;
479 : }
480 : }
481 :
482 : #endif
483 :
484 : fd_blake3_t *
485 : fd_blake3_append( fd_blake3_t * sha,
486 : void const * _data,
487 11130437 : ulong sz ) {
488 :
489 : /* If no data to append, we are done */
490 :
491 11130437 : if( FD_UNLIKELY( !sz ) ) return sha;
492 11091205 : FD_BLAKE3_TRACE(( "fd_blake3_append(sha=%p,data=%p,sz=%lu)", (void *)sha, _data, sz ));
493 :
494 : /* Unpack inputs */
495 :
496 11091205 : fd_blake3_pos_t * s = &sha->pos;
497 11091205 : fd_blake3_buf_t * tbl = &sha->buf;
498 11091205 : uchar * buf = sha->block;
499 11091205 : ulong buf_used = sha->block_sz;
500 :
501 11091205 : uchar const * data = (uchar const *)_data;
502 :
503 : /* Update input_sz */
504 :
505 11091205 : s->input_sz += sz;
506 :
507 : /* Edge case: For the first completed 1024 bytes of input, don't
508 : immediately hash, since it is not clear whether this chunk has
509 : the root flag set. */
510 11091205 : if( FD_UNLIKELY( FD_BLAKE3_PARA_MAX==1 && s->input_sz==1024UL ) ) {
511 0 : fd_memcpy( buf + buf_used, data, sz );
512 0 : sha->block_sz = FD_BLAKE3_CHUNK_SZ;
513 0 : return sha;
514 0 : }
515 :
516 : /* Handle buffered bytes from previous appends */
517 :
518 11091205 : if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
519 :
520 : /* If the append isn't large enough to complete the current block,
521 : buffer these bytes too and return */
522 :
523 148190 : ulong buf_rem = FD_BLAKE3_PRIVATE_BUF_MAX - buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
524 148190 : if( FD_UNLIKELY( sz < buf_rem ) ) { /* optimize for large append */
525 88211 : fd_memcpy( buf + buf_used, data, sz );
526 88211 : sha->block_sz = buf_used + sz;
527 88211 : return sha;
528 88211 : }
529 :
530 : /* Otherwise, buffer enough leading bytes of data to complete the
531 : block, update the hash and then continue processing any remaining
532 : bytes of data. */
533 :
534 59979 : fd_memcpy( buf + buf_used, data, buf_rem );
535 59979 : data += buf_rem;
536 59979 : sz -= buf_rem;
537 :
538 59979 : fd_blake3_append_blocks( s, tbl, buf, 1UL );
539 59979 : sha->block_sz = 0UL;
540 59979 : }
541 :
542 : /* Append the bulk of the data */
543 :
544 11002994 : ulong buf_cnt = sz >> FD_BLAKE3_PRIVATE_LG_BUF_MAX;
545 11002994 : if( FD_LIKELY( buf_cnt ) ) fd_blake3_append_blocks( s, tbl, data, buf_cnt ); /* optimized for large append */
546 :
547 : /* Buffer any leftover bytes */
548 :
549 11002994 : buf_used = sz & (FD_BLAKE3_PRIVATE_BUF_MAX-1UL); /* In [0,FD_BLAKE3_PRIVATE_BUF_MAX) */
550 11002994 : if( FD_UNLIKELY( buf_used ) ) { /* optimized for well aligned use of append */
551 11002920 : fd_memcpy( buf, data + (buf_cnt << FD_BLAKE3_PRIVATE_LG_BUF_MAX), buf_used );
552 11002920 : sha->block_sz = buf_used; /* In (0,FD_BLAKE3_PRIVATE_BUF_MAX) */
553 11002920 : }
554 :
555 11002994 : FD_BLAKE3_TRACE(( "fd_blake3_append: done" ));
556 11002994 : return sha;
557 11091205 : }
558 :
559 : static void const *
560 : fd_blake3_single_hash( fd_blake3_pos_t * s,
561 78189 : fd_blake3_buf_t * tbl ) {
562 78189 : #if FD_BLAKE3_PARA_MAX>1
563 78189 : ulong tick = 0UL;
564 922014 : while( !fd_blake3_is_finished( s, tick ) ) {
565 843825 : fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
566 843825 : ulong op_cnt = 0UL;
567 4304248 : while( op_cnt<FD_BLAKE3_PARA_MAX ) {
568 4246194 : fd_blake3_op_t * op = &ops[ op_cnt ];
569 4246194 : if( !fd_blake3_prepare( s, tbl, op, tick ) )
570 785771 : break;
571 3460423 : op_cnt++;
572 3460423 : }
573 :
574 843825 : fd_blake3_batch_hash( ops, op_cnt );
575 843825 : tick++;
576 843825 : }
577 : #else
578 : while( !fd_blake3_is_finished( s, s->next_tick ) ) {
579 : fd_blake3_op_t op[1] = {0};
580 : if( !fd_blake3_prepare( s, tbl, op, s->next_tick ) )
581 : break;
582 : s->next_tick++;
583 : FD_BLAKE3_TRACE(( "fd_blake3_single_hash: compressing %hu bytes at layer %u, counter %lu, flags 0x%x",
584 : op->sz, s->layer, op->counter, op->flags ));
585 : # if FD_HAS_SSE
586 : fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
587 : # else
588 : fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
589 : # endif
590 : }
591 : #endif
592 78189 : return tbl->slots[ s->layer ][0];
593 78189 : }
594 :
595 : void *
596 : fd_blake3_fini( fd_blake3_t * sha,
597 52212 : void * hash ) {
598 :
599 : /* Unpack inputs */
600 :
601 52212 : fd_blake3_pos_t * s = &sha->pos;
602 52212 : fd_blake3_buf_t * tbl = &sha->buf;
603 52212 : uchar * buf = sha->block;
604 52212 : ulong buf_used = sha->block_sz;
605 52212 : FD_BLAKE3_TRACE(( "fd_blake3_fini(sha=%p,sz=%lu)", (void *)sha, s->input_sz ));
606 :
607 : /* TODO HACKY!! */
608 52212 : s->input = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
609 52212 : s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
610 :
611 52212 : void const * hash_ = fd_blake3_single_hash( s, tbl );
612 52212 : memcpy( hash, hash_, 32UL );
613 52212 : return hash;
614 52212 : }
615 :
616 : /* fd_blake3_fini_xof_compress performs BLAKE3 compression (input
617 : hashing) for all blocks in the hash tree except for the root block.
618 : Root compression inputs are returned via the function's out pointers:
619 : On return, root_msg[0..64] contains the padded message input for the
620 : root block, root_cv_pre[0..64] contains the output chaining value of
621 : the previous block (or the BLAKE3 IV if root block is the only block
622 : in the hash operation, i.e. <=64 byte hash input).
623 : Other values (counter, flags, size) are re-derived by the XOF
624 : implementation using the blake3 state object. */
625 :
626 : void
627 : fd_blake3_fini_xof_compress( fd_blake3_t * sha,
628 : uchar * root_msg,
629 10894817 : uchar * root_cv_pre ) {
630 10894817 : fd_blake3_pos_t * s = &sha->pos;
631 10894817 : fd_blake3_buf_t * tbl = &sha->buf;
632 10894817 : uchar * buf = sha->block;
633 10894817 : ulong buf_used = sha->block_sz;
634 :
635 : /* TODO HACKY!! */
636 10894817 : s->input = buf - ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ );
637 10894817 : s->input_sz = ( s->leaf_idx << FD_BLAKE3_CHUNK_LG_SZ ) + buf_used;
638 :
639 : /* The root block is contained in a leaf. Process all but the last
640 : blocks of the chunk. (The last block is the "root" block) */
641 10894817 : if( s->input_sz<=FD_BLAKE3_CHUNK_SZ ) {
642 10651172 : fd_blake3_op_t op[1];
643 10651172 : if( !fd_blake3_prepare_leaf( s, tbl, op, s->next_tick ) )
644 0 : FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare compression of <=1024 byte message (duplicate call to fini?)" ));
645 10651172 : #if FD_HAS_SSE
646 10651172 : fd_blake3_sse_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
647 : #else
648 : fd_blake3_ref_compress1( root_msg, op->msg, op->sz, op->counter, op->flags, root_cv_pre, NULL );
649 : #endif
650 10651172 : return;
651 10651172 : }
652 :
653 : /* The root block is a branch node. Continue working until there are
654 : only two blocks remaining. */
655 243645 : ulong tick = sha->pos.next_tick+1;
656 1894224 : for(;;) {
657 1894224 : int l0_complete = fd_blake3_l0_complete( s );
658 1894224 : int ln_complete = s->live_cnt == 2UL;
659 1894224 : if( l0_complete & ln_complete ) break;
660 :
661 1650579 : #if FD_BLAKE3_PARA_MAX>1
662 1650579 : fd_blake3_op_t ops[ FD_BLAKE3_PARA_MAX ] = {0};
663 1650579 : ulong op_cnt = 0UL;
664 9278103 : while( op_cnt<FD_BLAKE3_PARA_MAX ) {
665 9107679 : fd_blake3_op_t * op = &ops[ op_cnt ];
666 9107679 : if( !fd_blake3_prepare( s, tbl, op, tick ) )
667 1480155 : break;
668 7627524 : op_cnt++;
669 7627524 : }
670 1650579 : if( FD_UNLIKELY( !op_cnt ) ) {
671 0 : FD_LOG_ERR(( "fd_blake3_fini_xof_compress invariant violation: failed to prepare branch compression with live_cnt=%lu (duplicate call to fini?)", s->live_cnt ));
672 0 : }
673 :
674 1650579 : fd_blake3_batch_hash( ops, op_cnt );
675 : #else
676 : fd_blake3_op_t op[1] = {0};
677 : if( !fd_blake3_prepare( s, tbl, op, tick ) )
678 : break;
679 : # if FD_HAS_SSE
680 : fd_blake3_sse_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
681 : # else
682 : fd_blake3_ref_compress1( op->out, op->msg, op->sz, op->counter, op->flags, NULL, NULL );
683 : # endif
684 : #endif
685 1650579 : tick++;
686 1650579 : }
687 243645 : }
688 :
689 : void *
690 : fd_blake3_fini_2048( fd_blake3_t * sha,
691 10894753 : void * hash ) {
692 10894753 : FD_BLAKE3_TRACE(( "fd_blake3_fini_2048(sha=%p,hash=%p)", (void *)sha, hash ));
693 :
694 : /* Compress input until the last remaining piece of work is the BLAKE3
695 : root block. This root block is put through the compression
696 : function repeatedly to "expand" the hash output (XOF hashing).
697 : Solana uses this to generate a 2048 byte 'LtHash' value.
698 : fd_blake3 does this SIMD-parallel for better performance. */
699 10894753 : uchar root_msg [ 64 ] __attribute__((aligned(64)));
700 10894753 : uchar root_cv_pre[ 32 ] __attribute__((aligned(32)));
701 10894753 : fd_blake3_fini_xof_compress( sha, root_msg, root_cv_pre );
702 :
703 : /* Restore root block details */
704 10894753 : uint last_block_sz = 64u;
705 10894753 : uint last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_PARENT;
706 10894753 : ulong ctr0 = 0UL;
707 10894753 : if( sha->pos.input_sz<=FD_BLAKE3_CHUNK_SZ ) {
708 10651108 : last_block_sz = (uint)sha->pos.input_sz & 63u;
709 10651108 : if( fd_ulong_is_aligned( sha->pos.input_sz, 64 ) ) last_block_sz = 64;
710 10651108 : if( FD_UNLIKELY( sha->pos.input_sz==0UL ) ) last_block_sz = 0u;
711 10651108 : last_block_flags = FD_BLAKE3_FLAG_ROOT | FD_BLAKE3_FLAG_CHUNK_END;
712 10651108 : if( sha->pos.input_sz<=FD_BLAKE3_BLOCK_SZ ) last_block_flags |= FD_BLAKE3_FLAG_CHUNK_START;
713 10651108 : ctr0 = sha->pos.leaf_idx-1UL;
714 10651108 : } else {
715 243645 : fd_blake3_op_t op[1];
716 243645 : if( FD_UNLIKELY( !fd_blake3_prepare( &sha->pos, &sha->buf, op, sha->pos.next_tick+1UL ) ) ) {
717 0 : FD_LOG_ERR(( "fd_blake3_fini_2048 invariant violation: failed to prepare branch root compression (duplicate call to fini?)" ));
718 0 : }
719 243645 : memcpy( root_msg, op->msg, 64UL );
720 243645 : memcpy( root_cv_pre, FD_BLAKE3_IV, 32UL );
721 243645 : }
722 10894753 : FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: sz=%lu ctr0=%lu flags=%x",
723 10894753 : sha->pos.input_sz, ctr0, last_block_flags ));
724 :
725 : /* Expand LtHash
726 : For now, this uses the generic AVX2/AVX512 compress backend.
727 : Could write a more optimized version in the future saving some of
728 : the matrix transpose work. */
729 45069519 : for( ulong i=0UL; i<32UL; i+=FD_BLAKE3_PARA_MAX ) {
730 9404246 : #if FD_HAS_AVX512
731 9404246 : ulong batch_data [ 16 ] __attribute__((aligned(64)));
732 159872182 : /* */ for( ulong j=0; j<16; j++ ) batch_data [ j ] = (ulong)root_msg;
733 159872182 : uint batch_sz [ 16 ]; for( ulong j=0; j<16; j++ ) batch_sz [ j ] = last_block_sz;
734 159872182 : ulong batch_ctr [ 16 ]; for( ulong j=0; j<16; j++ ) batch_ctr [ j ] = ctr0+i+j;
735 159872182 : uint batch_flags[ 16 ]; for( ulong j=0; j<16; j++ ) batch_flags[ j ] = last_block_flags;
736 159872182 : void * batch_hash [ 16 ]; for( ulong j=0; j<16; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
737 159872182 : void * batch_cv [ 16 ]; for( ulong j=0; j<16; j++ ) batch_cv [ j ] = root_cv_pre;
738 9404246 : fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
739 : #elif FD_HAS_AVX
740 222934680 : ulong batch_data [ 8 ]; for( ulong j=0; j<8; j++ ) batch_data [ j ] = (ulong)root_msg;
741 222934680 : uint batch_sz [ 8 ]; for( ulong j=0; j<8; j++ ) batch_sz [ j ] = last_block_sz;
742 222934680 : ulong batch_ctr [ 8 ]; for( ulong j=0; j<8; j++ ) batch_ctr [ j ] = ctr0+i+j;
743 222934680 : uint batch_flags[ 8 ]; for( ulong j=0; j<8; j++ ) batch_flags[ j ] = last_block_flags;
744 222934680 : void * batch_hash [ 8 ]; for( ulong j=0; j<8; j++ ) batch_hash [ j ] = (uchar *)hash + (i+j)*64;
745 222934680 : void * batch_cv [ 8 ]; for( ulong j=0; j<8; j++ ) batch_cv [ j ] = root_cv_pre;
746 24770520 : fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, batch_hash, NULL, 64U, batch_cv );
747 : #elif FD_HAS_SSE
748 : fd_blake3_sse_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
749 : #else
750 : fd_blake3_ref_compress1( (uchar *)hash+i*64, root_msg, last_block_sz, ctr0+i, last_block_flags, NULL, root_cv_pre );
751 : #endif
752 34174766 : }
753 :
754 10894753 : FD_BLAKE3_TRACE(( "fd_blake3_fini_2048: done" ));
755 10894753 : return hash;
756 10894753 : }
757 :
758 : void *
759 : fd_blake3_hash( void const * data,
760 : ulong sz,
761 25977 : void * hash ) {
762 :
763 25977 : fd_blake3_buf_t tbl[1];
764 25977 : fd_blake3_pos_t s[1];
765 25977 : fd_blake3_pos_init( s, data, sz );
766 :
767 25977 : #if FD_BLAKE3_PARA_MAX>1
768 1318486 : for(;;) {
769 1318486 : fd_blake3_op_t op[1];
770 1318486 : if( !fd_blake3_prepare_fast( s, tbl, op, FD_BLAKE3_PARA_MAX, 4 ) )
771 25977 : break;
772 251627 : #if FD_HAS_AVX512
773 251627 : fd_blake3_avx512_compress16_fast( op->msg, op->out, op->counter, op->flags );
774 : #elif FD_HAS_AVX
775 1040882 : fd_blake3_avx_compress8_fast( op->msg, op->out, op->counter, op->flags );
776 : #else
777 : #error "missing para support"
778 : #endif
779 1292509 : }
780 25977 : #endif
781 :
782 25977 : void const * hash_ = fd_blake3_single_hash( s, tbl );
783 25977 : memcpy( hash, hash_, 32UL );
784 25977 : return hash;
785 25977 : }
786 :
787 : #if FD_HAS_AVX
788 :
789 : void
790 : fd_blake3_lthash_batch8(
791 : void const * batch_data[8], /* align=32 ele_align=1 */
792 : uint const batch_sz [8], /* align=32 */
793 : void * out_lthash /* align=32 */
794 1378478 : ) {
795 1378478 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 32 ) ) ) {
796 0 : FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
797 0 : }
798 1378478 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 32 ) ) ) {
799 0 : FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
800 0 : }
801 1378478 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 32 ) ) ) {
802 0 : FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
803 0 : }
804 :
805 1378478 : ulong batch_ctr [ 8 ] = {0};
806 12406302 : uint batch_flags[ 8 ]; for( uint i=0; i<8; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
807 1378478 : fd_blake3_avx_compress8( 8UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
808 1378478 : }
809 :
810 : #endif
811 :
812 : #if FD_HAS_AVX512
813 :
814 : void
815 : fd_blake3_lthash_batch16(
816 : void const * batch_data[16], /* align=32 ele_align=1 */
817 : uint const batch_sz [16], /* align=32 */
818 : void * out_lthash /* align=32 */
819 369256 : ) {
820 369256 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_data, 64 ) ) ) {
821 0 : FD_LOG_ERR(( "misaligned batch_data: %p", (void *)batch_data ));
822 0 : }
823 369256 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)batch_sz, 64 ) ) ) {
824 0 : FD_LOG_ERR(( "misaligned batch_sz: %p", (void *)batch_sz ));
825 0 : }
826 369256 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)out_lthash, 64 ) ) ) {
827 0 : FD_LOG_ERR(( "misaligned out_lthash: %p", (void *)out_lthash ));
828 0 : }
829 :
830 369256 : ulong batch_ctr [ 16 ] = {0};
831 6277352 : uint batch_flags[ 16 ]; for( uint i=0; i<16; i++ ) batch_flags[ i ] = FD_BLAKE3_FLAG_ROOT;
832 369256 : fd_blake3_avx512_compress16( 16UL, batch_data, batch_sz, batch_ctr, batch_flags, NULL, out_lthash, 32U, NULL );
833 369256 : }
834 :
835 : #endif
|