Line data Source code
1 : #ifndef HEADER_fd_src_ballet_blake3_fd_blake3_h
2 : #define HEADER_fd_src_ballet_blake3_fd_blake3_h
3 :
4 : #include "../fd_ballet_base.h"
5 : #if FD_HAS_AVX
6 : #include "../../util/simd/fd_avx.h"
7 : #endif
8 :
9 : /* fd_blake3 provides APIs for BLAKE3 hashing of messages.
10 :
11 : The BLAKE3 specification is available here:
12 : https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf
13 :
14 : ### High-level overview
15 :
16 : fd_blake3 provides the "hash" mode of BLAKE3 with variable size
17 : output. Keyed hashing and key derivation are not supported. For
18 : hashes with more than 1024 bytes of input data, uses SIMD parallelism
19 : depending on hardware capabilities. For smaller message sizes, use
20 : the batch API to process multiple independent inputs in parallel.
21 :
22 : ### Usage (simple)
23 :
24 : fd_blake3_t hasher[1];
25 : fd_blake3_init( hasher );
26 : fd_blake3_append( hasher, data, sz );
27 : uchar hash[ 32 ];
28 : fd_blake3_fini( hasher, hash );
29 :
30 : ### Usage (batched)
31 :
32 : ... TODO ...
33 :
34 : ### Hash Construction
35 :
36 : The "core" of BLAKE3 is an add-rotate-xor compression function with
37 : a 512-bit state size. This state is created from the following
38 : 896-bit input:
39 :
40 : - 256-bit chaining value (optionally used to create a hash chain)
41 : - 512-bit input data
42 : - 64-bit counter
43 : - 32-bit input data size
44 : - 32-bit flags
45 :
46 : The BLAKE3 hash is constructed purely by repeated invocation of the
47 : compression function while mixing in input data and metadata.
48 :
49 : At a high-level, there exist two phases: Compress, and expand.
50 : The data dependencies of the compression phase form a hash tree,
51 : ending in a 896-bit root input. In the expand phase, the compression
52 : function is repeatedly applied on the root input with increasing
53 : counter values (each call producing 512-bit of final output data).
54 :
55 : The compress phase is further divided into the chunk phase and the
56 : tree phase. In the chunk phase, each 8192-bit input is hashed to
57 : a 256-bit output via serial calls to the compression function.
58 : (Note that each chunk can be computed independently)
59 :
60 : In the tree phase, the chunks are joined pairwise into a hash tree.
61 :
62 : Figure 1 illustrates a BLAKE3 hash tree with a 2170 byte input
63 : (34 chunks in{X}), one branch nodes (b{Y}), the root state (RS), and
64 : a 192 byte hash output (h{Z}). */
65 :
66 : /*** Figure 1: BLAKE3 Hash Tree ******************************
67 : * *
68 : * ┌────┐ ┌────┐ ┌────┐ ─┐ *
69 : * │ h0 │ │ h1 │ │ h2 │ │ *
70 : * └──▲─┘ └─▲──┘ └─▲──┘ ├─ Expand *
71 : * │ │ │ │ *
72 : * └───────┐│┌────────┘ ─┘ *
73 : * │││ *
74 : * ┌┴┴┴─┐ ─┐ *
75 : * ┌──────►│ RS ├───────┐ │ *
76 : * │ └────┘ │ │ *
77 : * │ │ ├─ Compress Tree *
78 : * ┌─┴──┐ │ │ *
79 : * ┌──►│ b0 │◄──┐ │ │ *
80 : * │ └────┘ │ │ ─┘ *
81 : * │ │ │ *
82 : * ┌──┴───┐ ┌──┴───┐ ┌──┴───┐ ─┐ *
83 : * │ in15 │ │ in31 │ │ in33 │ │ *
84 : * └──▲───┘ └──▲───┘ └──▲───┘ │ *
85 : * │ │ │ │ *
86 : * ... ... ┌──┴───┐ │ *
87 : * ▲ ▲ │ in32 │ │ *
88 : * │ │ └──────┘ ├─ Compress Chunk *
89 : * ┌──┴───┐ ┌──┴───┐ │ *
90 : * │ in1 │ │ in17 │ │ *
91 : * └──▲───┘ └──▲───┘ │ *
92 : * │ │ │ *
93 : * ┌──┴───┐ ┌──┴───┐ │ *
94 : * │ in0 │ │ in16 │ │ *
95 : * └──────┘ └──────┘ ─┘ *
96 : * *
97 : **************************************************************/
98 :
99 : /* ### Implementation
100 :
101 : fd_blake3 consists of three major parts:
102 :
103 : (1) Hash state machines, which track the progress of hash
104 : calculations and prepare operations to advance them;
105 : (2) Schedulers, which accumulate batches of operations from state
106 : machines, then send them to hash backends;
107 : (3) Hash backends (SSE, AVX2, AVX512) which work off a static size
108 : vector of independent hash operations.
109 :
110 : The goal is to maximize throughput. The fastest backend usually is
111 : the widest, creating a scheduling problem. The scheduler should be
112 : able to flexibly schedule operations in parallel without taking up
113 : valuable time that could be used for hashing.
114 :
115 : The simplest opportunity to parallelize is during chunk compression.
116 : The bulk of the work is done in the chunk phase, independently for
117 : each 1024 bytes of input data. This is effective for inputs of size
118 : (width*FD_CHUNK_SZ), i.e. >=8192 bytes of input for AVX2.
119 :
120 : To accelerate processing of smaller inputs, a batch API is offered.
121 : Batching allows the scheduler to process operations over multiple
122 : independent messages at once. This has a significantly higher
123 : scheduling overhead though.
124 :
125 : It is worth noting that compression operations require a variable
126 : amount of compression function calls. (Recall that each call
127 : processes 64 bytes of input data, but a chunk can have up to 1024
128 : bytes of data) fd_blake3 therefore has an internal clock that ticks
129 : each time a hash backend processes a vector of blocks. When a state
130 : machine schedules an op with a 1024 byte input, it knows that the op
131 : completes 16 ticks into the future. */
132 :
133 :
134 : /* Protocol constants *************************************************/
135 :
136 : /* FD_BLAKE3_BLOCK_SZ is the byte size of the inputs to the internal
137 : compression function. This is a protocol constant. */
138 :
139 : #define FD_BLAKE3_BLOCK_LG_SZ (6)
140 4287908907 : #define FD_BLAKE3_BLOCK_SZ (64UL)
141 :
142 : /* FD_BLAKE3_OUTCHAIN_SZ is the byte size of an "output chaining
143 : value". This is a protocol constant. */
144 :
145 124101779 : #define FD_BLAKE3_OUTCHAIN_LG_SZ (5)
146 28626365 : #define FD_BLAKE3_OUTCHAIN_SZ (32UL)
147 :
148 : /* FD_BLAKE3_CHUNK_SZ is the max number of input bytes of a leaf node.
149 : This is a protocol constant.
150 : (1<<FD_BLAKE3_CHUNK_LG_SZ)==FD_BLAKE3_CHUNK_SZ */
151 :
152 1168972154 : #define FD_BLAKE3_CHUNK_LG_SZ (10)
153 211873033 : #define FD_BLAKE3_CHUNK_SZ (1024UL)
154 :
155 : /* FD_BLAKE3_KEY_SZ is the byte size of the optional key in expanded
156 : form. This is a protocol constant. */
157 :
158 : #define FD_BLAKE3_KEY_SZ (32UL)
159 :
160 : /* Implementation constants *******************************************/
161 :
162 : /* FD_BLAKE3_ROW_CNT is the max supported tree height of fd_blake3. */
163 :
164 : #define FD_BLAKE3_ROW_CNT (32UL)
165 :
166 : /* FD_BLAKE3_INPUT_MAX_SZ is the max supported message size of
167 : fd_blake3, derived by FD_BLAKE3_ROW_CNT. (About 4.40 terabytes) */
168 :
169 : #define FD_BLAKE3_INPUT_MAX_SZ ((1UL<<FD_BLAKE3_ROW_CNT)<<FD_BLAKE3_CHUNK_LG_SZ)
170 :
171 : /* FD_BLAKE3_COL_CNT is the max number of adjacent tree nodes to be
172 : buffered per hash state. Used for parallel processing.
173 : (1<<FD_BLAKE3_COL_LG_CNT) == FD_BLAKE3_COL_CNT */
174 :
175 : #if FD_HAS_AVX512
176 : #define FD_BLAKE3_COL_LG_CNT ( 5UL)
177 81834475 : #define FD_BLAKE3_COL_CNT (32UL)
178 : #else
179 : #define FD_BLAKE3_COL_LG_CNT ( 4UL)
180 157682632 : #define FD_BLAKE3_COL_CNT (16UL)
181 : #endif
182 :
183 : /* FD_BLAKE3_{ALIGN,FOOTPRINT} describe the alignment and footprint needed
184 : for a memory region to hold a fd_blake3_t. ALIGN is a positive
185 : integer power of 2. FOOTPRINT is a multiple of align. ALIGN is
186 : recommended to be at least double cache line to mitigate various
187 : kinds of false sharing. These are provided to facilitate compile
188 : time declarations. */
189 :
190 75 : #define FD_BLAKE3_ALIGN (128UL)
191 :
192 : /* A fd_blake3_t should be treated as an opaque handle of a blake3
193 : calculation state. (It technically isn't here facilitate compile
194 : time declarations of fd_blake3_t memory.) */
195 :
196 88117280 : #define FD_BLAKE3_MAGIC (0xF17EDA2CEB1A4E30) /* FIREDANCE BLAKE3 V0 */
197 :
198 : /* Hash state machine *************************************************/
199 :
200 : /* fd_blake3_pos_t is a hash state machine. The user should consider
201 : this struct implementation-defined. It prepares inputs to all
202 : compression function calls. It also tracks dependencies between
203 : those calls. For every fd_blake3_pos_t, there is a fd_blake3_buf_t.
204 : Depending on input size, it may be able to prepare multiple ops that
205 : can be worked on in parallel. */
206 :
207 : struct __attribute__((aligned(FD_BLAKE3_ALIGN))) fd_blake3_pos {
208 :
209 : /* The tail and head arrays track the hash progress of each tree
210 : layer. head.uc[n] is the number of nodes buffered for that layer.
211 : tail.uc[n] is the number of nodes already hashed into the next
212 : layer. The 32-byte "output chaining value" for that node is stored
213 : in fd_blake3_buf_t. */
214 :
215 : /* This point is 128-byte aligned */
216 :
217 : # if FD_HAS_AVX
218 : union { uchar uc[ 32 ]; wb_t wb; } tail;
219 : union { uchar uc[ 32 ]; wb_t wb; } head;
220 : # else
221 : union { uchar uc[ 32 ]; } tail;
222 : union { uchar uc[ 32 ]; } head;
223 : # endif
224 :
225 : /* leaf_idx is the number of leaf chunks processed so far. All but
226 : the last leaf chunk are of size FD_CHUNK_SZs. live_cnt is the
227 : number of nodes for which an output chaining value is buffered and
228 : awaiting further processing. next_tick keeps track of relative
229 : time to inform scheduling when a batch of operations will complete.
230 : layer is the tree layer that the scheduler will work on next. */
231 :
232 : /* This point is 64-byte aligned */
233 :
234 : ulong leaf_idx;
235 : ulong live_cnt;
236 : ulong next_tick;
237 : uint layer;
238 : uchar _pad[4];
239 :
240 : /* [input,input+input_sz) is the user-provided memory region
241 : containing the hash input. May be unaligned. */
242 :
243 : /* This point is 32-byte aligned */
244 :
245 : uchar const * input;
246 : ulong input_sz;
247 :
248 : /* magic==FD_BLAKE3_MAGIC (useful for debugging and detecting memory
249 : corruption) */
250 :
251 : ulong magic;
252 :
253 : };
254 :
255 : typedef struct fd_blake3_pos fd_blake3_pos_t;
256 :
257 : /* fd_blake3_buf_t contains intermediate results of hash tree
258 : construction. Internally, it is a table of output chaining values.
259 : Each row contains a contiguous window of output chaining values for
260 : the nodes at a specific tree layer. Row 0 is the leaf layer. */
261 :
262 : union __attribute__((aligned(FD_BLAKE3_ALIGN))) fd_blake3_buf {
263 :
264 : uchar slots[ FD_BLAKE3_ROW_CNT ][ FD_BLAKE3_COL_CNT ][ FD_BLAKE3_OUTCHAIN_SZ ];
265 : uchar rows [ FD_BLAKE3_ROW_CNT ][ FD_BLAKE3_COL_CNT * FD_BLAKE3_OUTCHAIN_SZ ];
266 :
267 : };
268 :
269 : typedef union fd_blake3_buf fd_blake3_buf_t;
270 :
271 : /* Simple API *********************************************************/
272 :
273 : #if FD_HAS_AVX512
274 178496379 : #define FD_BLAKE3_PARA_LG_MAX (4UL)
275 : #elif FD_HAS_AVX
276 374394732 : #define FD_BLAKE3_PARA_LG_MAX (3UL)
277 : #else
278 : #define FD_BLAKE3_PARA_LG_MAX (0UL)
279 : #endif
280 372608213 : #define FD_BLAKE3_PARA_MAX (1<<FD_BLAKE3_PARA_LG_MAX)
281 :
282 180282898 : #define FD_BLAKE3_PRIVATE_LG_BUF_MAX (FD_BLAKE3_PARA_LG_MAX+FD_BLAKE3_CHUNK_LG_SZ)
283 60227289 : #define FD_BLAKE3_PRIVATE_BUF_MAX (1UL<<FD_BLAKE3_PRIVATE_LG_BUF_MAX)
284 :
285 : struct fd_blake3 {
286 : fd_blake3_buf_t buf;
287 : uchar block[ FD_BLAKE3_PRIVATE_BUF_MAX ];
288 : fd_blake3_pos_t pos;
289 : ulong block_sz;
290 : };
291 :
292 : typedef struct fd_blake3 fd_blake3_t;
293 :
294 24 : #define FD_BLAKE3_FOOTPRINT (sizeof(fd_blake3_t))
295 :
296 : FD_PROTOTYPES_BEGIN
297 :
298 : /* fd_blake3_{align,footprint,new,join,leave,delete} usage is identical to
299 : that of their fd_sha512 counterparts. See ../sha512/fd_sha512.h */
300 :
301 : FD_FN_CONST ulong
302 : fd_blake3_align( void );
303 :
304 : FD_FN_CONST ulong
305 : fd_blake3_footprint( void );
306 :
307 : void *
308 : fd_blake3_new( void * shmem );
309 :
310 : fd_blake3_t *
311 : fd_blake3_join( void * shsha );
312 :
313 : void *
314 : fd_blake3_leave( fd_blake3_t * sha );
315 :
316 : void *
317 : fd_blake3_delete( void * shsha );
318 :
319 : /* fd_blake3_init starts a blake3 calculation. sha is assumed to be a
320 : current local join to a blake3 calculation state with no other
321 : concurrent operation that would modify the state while this is
322 : executing. Any preexisting state for an in-progress or recently
323 : completed calculation will be discarded. Returns sha (on return, sha
324 : will have the state of a new in-progress calculation). */
325 :
326 : fd_blake3_t *
327 : fd_blake3_init( fd_blake3_t * sha );
328 :
329 : /* fd_blake3_append adds sz bytes locally pointed to by data an
330 : in-progress blake3 calculation. sha, data and sz are assumed to be
331 : valid (i.e. sha is a current local join to a blake3 calculation state
332 : with no other concurrent operations that would modify the state while
333 : this is executing, data points to the first of the sz bytes and will
334 : be unmodified while this is running with no interest retained after
335 : return ... data==NULL is fine if sz==0). Returns sha (on return, sha
336 : will have the updated state of the in-progress calculation).
337 :
338 : It does not matter how the user group data bytes for a blake3
339 : calculation; the final hash will be identical. It is preferable for
340 : performance to try to append as many bytes as possible as a time
341 : though. It is also preferable for performance if sz is a multiple of
342 : 64 for all but the last append (it is also preferable if sz is less
343 : than 56 for the last append). */
344 :
345 : fd_blake3_t *
346 : fd_blake3_append( fd_blake3_t * sha,
347 : void const * data,
348 : ulong sz );
349 :
350 : /* fd_blake3_fini finishes a a blake3 calculation. sha and hash are
351 : assumed to be valid (i.e. sha is a local join to a blake3 calculation
352 : state that has an in-progress calculation with no other concurrent
353 : operations that would modify the state while this is executing and
354 : hash points to the first byte of a 32-byte memory region where the
355 : result of the calculation should be stored). Returns hash (on
356 : return, there will be no calculation in-progress on sha and 32-byte
357 : buffer pointed to by hash will be populated with the calculation
358 : result). */
359 :
360 : void *
361 : fd_blake3_fini( fd_blake3_t * sha,
362 : void * hash );
363 :
364 : void *
365 : fd_blake3_fini_2048( fd_blake3_t * sha,
366 : void * hash );
367 :
368 : void *
369 : fd_blake3_hash( void const * data,
370 : ulong sz,
371 : void * hash );
372 :
373 : /* fd_blake3_lthash_batch{n} calculate a batch of n independent BLAKE3
374 : hash operations with 2048 byte XOF output, and up to 1024 byte input.
375 : The outputs are reduced down to a single value using 'LtHash'
376 : group-add arithmetic.
377 :
378 : batch_data[i] give a pointer to the input message. batch_data is
379 : assumed to be 64-byte aligned. batch_data[i] does not have to be
380 : aligned. batch_sz[i] give the input size (in [0,1024]). batch_sz is
381 : assumed to be 64-byte aligned. On return, 2048 bytes of output are
382 : written to out_lthash. out_lthash is assumed to be 64-byte aligned.
383 :
384 : Execution time is bound by the largest batch_sz[i] input. */
385 :
386 : #if FD_HAS_AVX
387 :
388 : void
389 : fd_blake3_lthash_batch8(
390 : void const * batch_data[8], /* align=32 ele_align=1 */
391 : uint const batch_sz [8], /* align=32 */
392 : void * out_lthash /* align=32 */
393 : );
394 :
395 : #endif
396 :
397 : #if FD_HAS_AVX512
398 :
399 : void
400 : fd_blake3_lthash_batch16(
401 : void const * batch_data[16], /* align=64 ele_align=1 */
402 : uint const batch_sz [16], /* align=64 */
403 : void * out_lthash /* align=64 */
404 : );
405 :
406 : #endif
407 :
408 : FD_PROTOTYPES_END
409 :
410 : #endif /* HEADER_fd_src_ballet_blake3_fd_blake3_h */
|