Line data Source code
1 : #ifndef HEADER_fd_src_ballet_blake3_fd_blake3_private_h
2 : #define HEADER_fd_src_ballet_blake3_fd_blake3_private_h
3 :
4 : #include "fd_blake3.h"
5 :
6 : /* Set FD_BLAKE3_TRACING to 1 to dump out a high-level trace of BLAKE3
7 : operations to the debug log. This is useful during debugging or
8 : development. */
9 : #define FD_BLAKE3_TRACING 0 /* 0: FD_BLAKE3_TRACE expands to a no-op; nonzero: it forwards to FD_LOG_DEBUG */
10 :
11 : #if FD_BLAKE3_TRACING
12 : #define FD_BLAKE3_TRACE( ... ) FD_LOG_DEBUG( __VA_ARGS__ )
13 : #else
14 840982671 : #define FD_BLAKE3_TRACE( ... ) (void)0 /* macro arguments are dropped, so they are never evaluated */
15 : #endif /* FD_BLAKE3_TRACING */
16 :
17 : /* Protocol constants *************************************************/
18 :
19 : static const uchar FD_BLAKE3_MSG_SCHEDULE[7][16] = { /* 7 rows, each a permutation of the 16 message word indices 0..15 */
20 : { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, /* row 0: identity order */
21 : { 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8 }, /* row 1: the base permutation */
22 : { 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1 }, /* rows 2..6: row[r+1][i] == row[r][ row[1][i] ] */
23 : { 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6 },
24 : { 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4 },
25 : { 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7 },
26 : { 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13 },
27 : };
28 :
29 : static const uint FD_BLAKE3_IV[8] = { /* BLAKE3 initialization vector: 8 words */
30 : 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
31 : 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
32 : }; /* NOTE(review): literals carry a UL suffix although the element type is uint; every value is 8 hex digits */
33 :
34 204854549 : #define FD_BLAKE3_FLAG_CHUNK_START (1u<<0) /* 0x1: set on the first block of a chunk */
35 626610327 : #define FD_BLAKE3_FLAG_CHUNK_END (1u<<1) /* 0x2: set on the last block of a chunk */
36 204521773 : #define FD_BLAKE3_FLAG_PARENT (1u<<2) /* 0x4: set on parent nodes */
37 496116480 : #define FD_BLAKE3_FLAG_ROOT (1u<<3) /* 0x8: set on the root node (combined with CHUNK_END or PARENT) */
38 :
39 : /* Possible flag combinations:
40 : 0x1: first block of a chunk with at least 2 blocks
41 : 0x2: last block of a chunk, tree that has at least 1 parent
42 : 0x3: last chunk (<=64 bytes), input >1024 bytes
43 : 0x4: non-root parent node
44 : 0xa: last block of the only chunk, input_sz>64 input_sz<=1024
45 : 0xb: only block, input_sz<=64
46 : 0xc: root parent node */
47 :
48 : /* Scheduler **********************************************************/
49 :
50 : union __attribute__((aligned(32))) fd_blake3_op { /* 32-byte aligned; the only member is the anonymous struct below */
51 :
52 : struct {
53 : uchar const * msg; /* read-only input pointer; presumably the input data of the operation - confirm */
54 : uchar * out; /* writable output pointer; presumably where the result is stored - confirm */
55 :
56 : ulong counter; /* presumably the 'counter' value passed on to the compress functions - confirm */
57 : union {
58 : struct {
59 : ushort off;
60 : ushort sz;
61 : };
62 : uint off_sz; /* shares storage with the off/sz pair, so both can be accessed as one uint */
63 : };
64 : uchar flags; /* presumably an OR of FD_BLAKE3_FLAG_* bits - confirm */
65 : };
66 :
67 : };
68 :
69 : typedef union fd_blake3_op fd_blake3_op_t;
70 :
71 : /* Compression function ***********************************************/
72 :
73 : FD_PROTOTYPES_BEGIN
74 :
75 : void
76 : fd_blake3_fini_xof_compress( fd_blake3_t * sha, /* fd_blake3_t object (the parameter is named sha, but the type is the BLAKE3 one) */
77 : uchar * root_msg, /* non-const, so presumably written by the callee; size not visible here - confirm */
78 : uchar * root_cv_pre ); /* non-const, so presumably written by the callee; size not visible here - confirm */
79 :
80 : void
81 : fd_blake3_ref_compress1( uchar * restrict out, /* align==1 len==32 */
82 : uchar const * restrict msg, /* align==1 len==64 */
83 : uint msg_sz, /* presumably the number of valid bytes at msg - confirm */
84 : ulong counter, /* presumably the BLAKE3 'counter' input of the node - confirm */
85 : uint flags, /* presumably an OR of FD_BLAKE3_FLAG_* bits - confirm */
86 : uchar * restrict out_chain, /* optional, 16 byte output chaining value of last block; NOTE(review): BLAKE3 chaining values are 8 x 32-bit words - confirm the stated size */
87 : uchar const * restrict in_chain ); /* optional, 16 byte input chaining value of first block (default IV) */
88 :
89 : #if FD_HAS_SSE
90 :
91 : void
92 : fd_blake3_sse_compress1( uchar * restrict out, /* align==1 len==32 */
93 : uchar const * restrict msg, /* align==1 len==64 */
94 : uint msg_sz,
95 : ulong counter,
96 : uint flags,
97 : uchar * restrict out_chain, /* parameter list is identical to fd_blake3_ref_compress1 */
98 : uchar const * restrict in_chain );
99 :
100 : #endif /* FD_HAS_SSE */
101 :
102 : #if FD_HAS_AVX
103 :
104 : /* BLAKE3 AVX cores
105 :
106 : compress8 compresses one to eight tree nodes. batch_cnt is the
107 : number of nodes to process. For each node in the batch with index i,
108 : - _batch_data[i] points to the input data of the node (message bytes
109 : for leaf nodes, a pair of output chaining values for branch nodes)
110 : - batch_sz[i] is the input byte count of the node, from which the
111 : 'len' value of each of the node's blocks is derived
112 : - ctr_vec[i] is the 'counter' value of the node
113 : - batch_flags[i] is the 'flag' value of the node
114 : - batch_cv is optional. If set, batch_cv[i] is the 'chaining value'
115 : of the first block of the node. This is useful for XOF.
116 :
117 : compress8 has the following output modes:
118 : - "LtHash in-place": If lthash is set, each node's output is expanded
119 : (XOF) to 2048 bytes and interpreted as an 'LtHash' value (i.e.
120 : a vector of 1024 uint16). These vectors are then added together
121 : and the result is written to lthash. The root flag MUST be set for
122 : all batch_flags inputs, otherwise this function will read OOB.
123 : - "Simple": Otherwise, _batch_hash[i] is populated with the 32-byte
124 : output chaining value. (If node i is a root node, this is 'the
125 : BLAKE3 hash', i.e. the first 32 bytes of the XOF stream).
126 :
127 : These modes are all packed into the same function because the
128 : alternatives are worse (either worse code footprint due to duplicated
129 : core, or worse throughput due to high penalty passing vector regs
130 : between functions in SysV ABI).
131 :
132 : compress8_fast does a subset of what compress8 can, but is ~10-20%
133 : faster. */
134 :
135 : void
136 : fd_blake3_avx_compress8( ulong batch_cnt, /* number of nodes to process, one to eight */
137 : void const * restrict _batch_data, /* align==32 len in [1,8] */
138 : uint const * restrict batch_sz, /* len in [1,8] */
139 : ulong const * restrict ctr_vec, /* len==8 */
140 : uint const * restrict batch_flags, /* align==32 len==8 */
141 : void * const * restrict _batch_hash, /* align==32 len in [1,8] */
142 : ushort * restrict lthash, /* align==32 byte_sz=2048 */
143 : uint out_sz, /* 32 or 64 */
144 : void const * restrict batch_cv ); /* align==8 len==8 ele_align==32 optional */
145 :
146 : void
147 : fd_blake3_avx_compress8_fast( uchar const * restrict batch_data, /* align==32 len==8*64 */
148 : uchar * restrict batch_hash, /* align==32 len==8*32 */
149 : ulong counter, /* a single counter here, versus the per-node ctr_vec array of compress8 */
150 : uchar flags ); /* a single flag value here, versus the per-node batch_flags array of compress8 */
151 :
152 : #endif /* FD_HAS_AVX */
153 :
154 : #if FD_HAS_AVX512
155 :
156 : /* fd_blake3_avx512_compress16{,_fast} are analogous to the avx APIs
157 : above. The only difference is larger alignment assumptions and that
158 : these process up to sixteen elements. */
159 :
160 : void
161 : fd_blake3_avx512_compress16( ulong batch_cnt, /* number of nodes to process, up to sixteen */
162 : void const * restrict _batch_data, /* align=64 len=16 ele_align=1 */
163 : uint const * restrict batch_sz, /* align= 4 len=16 */
164 : ulong const * restrict ctr_vec, /* align= 8 len=16 */
165 : uint const * restrict batch_flags, /* align= 4 len=16 */
166 : void * const * restrict _batch_hash, /* align=64 len=16 */
167 : ushort * restrict lthash, /* align=32 byte_sz=2048 */
168 : uint out_sz, /* 32 or 64 */
169 : void const * restrict batch_cv ); /* align= 8 len=16 ele_align=16 optional */
170 :
171 : void
172 : fd_blake3_avx512_compress16_fast( uchar const * restrict batch_data, /* align==32 len==16*64; NOTE(review): compress16 documents align=64 for its buffers - confirm 32 is intended here */
173 : uchar * restrict batch_hash, /* align==32 len==16*32 */
174 : ulong counter, /* a single counter here, versus the per-node ctr_vec array of compress16 */
175 : uchar flags ); /* a single flag value here, versus the per-node batch_flags array of compress16 */
176 :
177 : #endif /* FD_HAS_AVX512 */
178 :
179 : FD_PROTOTYPES_END
180 :
181 : #endif /* HEADER_fd_src_ballet_blake3_fd_blake3_private_h */
|