Line data Source code
1 : #ifndef BLAKE3_IMPL_H
2 : #define BLAKE3_IMPL_H
3 :
4 : #include <assert.h>
5 : #include <stdbool.h>
6 : #include <stddef.h>
7 : #include <stdint.h>
8 : #include <string.h>
9 :
10 : #include "blake3.h"
11 :
12 : #ifdef __cplusplus
13 : extern "C" {
14 : #endif
15 :
// Internal flag bits. Every enumerator is a distinct single bit, so the
// values can be combined with bitwise OR.
enum blake3_flags {
  CHUNK_START         = 0x01,
  CHUNK_END           = 0x02,
  PARENT              = 0x04,
  ROOT                = 0x08,
  KEYED_HASH          = 0x10,
  DERIVE_KEY_CONTEXT  = 0x20,
  DERIVE_KEY_MATERIAL = 0x40,
};
26 :
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC. INLINE marks header-local helpers that should be inlined at every
// call site.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define INLINE static inline __attribute__((always_inline))
#else
// Any other compiler: `__attribute__` may not exist there, so fall back to
// plain C99 `static inline` instead of failing to compile.
#define INLINE static inline
#endif
34 :
// NOEXCEPT expands to `noexcept` when compiled as C++ and to nothing in C.
#if defined(__cplusplus)
#define NOEXCEPT noexcept
#else
#define NOEXCEPT
#endif
40 :
// Target architecture detection.

// 64-bit x86. ARM64EC builds are deliberately excluded here; they are
// classified as AArch64 further down.
#if !defined(_M_ARM64EC) && (defined(_M_X64) || defined(__x86_64__))
#define IS_X86
#define IS_X86_64
#endif

// 32-bit x86.
#if defined(_M_IX86) || defined(__i386__)
#define IS_X86
#define IS_X86_32
#endif

// 64-bit ARM, including ARM64EC.
#if defined(_M_ARM64EC) || defined(_M_ARM64) || defined(__aarch64__)
#define IS_AARCH64
#endif

// MSVC exposes its x86 intrinsics through <intrin.h>.
#if defined(IS_X86) && defined(_MSC_VER)
#include <intrin.h>
#endif
60 :
// Unless the build already chose a value for BLAKE3_USE_NEON, autodetect it:
// 1 on AArch64 targets that are not big-endian, 0 everywhere else.
#if !defined(BLAKE3_USE_NEON)
#if defined(IS_AARCH64) && !defined(__ARM_BIG_ENDIAN)
#define BLAKE3_USE_NEON 1
#else
#define BLAKE3_USE_NEON 0
#endif
#endif
73 :
// Upper bound on the SIMD degree for this target: 16 on x86, 4 when NEON is
// enabled, 1 otherwise.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#else
#if BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif
#endif

// Some static sizes need to equal MAX_SIMD_DEGREE while never dropping
// below 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE < 2 ? 2 : MAX_SIMD_DEGREE)
85 :
// Eight constant 32-bit words (the IV).
static const uint32_t IV[8] = {
    UINT32_C(0x6A09E667), UINT32_C(0xBB67AE85),
    UINT32_C(0x3C6EF372), UINT32_C(0xA54FF53A),
    UINT32_C(0x510E527F), UINT32_C(0x9B05688C),
    UINT32_C(0x1F83D9AB), UINT32_C(0x5BE0CD19),
};
89 :
// Seven rows of sixteen indices. Every row is a permutation of 0..15; the
// first row is the identity.
static const uint8_t MSG_SCHEDULE[7][16] = {
    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    { 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8},
    { 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1},
    {10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6},
    {12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4},
    { 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7},
    {11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13},
};
99 :
/* Return the zero-based index of the most significant set bit of x. */
/* The caller must pass a nonzero x. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* For nonzero x the leading-zero count is in [0, 63], so subtracting it
   * from 63 gives the index of the top set bit. */
  const unsigned int leading_zeros = (unsigned int)__builtin_clzll(x);
  return 63u - leading_zeros;
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long bit;
  _BitScanReverse64(&bit, x);
  return bit;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* Scan the upper 32 bits first and only fall back to the lower half when
   * the upper half is empty. */
  const unsigned long upper = (unsigned long)(x >> 32);
  unsigned long bit;
  if (upper != 0) {
    _BitScanReverse(&bit, upper);
    return 32 + bit;
  }
  _BitScanReverse(&bit, (unsigned long)x);
  return bit;
#else
  /* Portable binary search: halve the window each step, moving into the
   * upper half whenever it contains a set bit. */
  unsigned int bit = 0;
  for (unsigned int width = 32; width != 0; width >>= 1) {
    if ((x >> width) != 0) {
      x >>= width;
      bit += width;
    }
  }
  return bit;
#endif
}
130 :
131 : // Count the number of 1 bits.
132 26218176 : INLINE unsigned int popcnt(uint64_t x) {
133 26218176 : #if defined(__GNUC__) || defined(__clang__)
134 26218176 : return (unsigned int)__builtin_popcountll(x);
135 : #else
136 : unsigned int count = 0;
137 : while (x != 0) {
138 : count += 1;
139 : x &= x - 1;
140 : }
141 : return count;
142 : #endif
143 26218176 : }
144 :
145 : // Largest power of two less than or equal to x. As a special case, returns 1
146 : // when x is 0.
147 1523099 : INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
148 1523099 : return 1ULL << highest_one(x | 1);
149 1523099 : }
150 :
151 77952680 : INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
152 :
153 77952680 : INLINE uint32_t counter_high(uint64_t counter) {
154 77952680 : return (uint32_t)(counter >> 32);
155 77952680 : }
156 :
157 9760 : INLINE uint32_t load32(const void *src) {
158 9760 : const uint8_t *p = (const uint8_t *)src;
159 9760 : return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
160 9760 : ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
161 9760 : }
162 :
163 : INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
164 0 : uint32_t key_words[8]) {
165 0 : key_words[0] = load32(&key[0 * 4]);
166 0 : key_words[1] = load32(&key[1 * 4]);
167 0 : key_words[2] = load32(&key[2 * 4]);
168 0 : key_words[3] = load32(&key[3 * 4]);
169 0 : key_words[4] = load32(&key[4 * 4]);
170 0 : key_words[5] = load32(&key[5 * 4]);
171 0 : key_words[6] = load32(&key[6 * 4]);
172 0 : key_words[7] = load32(&key[7 * 4]);
173 0 : }
174 :
175 : INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN],
176 610 : uint32_t block_words[16]) {
177 10370 : for (size_t i = 0; i < 16; i++) {
178 9760 : block_words[i] = load32(&block[i * 4]);
179 9760 : }
180 610 : }
181 :
182 59424 : INLINE void store32(void *dst, uint32_t w) {
183 59424 : uint8_t *p = (uint8_t *)dst;
184 59424 : p[0] = (uint8_t)(w >> 0);
185 59424 : p[1] = (uint8_t)(w >> 8);
186 59424 : p[2] = (uint8_t)(w >> 16);
187 59424 : p[3] = (uint8_t)(w >> 24);
188 59424 : }
189 :
190 7428 : INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
191 7428 : store32(&bytes_out[0 * 4], cv_words[0]);
192 7428 : store32(&bytes_out[1 * 4], cv_words[1]);
193 7428 : store32(&bytes_out[2 * 4], cv_words[2]);
194 7428 : store32(&bytes_out[3 * 4], cv_words[3]);
195 7428 : store32(&bytes_out[4 * 4], cv_words[4]);
196 7428 : store32(&bytes_out[5 * 4], cv_words[5]);
197 7428 : store32(&bytes_out[6 * 4], cv_words[6]);
198 7428 : store32(&bytes_out[7 * 4], cv_words[7]);
199 7428 : }
200 :
201 : void blake3_compress_in_place(uint32_t cv[8],
202 : const uint8_t block[BLAKE3_BLOCK_LEN],
203 : uint8_t block_len, uint64_t counter,
204 : uint8_t flags);
205 :
206 : void blake3_compress_xof(const uint32_t cv[8],
207 : const uint8_t block[BLAKE3_BLOCK_LEN],
208 : uint8_t block_len, uint64_t counter, uint8_t flags,
209 : uint8_t out[64]);
210 :
211 : void blake3_xof_many(const uint32_t cv[8],
212 : const uint8_t block[BLAKE3_BLOCK_LEN],
213 : uint8_t block_len, uint64_t counter, uint8_t flags,
214 : uint8_t out[64], size_t outblocks);
215 :
216 : void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
217 : size_t blocks, const uint32_t key[8], uint64_t counter,
218 : bool increment_counter, uint8_t flags,
219 : uint8_t flags_start, uint8_t flags_end, uint8_t *out);
220 :
221 : size_t blake3_simd_degree(void);
222 :
223 : // Declarations for implementation-specific functions.
224 : void blake3_compress_in_place_portable(uint32_t cv[8],
225 : const uint8_t block[BLAKE3_BLOCK_LEN],
226 : uint8_t block_len, uint64_t counter,
227 : uint8_t flags);
228 :
229 : void blake3_compress_xof_portable(const uint32_t cv[8],
230 : const uint8_t block[BLAKE3_BLOCK_LEN],
231 : uint8_t block_len, uint64_t counter,
232 : uint8_t flags, uint8_t out[64]);
233 :
234 : void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
235 : size_t blocks, const uint32_t key[8],
236 : uint64_t counter, bool increment_counter,
237 : uint8_t flags, uint8_t flags_start,
238 : uint8_t flags_end, uint8_t *out);
239 :
// x86-only variants. Each instruction-set group can be compiled out by
// defining the matching BLAKE3_NO_* macro.
#ifdef IS_X86

#ifndef BLAKE3_NO_SSE2
void blake3_compress_in_place_sse2(
    uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags);
void blake3_compress_xof_sse2(
    const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse2(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif

#ifndef BLAKE3_NO_SSE41
void blake3_compress_in_place_sse41(
    uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags);
void blake3_compress_xof_sse41(
    const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse41(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif

// Only a hash_many variant is declared for AVX2.
#ifndef BLAKE3_NO_AVX2
void blake3_hash_many_avx2(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif

#ifndef BLAKE3_NO_AVX512
void blake3_compress_in_place_avx512(
    uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags);

void blake3_compress_xof_avx512(
    const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);

void blake3_hash_many_avx512(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);

// The multi-block XOF variant is not declared when _WIN32 is defined.
#ifndef _WIN32
void blake3_xof_many_avx512(
    const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t *out,
    size_t outblocks);
#endif
#endif

#endif
303 :
// NEON variant, declared only when BLAKE3_USE_NEON is 1.
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif
311 :
312 : #ifdef __cplusplus
313 : }
314 : #endif
315 :
316 : #endif /* BLAKE3_IMPL_H */
|