Line data Source code
1 : // Source originally from https://github.com/BLAKE3-team/BLAKE3
2 : // From commit: 64747d48ffe9d1fbf4b71e94cabeb8a211461081
3 :
4 : #ifndef BLAKE3_IMPL_H
5 : #define BLAKE3_IMPL_H
6 :
7 : #include <assert.h>
8 : #include <stdbool.h>
9 : #include <stddef.h>
10 : #include <stdint.h>
11 : #include <string.h>
12 :
13 : #include "blake3.h"
14 :
// Internal flag bits. Each enumerator is a distinct power of two, so any
// combination can be OR-ed together; all seven fit in the low 7 bits of a
// uint8_t.
enum blake3_flags {
  CHUNK_START         = 0x01, // bit 0
  CHUNK_END           = 0x02, // bit 1
  PARENT              = 0x04, // bit 2
  ROOT                = 0x08, // bit 3
  KEYED_HASH          = 0x10, // bit 4
  DERIVE_KEY_CONTEXT  = 0x20, // bit 5
  DERIVE_KEY_MATERIAL = 0x40, // bit 6
};
25 :
// Marks a helper as translation-unit local and force-inlined (GCC/Clang
// attribute syntax).
#define INLINE static inline __attribute__((always_inline))

// NEON is switched off here, so every `BLAKE3_USE_NEON == 1` check in this
// header evaluates to false.
#define BLAKE3_USE_NEON 0

// Compile-time upper bound on the SIMD degree:
//   16 when FD_HAS_X86 is non-zero, 4 when NEON is enabled, 1 otherwise.
#if FD_HAS_X86
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// MAX_SIMD_DEGREE clamped from below to 2, for the places that need a static
// size equal to the SIMD degree but never smaller than two.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE < 2 ? 2 : MAX_SIMD_DEGREE)
41 :
// Eight 32-bit initialization constants, one per word.
static const uint32_t IV[8] = {
    0x6A09E667u,
    0xBB67AE85u,
    0x3C6EF372u,
    0xA54FF53Au,
    0x510E527Fu,
    0x9B05688Cu,
    0x1F83D9ABu,
    0x5BE0CD19u,
};
45 :
// Message word order, one row per round (7 rows of 16 indices).
// Row 0 is the identity order; every row is a permutation of 0..15, and each
// later row equals the previous row re-indexed through row 1, i.e.
// MSG_SCHEDULE[r + 1][i] == MSG_SCHEDULE[r][MSG_SCHEDULE[1][i]].
static const uint8_t MSG_SCHEDULE[7][16] = {
    /* 0 */ { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    /* 1 */ { 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8},
    /* 2 */ { 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1},
    /* 3 */ {10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6},
    /* 4 */ {12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4},
    /* 5 */ { 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7},
    /* 6 */ {11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13},
};
55 :
/* Returns the zero-based index of the most significant set bit of x. */
/* Precondition: x is nonzero (the GCC/Clang builtin used below has an
 * undefined result for 0). */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* For nonzero x the leading-zero count lies in [0, 63]; XOR with 63 then
   * maps it to the bit index. */
  const unsigned int leading_zeros = (unsigned int)__builtin_clzll(x);
  return 63 ^ leading_zeros;
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long bit;
  _BitScanReverse64(&bit, x);
  return bit;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* No 64-bit scan here: look at the upper half first, then the lower. */
  unsigned long bit;
  if (x >> 32) {
    _BitScanReverse(&bit, (unsigned long)(x >> 32));
    return 32 + bit;
  }
  _BitScanReverse(&bit, (unsigned long)x);
  return bit;
#else
  /* Portable binary search: try to drop 32, 16, 8, 4, 2 and 1 low bits in
   * turn, accumulating the total shift that still leaves a nonzero value. */
  unsigned int index = 0;
  for (unsigned int step = 32; step != 0; step >>= 1) {
    if ((x >> step) != 0) {
      x >>= step;
      index += step;
    }
  }
  return index;
#endif
}
86 :
87 : // Count the number of 1 bits.
88 15 : INLINE unsigned int popcnt(uint64_t x) {
89 15 : #if defined(__GNUC__) || defined(__clang__)
90 15 : return (unsigned int)__builtin_popcountll(x);
91 : #else
92 : unsigned int count = 0;
93 : while (x != 0) {
94 : count += 1;
95 : x &= x - 1;
96 : }
97 : return count;
98 : #endif
99 15 : }
100 :
101 : // Largest power of two less than or equal to x. As a special case, returns 1
102 : // when x is 0.
103 0 : INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
104 0 : return 1ULL << highest_one(x | 1);
105 0 : }
106 :
107 15 : INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
108 :
109 15 : INLINE uint32_t counter_high(uint64_t counter) {
110 15 : return (uint32_t)(counter >> 32);
111 15 : }
112 :
113 0 : INLINE uint32_t load32(const void *src) {
114 0 : const uint8_t *p = (const uint8_t *)src;
115 0 : return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
116 0 : ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
117 0 : }
118 :
119 : INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
120 0 : uint32_t key_words[8]) {
121 0 : key_words[0] = load32(&key[0 * 4]);
122 0 : key_words[1] = load32(&key[1 * 4]);
123 0 : key_words[2] = load32(&key[2 * 4]);
124 0 : key_words[3] = load32(&key[3 * 4]);
125 0 : key_words[4] = load32(&key[4 * 4]);
126 0 : key_words[5] = load32(&key[5 * 4]);
127 0 : key_words[6] = load32(&key[6 * 4]);
128 0 : key_words[7] = load32(&key[7 * 4]);
129 0 : }
130 :
131 0 : INLINE void store32(void *dst, uint32_t w) {
132 0 : uint8_t *p = (uint8_t *)dst;
133 0 : p[0] = (uint8_t)(w >> 0);
134 0 : p[1] = (uint8_t)(w >> 8);
135 0 : p[2] = (uint8_t)(w >> 16);
136 0 : p[3] = (uint8_t)(w >> 24);
137 0 : }
138 :
139 0 : INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
140 0 : store32(&bytes_out[0 * 4], cv_words[0]);
141 0 : store32(&bytes_out[1 * 4], cv_words[1]);
142 0 : store32(&bytes_out[2 * 4], cv_words[2]);
143 0 : store32(&bytes_out[3 * 4], cv_words[3]);
144 0 : store32(&bytes_out[4 * 4], cv_words[4]);
145 0 : store32(&bytes_out[5 * 4], cv_words[5]);
146 0 : store32(&bytes_out[6 * 4], cv_words[6]);
147 0 : store32(&bytes_out[7 * 4], cv_words[7]);
148 0 : }
149 :
150 : void fd_blake3_compress_in_place(uint32_t cv[8],
151 : const uint8_t block[BLAKE3_BLOCK_LEN],
152 : uint8_t block_len, uint64_t counter,
153 : uint8_t flags);
154 :
155 : void fd_blake3_compress_xof(const uint32_t cv[8],
156 : const uint8_t block[BLAKE3_BLOCK_LEN],
157 : uint8_t block_len, uint64_t counter, uint8_t flags,
158 : uint8_t out[64]);
159 :
160 : void fd_blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
161 : size_t blocks, const uint32_t key[8], uint64_t counter,
162 : bool increment_counter, uint8_t flags,
163 : uint8_t flags_start, uint8_t flags_end, uint8_t *out);
164 :
165 : size_t fd_blake3_simd_degree(void);
166 :
167 :
168 : // Declarations for implementation-specific functions.
169 : void fd_blake3_compress_in_place_portable(uint32_t cv[8],
170 : const uint8_t block[BLAKE3_BLOCK_LEN],
171 : uint8_t block_len, uint64_t counter,
172 : uint8_t flags);
173 :
174 : void fd_blake3_compress_xof_portable(const uint32_t cv[8],
175 : const uint8_t block[BLAKE3_BLOCK_LEN],
176 : uint8_t block_len, uint64_t counter,
177 : uint8_t flags, uint8_t out[64]);
178 :
179 : void fd_blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
180 : size_t blocks, const uint32_t key[8],
181 : uint64_t counter, bool increment_counter,
182 : uint8_t flags, uint8_t flags_start,
183 : uint8_t flags_end, uint8_t *out);
184 :
// x86 variants. Everything here is gated on FD_HAS_X86, and each group is
// additionally gated on its own feature macro.
#if FD_HAS_X86

// SSE2 variants: declared when FD_HAS_SSE is non-zero.
#if FD_HAS_SSE
void fd_blake3_compress_in_place_sse2(uint32_t cv[8],
                                      const uint8_t block[BLAKE3_BLOCK_LEN],
                                      uint8_t block_len,
                                      uint64_t counter,
                                      uint8_t flags);
void fd_blake3_compress_xof_sse2(const uint32_t cv[8],
                                 const uint8_t block[BLAKE3_BLOCK_LEN],
                                 uint8_t block_len,
                                 uint64_t counter,
                                 uint8_t flags,
                                 uint8_t out[64]);
void fd_blake3_hash_many_sse2(const uint8_t *const *inputs,
                              size_t num_inputs,
                              size_t blocks,
                              const uint32_t key[8],
                              uint64_t counter,
                              bool increment_counter,
                              uint8_t flags,
                              uint8_t flags_start,
                              uint8_t flags_end,
                              uint8_t *out);
#endif /* FD_HAS_SSE */

// SSE4.1 and AVX2 variants: both are declared under FD_HAS_AVX. Only a
// hash_many function is declared for AVX2 (no compress variants).
#if FD_HAS_AVX
void fd_blake3_compress_in_place_sse41(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len,
                                       uint64_t counter,
                                       uint8_t flags);
void fd_blake3_compress_xof_sse41(const uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len,
                                  uint64_t counter,
                                  uint8_t flags,
                                  uint8_t out[64]);
void fd_blake3_hash_many_sse41(const uint8_t *const *inputs,
                               size_t num_inputs,
                               size_t blocks,
                               const uint32_t key[8],
                               uint64_t counter,
                               bool increment_counter,
                               uint8_t flags,
                               uint8_t flags_start,
                               uint8_t flags_end,
                               uint8_t *out);
void fd_blake3_hash_many_avx2(const uint8_t *const *inputs,
                              size_t num_inputs,
                              size_t blocks,
                              const uint32_t key[8],
                              uint64_t counter,
                              bool increment_counter,
                              uint8_t flags,
                              uint8_t flags_start,
                              uint8_t flags_end,
                              uint8_t *out);
#endif /* FD_HAS_AVX */

// AVX-512 variants: declared when FD_HAS_AVX512 is non-zero.
#if FD_HAS_AVX512
void fd_blake3_compress_in_place_avx512(uint32_t cv[8],
                                        const uint8_t block[BLAKE3_BLOCK_LEN],
                                        uint8_t block_len,
                                        uint64_t counter,
                                        uint8_t flags);

void fd_blake3_compress_xof_avx512(const uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
                                   uint8_t block_len,
                                   uint64_t counter,
                                   uint8_t flags,
                                   uint8_t out[64]);

void fd_blake3_hash_many_avx512(const uint8_t *const *inputs,
                                size_t num_inputs,
                                size_t blocks,
                                const uint32_t key[8],
                                uint64_t counter,
                                bool increment_counter,
                                uint8_t flags,
                                uint8_t flags_start,
                                uint8_t flags_end,
                                uint8_t *out);
#endif /* FD_HAS_AVX512 */

#endif /* FD_HAS_X86 */
239 :
// NEON variant: only declared when BLAKE3_USE_NEON is 1. Only a hash_many
// function is declared for NEON (no compress variants).
#if BLAKE3_USE_NEON == 1
void fd_blake3_hash_many_neon(const uint8_t *const *inputs,
                              size_t num_inputs,
                              size_t blocks,
                              const uint32_t key[8],
                              uint64_t counter,
                              bool increment_counter,
                              uint8_t flags,
                              uint8_t flags_start,
                              uint8_t flags_end,
                              uint8_t *out);
#endif
247 :
248 :
249 : #endif /* BLAKE3_IMPL_H */
|