LCOV - code coverage report
Current view: top level - ballet/blake3 - blake3_impl.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 42 52 80.8 %
Date: 2025-08-21 04:41:08 Functions: 11 60 18.3 %

          Line data    Source code
       1             : #ifndef BLAKE3_IMPL_H
       2             : #define BLAKE3_IMPL_H
       3             : 
       4             : #include <assert.h>
       5             : #include <stdbool.h>
       6             : #include <stddef.h>
       7             : #include <stdint.h>
       8             : #include <string.h>
       9             : 
      10             : #include "blake3.h"
      11             : 
      12             : #ifdef __cplusplus
      13             : extern "C" {
      14             : #endif
      15             : 
      16             : // internal flags
      17             : enum blake3_flags {
      18             :   CHUNK_START         = 1 << 0,
      19             :   CHUNK_END           = 1 << 1,
      20             :   PARENT              = 1 << 2,
      21             :   ROOT                = 1 << 3,
      22             :   KEYED_HASH          = 1 << 4,
      23             :   DERIVE_KEY_CONTEXT  = 1 << 5,
      24             :   DERIVE_KEY_MATERIAL = 1 << 6,
      25             : };
      26             : 
      27             : // This C implementation tries to support recent versions of GCC, Clang, and
      28             : // MSVC.
      29             : #if defined(_MSC_VER)
      30             : #define INLINE static __forceinline
      31             : #else
      32             : #define INLINE static inline __attribute__((always_inline))
      33             : #endif
      34             : 
      35             : #ifdef __cplusplus
      36             : #define NOEXCEPT noexcept
      37             : #else
      38             : #define NOEXCEPT
      39             : #endif
      40             : 
      41             : #if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
      42             : #define IS_X86
      43             : #define IS_X86_64
      44             : #endif
      45             : 
      46             : #if defined(__i386__) || defined(_M_IX86)
      47             : #define IS_X86
      48             : #define IS_X86_32
      49             : #endif
      50             : 
      51             : #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
      52             : #define IS_AARCH64
      53             : #endif
      54             : 
      55             : #if defined(IS_X86)
      56             : #if defined(_MSC_VER)
      57             : #include <intrin.h>
      58             : #endif
      59             : #endif
      60             : 
      61             : #if !defined(BLAKE3_USE_NEON)
      62             :   // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
      63             :   #if defined(IS_AARCH64)
      64             :     #if defined(__ARM_BIG_ENDIAN)
      65             :       #define BLAKE3_USE_NEON 0
      66             :     #else
      67             :       #define BLAKE3_USE_NEON 1
      68             :     #endif
      69             :   #else
      70             :     #define BLAKE3_USE_NEON 0
      71             :   #endif
      72             : #endif
      73             : 
      74             : #if defined(IS_X86)
      75             : #define MAX_SIMD_DEGREE 16
      76             : #elif BLAKE3_USE_NEON == 1
      77             : #define MAX_SIMD_DEGREE 4
      78             : #else
      79             : #define MAX_SIMD_DEGREE 1
      80             : #endif
      81             : 
      82             : // There are some places where we want a static size that's equal to the
      83             : // MAX_SIMD_DEGREE, but also at least 2.
      84             : #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
      85             : 
      86             : static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
      87             :                                0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
      88             :                                0x1F83D9ABUL, 0x5BE0CD19UL};
      89             : 
      90             : static const uint8_t MSG_SCHEDULE[7][16] = {
      91             :     {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      92             :     {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
      93             :     {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
      94             :     {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
      95             :     {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
      96             :     {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
      97             :     {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
      98             : };
      99             : 
     100             : /* Find index of the highest set bit */
     101             : /* x is assumed to be nonzero.       */
     102     1523099 : static unsigned int highest_one(uint64_t x) {
     103     1523099 : #if defined(__GNUC__) || defined(__clang__)
     104     1523099 :   return 63 ^ (unsigned int)__builtin_clzll(x);
     105             : #elif defined(_MSC_VER) && defined(IS_X86_64)
     106             :   unsigned long index;
     107             :   _BitScanReverse64(&index, x);
     108             :   return index;
     109             : #elif defined(_MSC_VER) && defined(IS_X86_32)
     110             :   if(x >> 32) {
     111             :     unsigned long index;
     112             :     _BitScanReverse(&index, (unsigned long)(x >> 32));
     113             :     return 32 + index;
     114             :   } else {
     115             :     unsigned long index;
     116             :     _BitScanReverse(&index, (unsigned long)x);
     117             :     return index;
     118             :   }
     119             : #else
     120             :   unsigned int c = 0;
     121             :   if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
     122             :   if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
     123             :   if(x & 0x000000000000ff00ULL) { x >>=  8; c +=  8; }
     124             :   if(x & 0x00000000000000f0ULL) { x >>=  4; c +=  4; }
     125             :   if(x & 0x000000000000000cULL) { x >>=  2; c +=  2; }
     126             :   if(x & 0x0000000000000002ULL) {           c +=  1; }
     127             :   return c;
     128             : #endif
     129     1523099 : }
     130             : 
     131             : // Count the number of 1 bits.
     132    26218176 : INLINE unsigned int popcnt(uint64_t x) {
     133    26218176 : #if defined(__GNUC__) || defined(__clang__)
     134    26218176 :   return (unsigned int)__builtin_popcountll(x);
     135             : #else
     136             :   unsigned int count = 0;
     137             :   while (x != 0) {
     138             :     count += 1;
     139             :     x &= x - 1;
     140             :   }
     141             :   return count;
     142             : #endif
     143    26218176 : }
     144             : 
     145             : // Largest power of two less than or equal to x. As a special case, returns 1
     146             : // when x is 0.
     147     1523099 : INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
     148     1523099 :   return 1ULL << highest_one(x | 1);
     149     1523099 : }
     150             : 
     151    77952680 : INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
     152             : 
     153    77952680 : INLINE uint32_t counter_high(uint64_t counter) {
     154    77952680 :   return (uint32_t)(counter >> 32);
     155    77952680 : }
     156             : 
     157        9760 : INLINE uint32_t load32(const void *src) {
     158        9760 :   const uint8_t *p = (const uint8_t *)src;
     159        9760 :   return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
     160        9760 :          ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
     161        9760 : }
     162             : 
     163             : INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
     164           0 :                            uint32_t key_words[8]) {
     165           0 :   key_words[0] = load32(&key[0 * 4]);
     166           0 :   key_words[1] = load32(&key[1 * 4]);
     167           0 :   key_words[2] = load32(&key[2 * 4]);
     168           0 :   key_words[3] = load32(&key[3 * 4]);
     169           0 :   key_words[4] = load32(&key[4 * 4]);
     170           0 :   key_words[5] = load32(&key[5 * 4]);
     171           0 :   key_words[6] = load32(&key[6 * 4]);
     172           0 :   key_words[7] = load32(&key[7 * 4]);
     173           0 : }
     174             : 
     175             : INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN],
     176         610 :                              uint32_t block_words[16]) {
     177       10370 :   for (size_t i = 0; i < 16; i++) {
     178        9760 :       block_words[i] = load32(&block[i * 4]);
     179        9760 :   }
     180         610 : }
     181             : 
     182       59424 : INLINE void store32(void *dst, uint32_t w) {
     183       59424 :   uint8_t *p = (uint8_t *)dst;
     184       59424 :   p[0] = (uint8_t)(w >> 0);
     185       59424 :   p[1] = (uint8_t)(w >> 8);
     186       59424 :   p[2] = (uint8_t)(w >> 16);
     187       59424 :   p[3] = (uint8_t)(w >> 24);
     188       59424 : }
     189             : 
     190        7428 : INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
     191        7428 :   store32(&bytes_out[0 * 4], cv_words[0]);
     192        7428 :   store32(&bytes_out[1 * 4], cv_words[1]);
     193        7428 :   store32(&bytes_out[2 * 4], cv_words[2]);
     194        7428 :   store32(&bytes_out[3 * 4], cv_words[3]);
     195        7428 :   store32(&bytes_out[4 * 4], cv_words[4]);
     196        7428 :   store32(&bytes_out[5 * 4], cv_words[5]);
     197        7428 :   store32(&bytes_out[6 * 4], cv_words[6]);
     198        7428 :   store32(&bytes_out[7 * 4], cv_words[7]);
     199        7428 : }
     200             : 
     201             : void blake3_compress_in_place(uint32_t cv[8],
     202             :                               const uint8_t block[BLAKE3_BLOCK_LEN],
     203             :                               uint8_t block_len, uint64_t counter,
     204             :                               uint8_t flags);
     205             : 
     206             : void blake3_compress_xof(const uint32_t cv[8],
     207             :                          const uint8_t block[BLAKE3_BLOCK_LEN],
     208             :                          uint8_t block_len, uint64_t counter, uint8_t flags,
     209             :                          uint8_t out[64]);
     210             : 
     211             : void blake3_xof_many(const uint32_t cv[8],
     212             :                      const uint8_t block[BLAKE3_BLOCK_LEN],
     213             :                      uint8_t block_len, uint64_t counter, uint8_t flags,
     214             :                      uint8_t out[64], size_t outblocks);
     215             : 
     216             : void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
     217             :                       size_t blocks, const uint32_t key[8], uint64_t counter,
     218             :                       bool increment_counter, uint8_t flags,
     219             :                       uint8_t flags_start, uint8_t flags_end, uint8_t *out);
     220             : 
     221             : size_t blake3_simd_degree(void);
     222             : 
     223             : // Declarations for implementation-specific functions.
     224             : void blake3_compress_in_place_portable(uint32_t cv[8],
     225             :                                        const uint8_t block[BLAKE3_BLOCK_LEN],
     226             :                                        uint8_t block_len, uint64_t counter,
     227             :                                        uint8_t flags);
     228             : 
     229             : void blake3_compress_xof_portable(const uint32_t cv[8],
     230             :                                   const uint8_t block[BLAKE3_BLOCK_LEN],
     231             :                                   uint8_t block_len, uint64_t counter,
     232             :                                   uint8_t flags, uint8_t out[64]);
     233             : 
     234             : void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
     235             :                                size_t blocks, const uint32_t key[8],
     236             :                                uint64_t counter, bool increment_counter,
     237             :                                uint8_t flags, uint8_t flags_start,
     238             :                                uint8_t flags_end, uint8_t *out);
     239             : 
     240             : #if defined(IS_X86)
     241             : #if !defined(BLAKE3_NO_SSE2)
     242             : void blake3_compress_in_place_sse2(uint32_t cv[8],
     243             :                                    const uint8_t block[BLAKE3_BLOCK_LEN],
     244             :                                    uint8_t block_len, uint64_t counter,
     245             :                                    uint8_t flags);
     246             : void blake3_compress_xof_sse2(const uint32_t cv[8],
     247             :                               const uint8_t block[BLAKE3_BLOCK_LEN],
     248             :                               uint8_t block_len, uint64_t counter,
     249             :                               uint8_t flags, uint8_t out[64]);
     250             : void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
     251             :                            size_t blocks, const uint32_t key[8],
     252             :                            uint64_t counter, bool increment_counter,
     253             :                            uint8_t flags, uint8_t flags_start,
     254             :                            uint8_t flags_end, uint8_t *out);
     255             : #endif
     256             : #if !defined(BLAKE3_NO_SSE41)
     257             : void blake3_compress_in_place_sse41(uint32_t cv[8],
     258             :                                     const uint8_t block[BLAKE3_BLOCK_LEN],
     259             :                                     uint8_t block_len, uint64_t counter,
     260             :                                     uint8_t flags);
     261             : void blake3_compress_xof_sse41(const uint32_t cv[8],
     262             :                                const uint8_t block[BLAKE3_BLOCK_LEN],
     263             :                                uint8_t block_len, uint64_t counter,
     264             :                                uint8_t flags, uint8_t out[64]);
     265             : void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
     266             :                             size_t blocks, const uint32_t key[8],
     267             :                             uint64_t counter, bool increment_counter,
     268             :                             uint8_t flags, uint8_t flags_start,
     269             :                             uint8_t flags_end, uint8_t *out);
     270             : #endif
     271             : #if !defined(BLAKE3_NO_AVX2)
     272             : void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
     273             :                            size_t blocks, const uint32_t key[8],
     274             :                            uint64_t counter, bool increment_counter,
     275             :                            uint8_t flags, uint8_t flags_start,
     276             :                            uint8_t flags_end, uint8_t *out);
     277             : #endif
     278             : #if !defined(BLAKE3_NO_AVX512)
     279             : void blake3_compress_in_place_avx512(uint32_t cv[8],
     280             :                                      const uint8_t block[BLAKE3_BLOCK_LEN],
     281             :                                      uint8_t block_len, uint64_t counter,
     282             :                                      uint8_t flags);
     283             : 
     284             : void blake3_compress_xof_avx512(const uint32_t cv[8],
     285             :                                 const uint8_t block[BLAKE3_BLOCK_LEN],
     286             :                                 uint8_t block_len, uint64_t counter,
     287             :                                 uint8_t flags, uint8_t out[64]);
     288             : 
     289             : void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
     290             :                              size_t blocks, const uint32_t key[8],
     291             :                              uint64_t counter, bool increment_counter,
     292             :                              uint8_t flags, uint8_t flags_start,
     293             :                              uint8_t flags_end, uint8_t *out);
     294             : 
     295             : #if !defined(_WIN32)
     296             : void blake3_xof_many_avx512(const uint32_t cv[8],
     297             :                             const uint8_t block[BLAKE3_BLOCK_LEN],
     298             :                             uint8_t block_len, uint64_t counter, uint8_t flags,
     299             :                             uint8_t* out, size_t outblocks);
     300             : #endif
     301             : #endif
     302             : #endif
     303             : 
     304             : #if BLAKE3_USE_NEON == 1
     305             : void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
     306             :                            size_t blocks, const uint32_t key[8],
     307             :                            uint64_t counter, bool increment_counter,
     308             :                            uint8_t flags, uint8_t flags_start,
     309             :                            uint8_t flags_end, uint8_t *out);
     310             : #endif
     311             : 
     312             : #ifdef __cplusplus
     313             : }
     314             : #endif
     315             : 
     316             : #endif /* BLAKE3_IMPL_H */

Generated by: LCOV version 1.14