LCOV - code coverage report
Current view: top level - ballet/blake3 - blake3_avx512.c (source / functions)
Test: cov.lcov
Date: 2025-08-21 04:41:08
                 Hit    Total   Coverage
Lines:          1067     1164     91.7 %
Functions:        56       58     96.6 %

          Line data    Source code
       1             : 
       2             : // Source originally from https://github.com/BLAKE3-team/BLAKE3
       3             : // From commit: 80b83effbd50425939483e5503b186db4dac4d9d
       4             : 
       5             : #include "blake3_impl.h"
       6             : 
       7             : #include <immintrin.h>
       8             : 
       9             : #define _mm_shuffle_ps2(a, b, c)                                               \
      10   415643520 :   (_mm_castps_si128(                                                           \
      11   415643520 :       _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
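                     :
                     : // _mm_shuffle_ps2(a, b, c) selects two 32-bit words from `a` into the low
                     : // half of the result and two words from `b` into the high half, using the
                     : // usual _MM_SHUFFLE immediate. For example (an illustrative sketch, not
                     : // part of the original source):
                     : //   _mm_shuffle_ps2(a, b, _MM_SHUFFLE(3, 1, 3, 1)) -> { a1, a3, b1, b3 }
                     : // (lanes listed low to high), which is how the round function below
                     : // gathers alternating message words.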
      12             : 
      13   190816016 : INLINE __m128i loadu_128(const uint8_t src[16]) {
      14   190816016 :   return _mm_loadu_si128((void*)src);
      15   190816016 : }
      16             : 
      17     9002528 : INLINE __m256i loadu_256(const uint8_t src[32]) {
      18     9002528 :   return _mm256_loadu_si256((void*)src);
      19     9002528 : }
      20             : 
      21    44367664 : INLINE __m512i loadu_512(const uint8_t src[64]) {
      22    44367664 :   return _mm512_loadu_si512((void*)src);
      23    44367664 : }
      24             : 
      25    69959264 : INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
      26    69959264 :   _mm_storeu_si128((void*)dest, src);
      27    69959264 : }
      28             : 
      29      529264 : INLINE void storeu_256(__m256i src, uint8_t dest[32]) {
      30      529264 :   _mm256_storeu_si256((void*)dest, src);
      31      529264 : }
      32             : 
      33        9760 : INLINE void storeu_512(__m512i src, uint8_t dest[64]) {
      34        9760 :   _mm512_storeu_si512((void*)dest, src);
      35        9760 : }
      36             : 
      37  2560227264 : INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
      38             : 
      39   189053088 : INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
      40             : 
      41   931925904 : INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }
      42             : 
      43  1784720960 : INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
      44             : 
      45   130536656 : INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
      46             : 
      47   643477528 : INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }
      48             : 
      49     7810596 : INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
      50             : 
      51     3905212 : INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
      52             : 
      53    21409198 : INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); }
      54             : 
      55    51955440 : INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
      56    51955440 :   return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
      57    51955440 : }
      58             : 
      59   426704544 : INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); }
      60             : 
      61    31508848 : INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); }
      62             : 
      63   155320984 : INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); }
      64             : 
      65   426704544 : INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }
      66             : 
      67    31508848 : INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }
      68             : 
      69   155320984 : INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }
      70             : 
      71   426704544 : INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }
      72             : 
      73    31508848 : INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }
      74             : 
      75   155320984 : INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }
      76             : 
      77   426704544 : INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }
      78             : 
      79    31508848 : INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }
      80             : 
      81   155320984 : INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }
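                     :
                     : // The single-instruction rotates above (_mm_ror_epi32 and friends) are
                     : // AVX-512 instructions (the 128- and 256-bit forms additionally need
                     : // AVX512VL); the SSE/AVX2 BLAKE3 kernels emulate the same right-rotations
                     : // with shift and shuffle sequences. The scalar operation, for reference (a
                     : // sketch matching rotr32 in blake3_impl.h):
                     : //   static inline uint32_t rotr32(uint32_t w, uint32_t c) {
                     : //     return (w >> c) | (w << (32 - c));
                     : //   }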
      82             : 
      83             : /*
      84             :  * ----------------------------------------------------------------------------
      85             :  * compress_avx512
      86             :  * ----------------------------------------------------------------------------
      87             :  */
      88             : 
      89             : INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
      90   363688080 :                __m128i m) {
      91   363688080 :   *row0 = add_128(add_128(*row0, m), *row1);
      92   363688080 :   *row3 = xor_128(*row3, *row0);
      93   363688080 :   *row3 = rot16_128(*row3);
      94   363688080 :   *row2 = add_128(*row2, *row3);
      95   363688080 :   *row1 = xor_128(*row1, *row2);
      96   363688080 :   *row1 = rot12_128(*row1);
      97   363688080 : }
      98             : 
      99             : INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
     100   363688080 :                __m128i m) {
     101   363688080 :   *row0 = add_128(add_128(*row0, m), *row1);
     102   363688080 :   *row3 = xor_128(*row3, *row0);
     103   363688080 :   *row3 = rot8_128(*row3);
     104   363688080 :   *row2 = add_128(*row2, *row3);
     105   363688080 :   *row1 = xor_128(*row1, *row2);
     106   363688080 :   *row1 = rot7_128(*row1);
     107   363688080 : }
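                     :
                     : // g1 and g2 together are one application of the BLAKE3 G function,
                     : // vectorized so that one call mixes all four columns (or, between
                     : // diagonalize/undiagonalize, all four diagonals) of the 4x4 state at once.
                     : // For reference, the scalar G function (as in the portable implementation)
                     : // is:
                     : //   INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
                     : //                 uint32_t x, uint32_t y) {
                     : //     state[a] = state[a] + state[b] + x;
                     : //     state[d] = rotr32(state[d] ^ state[a], 16);
                     : //     state[c] = state[c] + state[d];
                     : //     state[b] = rotr32(state[b] ^ state[c], 12);
                     : //     state[a] = state[a] + state[b] + y;
                     : //     state[d] = rotr32(state[d] ^ state[a], 8);
                     : //     state[c] = state[c] + state[d];
                     : //     state[b] = rotr32(state[b] ^ state[c], 7);
                     : //   }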
     108             : 
     109             : // Note the optimization here of leaving row1 as the unrotated row, rather than
     110             : // row0. All the message loads below are adjusted to compensate for this. See
     111             : // discussion at https://github.com/sneves/blake2-avx2/pull/4
     112   181844040 : INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
     113   181844040 :   *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
     114   181844040 :   *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
     115   181844040 :   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
     116   181844040 : }
     117             : 
     118   181844040 : INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
     119   181844040 :   *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
     120   181844040 :   *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
     121   181844040 :   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
     122   181844040 : }
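                     :
                     : // Concretely (lanes listed low to high): diagonalize turns row0
                     : // {a0 a1 a2 a3} into {a3 a0 a1 a2}, row3 {d0 d1 d2 d3} into {d2 d3 d0 d1},
                     : // and row2 {c0 c1 c2 c3} into {c1 c2 c3 c0}, so that each SIMD lane then
                     : // holds one diagonal of the 4x4 state; undiagonalize applies the inverse
                     : // rotations.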
     123             : 
     124             : INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
     125             :                          const uint8_t block[BLAKE3_BLOCK_LEN],
     126    25977720 :                          uint8_t block_len, uint64_t counter, uint8_t flags) {
     127    25977720 :   rows[0] = loadu_128((uint8_t *)&cv[0]);
     128    25977720 :   rows[1] = loadu_128((uint8_t *)&cv[4]);
     129    25977720 :   rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
     130    25977720 :   rows[3] = set4(counter_low(counter), counter_high(counter),
     131    25977720 :                  (uint32_t)block_len, (uint32_t)flags);
     132             : 
     133    25977720 :   __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
     134    25977720 :   __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
     135    25977720 :   __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
     136    25977720 :   __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
     137             : 
     138    25977720 :   __m128i t0, t1, t2, t3, tt;
     139             : 
     140             :   // Round 1. The first round permutes the message words from the original
     141             :   // input order, into the groups that get mixed in parallel.
     142    25977720 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
     143    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     144    25977720 :   t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
     145    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     146    25977720 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     147    25977720 :   t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
     148    25977720 :   t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
     149    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     150    25977720 :   t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
     151    25977720 :   t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
     152    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     153    25977720 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     154    25977720 :   m0 = t0;
     155    25977720 :   m1 = t1;
     156    25977720 :   m2 = t2;
     157    25977720 :   m3 = t3;
     158             : 
     159             :   // Round 2. This round and all following rounds apply a fixed permutation
     160             :   // to the message words from the round before.
     161    25977720 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     162    25977720 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     163    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     164    25977720 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     165    25977720 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     166    25977720 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     167    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     168    25977720 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     169    25977720 :   t2 = _mm_unpacklo_epi64(m3, m1);
     170    25977720 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     171    25977720 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     172    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     173    25977720 :   t3 = _mm_unpackhi_epi32(m1, m3);
     174    25977720 :   tt = _mm_unpacklo_epi32(m2, t3);
     175    25977720 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     176    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     177    25977720 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     178    25977720 :   m0 = t0;
     179    25977720 :   m1 = t1;
     180    25977720 :   m2 = t2;
     181    25977720 :   m3 = t3;
     182             : 
     183             :   // Round 3
     184    25977720 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     185    25977720 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     186    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     187    25977720 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     188    25977720 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     189    25977720 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     190    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     191    25977720 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     192    25977720 :   t2 = _mm_unpacklo_epi64(m3, m1);
     193    25977720 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     194    25977720 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     195    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     196    25977720 :   t3 = _mm_unpackhi_epi32(m1, m3);
     197    25977720 :   tt = _mm_unpacklo_epi32(m2, t3);
     198    25977720 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     199    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     200    25977720 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     201    25977720 :   m0 = t0;
     202    25977720 :   m1 = t1;
     203    25977720 :   m2 = t2;
     204    25977720 :   m3 = t3;
     205             : 
     206             :   // Round 4
     207    25977720 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     208    25977720 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     209    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     210    25977720 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     211    25977720 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     212    25977720 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     213    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     214    25977720 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     215    25977720 :   t2 = _mm_unpacklo_epi64(m3, m1);
     216    25977720 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     217    25977720 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     218    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     219    25977720 :   t3 = _mm_unpackhi_epi32(m1, m3);
     220    25977720 :   tt = _mm_unpacklo_epi32(m2, t3);
     221    25977720 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     222    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     223    25977720 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     224    25977720 :   m0 = t0;
     225    25977720 :   m1 = t1;
     226    25977720 :   m2 = t2;
     227    25977720 :   m3 = t3;
     228             : 
     229             :   // Round 5
     230    25977720 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     231    25977720 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     232    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     233    25977720 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     234    25977720 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     235    25977720 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     236    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     237    25977720 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     238    25977720 :   t2 = _mm_unpacklo_epi64(m3, m1);
     239    25977720 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     240    25977720 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     241    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     242    25977720 :   t3 = _mm_unpackhi_epi32(m1, m3);
     243    25977720 :   tt = _mm_unpacklo_epi32(m2, t3);
     244    25977720 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     245    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     246    25977720 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     247    25977720 :   m0 = t0;
     248    25977720 :   m1 = t1;
     249    25977720 :   m2 = t2;
     250    25977720 :   m3 = t3;
     251             : 
     252             :   // Round 6
     253    25977720 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     254    25977720 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     255    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     256    25977720 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     257    25977720 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     258    25977720 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     259    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     260    25977720 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     261    25977720 :   t2 = _mm_unpacklo_epi64(m3, m1);
     262    25977720 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     263    25977720 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     264    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     265    25977720 :   t3 = _mm_unpackhi_epi32(m1, m3);
     266    25977720 :   tt = _mm_unpacklo_epi32(m2, t3);
     267    25977720 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     268    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     269    25977720 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     270    25977720 :   m0 = t0;
     271    25977720 :   m1 = t1;
     272    25977720 :   m2 = t2;
     273    25977720 :   m3 = t3;
     274             : 
     275             :   // Round 7
     276    25977720 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     277    25977720 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     278    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     279    25977720 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     280    25977720 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     281    25977720 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     282    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     283    25977720 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     284    25977720 :   t2 = _mm_unpacklo_epi64(m3, m1);
     285    25977720 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     286    25977720 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     287    25977720 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     288    25977720 :   t3 = _mm_unpackhi_epi32(m1, m3);
     289    25977720 :   tt = _mm_unpacklo_epi32(m2, t3);
     290    25977720 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     291    25977720 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     292    25977720 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     293    25977720 : }
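                     :
                     : // Rounds 2 through 7 repeat the same shuffle sequence because BLAKE3
                     : // applies one fixed permutation to the sixteen message words between every
                     : // round. In index form (a sketch; this is the permutation from which the
                     : // MSG_SCHEDULE table used by the hash functions below is built):
                     : //   static const uint8_t MSG_PERM[16] =
                     : //       {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8};
                     : //   // permuted_words[i] = words[MSG_PERM[i]]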
     294             : 
     295             : void blake3_compress_xof_avx512(const uint32_t cv[8],
     296             :                                 const uint8_t block[BLAKE3_BLOCK_LEN],
     297             :                                 uint8_t block_len, uint64_t counter,
     298     8472496 :                                 uint8_t flags, uint8_t out[64]) {
     299     8472496 :   __m128i rows[4];
     300     8472496 :   compress_pre(rows, cv, block, block_len, counter, flags);
     301     8472496 :   storeu_128(xor_128(rows[0], rows[2]), &out[0]);
     302     8472496 :   storeu_128(xor_128(rows[1], rows[3]), &out[16]);
     303     8472496 :   storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
     304     8472496 :   storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
     305     8472496 : }
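                     :
                     : // The first 32 bytes of `out` are the ordinary compression result (the new
                     : // chaining value); the second 32 bytes feed the input CV forward, which is
                     : // only needed for extended (XOF) output. A usage sketch of that
                     : // relationship:
                     : //   uint32_t cv2[8];  memcpy(cv2, cv, sizeof cv2);
                     : //   blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
                     : //   blake3_compress_in_place_avx512(cv2, block, block_len, counter, flags);
                     : //   // memcmp(out, cv2, 32) == 0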
     306             : 
     307             : void blake3_compress_in_place_avx512(uint32_t cv[8],
     308             :                                      const uint8_t block[BLAKE3_BLOCK_LEN],
     309             :                                      uint8_t block_len, uint64_t counter,
     310    17505224 :                                      uint8_t flags) {
     311    17505224 :   __m128i rows[4];
     312    17505224 :   compress_pre(rows, cv, block, block_len, counter, flags);
     313    17505224 :   storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
     314    17505224 :   storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
     315    17505224 : }
     316             : 
     317             : /*
     318             :  * ----------------------------------------------------------------------------
     319             :  * hash4_avx512
     320             :  * ----------------------------------------------------------------------------
     321             :  */
     322             : 
     323     7877058 : INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
     324     7877058 :   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
     325     7877058 :   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
     326     7877058 :   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
     327     7877058 :   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
     328     7877058 :   v[0] = add_128(v[0], v[4]);
     329     7877058 :   v[1] = add_128(v[1], v[5]);
     330     7877058 :   v[2] = add_128(v[2], v[6]);
     331     7877058 :   v[3] = add_128(v[3], v[7]);
     332     7877058 :   v[12] = xor_128(v[12], v[0]);
     333     7877058 :   v[13] = xor_128(v[13], v[1]);
     334     7877058 :   v[14] = xor_128(v[14], v[2]);
     335     7877058 :   v[15] = xor_128(v[15], v[3]);
     336     7877058 :   v[12] = rot16_128(v[12]);
     337     7877058 :   v[13] = rot16_128(v[13]);
     338     7877058 :   v[14] = rot16_128(v[14]);
     339     7877058 :   v[15] = rot16_128(v[15]);
     340     7877058 :   v[8] = add_128(v[8], v[12]);
     341     7877058 :   v[9] = add_128(v[9], v[13]);
     342     7877058 :   v[10] = add_128(v[10], v[14]);
     343     7877058 :   v[11] = add_128(v[11], v[15]);
     344     7877058 :   v[4] = xor_128(v[4], v[8]);
     345     7877058 :   v[5] = xor_128(v[5], v[9]);
     346     7877058 :   v[6] = xor_128(v[6], v[10]);
     347     7877058 :   v[7] = xor_128(v[7], v[11]);
     348     7877058 :   v[4] = rot12_128(v[4]);
     349     7877058 :   v[5] = rot12_128(v[5]);
     350     7877058 :   v[6] = rot12_128(v[6]);
     351     7877058 :   v[7] = rot12_128(v[7]);
     352     7877058 :   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
     353     7877058 :   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
     354     7877058 :   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
     355     7877058 :   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
     356     7877058 :   v[0] = add_128(v[0], v[4]);
     357     7877058 :   v[1] = add_128(v[1], v[5]);
     358     7877058 :   v[2] = add_128(v[2], v[6]);
     359     7877058 :   v[3] = add_128(v[3], v[7]);
     360     7877058 :   v[12] = xor_128(v[12], v[0]);
     361     7877058 :   v[13] = xor_128(v[13], v[1]);
     362     7877058 :   v[14] = xor_128(v[14], v[2]);
     363     7877058 :   v[15] = xor_128(v[15], v[3]);
     364     7877058 :   v[12] = rot8_128(v[12]);
     365     7877058 :   v[13] = rot8_128(v[13]);
     366     7877058 :   v[14] = rot8_128(v[14]);
     367     7877058 :   v[15] = rot8_128(v[15]);
     368     7877058 :   v[8] = add_128(v[8], v[12]);
     369     7877058 :   v[9] = add_128(v[9], v[13]);
     370     7877058 :   v[10] = add_128(v[10], v[14]);
     371     7877058 :   v[11] = add_128(v[11], v[15]);
     372     7877058 :   v[4] = xor_128(v[4], v[8]);
     373     7877058 :   v[5] = xor_128(v[5], v[9]);
     374     7877058 :   v[6] = xor_128(v[6], v[10]);
     375     7877058 :   v[7] = xor_128(v[7], v[11]);
     376     7877058 :   v[4] = rot7_128(v[4]);
     377     7877058 :   v[5] = rot7_128(v[5]);
     378     7877058 :   v[6] = rot7_128(v[6]);
     379     7877058 :   v[7] = rot7_128(v[7]);
     380             : 
     381     7877058 :   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
     382     7877058 :   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
     383     7877058 :   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
     384     7877058 :   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
     385     7877058 :   v[0] = add_128(v[0], v[5]);
     386     7877058 :   v[1] = add_128(v[1], v[6]);
     387     7877058 :   v[2] = add_128(v[2], v[7]);
     388     7877058 :   v[3] = add_128(v[3], v[4]);
     389     7877058 :   v[15] = xor_128(v[15], v[0]);
     390     7877058 :   v[12] = xor_128(v[12], v[1]);
     391     7877058 :   v[13] = xor_128(v[13], v[2]);
     392     7877058 :   v[14] = xor_128(v[14], v[3]);
     393     7877058 :   v[15] = rot16_128(v[15]);
     394     7877058 :   v[12] = rot16_128(v[12]);
     395     7877058 :   v[13] = rot16_128(v[13]);
     396     7877058 :   v[14] = rot16_128(v[14]);
     397     7877058 :   v[10] = add_128(v[10], v[15]);
     398     7877058 :   v[11] = add_128(v[11], v[12]);
     399     7877058 :   v[8] = add_128(v[8], v[13]);
     400     7877058 :   v[9] = add_128(v[9], v[14]);
     401     7877058 :   v[5] = xor_128(v[5], v[10]);
     402     7877058 :   v[6] = xor_128(v[6], v[11]);
     403     7877058 :   v[7] = xor_128(v[7], v[8]);
     404     7877058 :   v[4] = xor_128(v[4], v[9]);
     405     7877058 :   v[5] = rot12_128(v[5]);
     406     7877058 :   v[6] = rot12_128(v[6]);
     407     7877058 :   v[7] = rot12_128(v[7]);
     408     7877058 :   v[4] = rot12_128(v[4]);
     409     7877058 :   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
     410     7877058 :   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
     411     7877058 :   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
     412     7877058 :   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
     413     7877058 :   v[0] = add_128(v[0], v[5]);
     414     7877058 :   v[1] = add_128(v[1], v[6]);
     415     7877058 :   v[2] = add_128(v[2], v[7]);
     416     7877058 :   v[3] = add_128(v[3], v[4]);
     417     7877058 :   v[15] = xor_128(v[15], v[0]);
     418     7877058 :   v[12] = xor_128(v[12], v[1]);
     419     7877058 :   v[13] = xor_128(v[13], v[2]);
     420     7877058 :   v[14] = xor_128(v[14], v[3]);
     421     7877058 :   v[15] = rot8_128(v[15]);
     422     7877058 :   v[12] = rot8_128(v[12]);
     423     7877058 :   v[13] = rot8_128(v[13]);
     424     7877058 :   v[14] = rot8_128(v[14]);
     425     7877058 :   v[10] = add_128(v[10], v[15]);
     426     7877058 :   v[11] = add_128(v[11], v[12]);
     427     7877058 :   v[8] = add_128(v[8], v[13]);
     428     7877058 :   v[9] = add_128(v[9], v[14]);
     429     7877058 :   v[5] = xor_128(v[5], v[10]);
     430     7877058 :   v[6] = xor_128(v[6], v[11]);
     431     7877058 :   v[7] = xor_128(v[7], v[8]);
     432     7877058 :   v[4] = xor_128(v[4], v[9]);
     433     7877058 :   v[5] = rot7_128(v[5]);
     434     7877058 :   v[6] = rot7_128(v[6]);
     435     7877058 :   v[7] = rot7_128(v[7]);
     436     7877058 :   v[4] = rot7_128(v[4]);
     437     7877058 : }
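                     :
                     : // One round_fn4 call is a full BLAKE3 round for four independent inputs.
                     : // The first half mixes the columns of each state, the second half the
                     : // diagonals; because the states live transposed across the vectors (word i
                     : // of every input in v[i]), the diagonal step only needs the rotated
                     : // operand ordering, not diagonalize/undiagonalize shuffles.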
     438             : 
     439     4765884 : INLINE void transpose_vecs_128(__m128i vecs[4]) {
     440             :   // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
     441             :   // 22/33. Note that this doesn't split the vector into two lanes, as the
     442             :   // AVX2 counterparts do.
     443     4765884 :   __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
     444     4765884 :   __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
     445     4765884 :   __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
     446     4765884 :   __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
     447             : 
     448             :   // Interleave 64-bit lanes.
     449     4765884 :   __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
     450     4765884 :   __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
     451     4765884 :   __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
     452     4765884 :   __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
     453             : 
     454     4765884 :   vecs[0] = abcd_0;
     455     4765884 :   vecs[1] = abcd_1;
     456     4765884 :   vecs[2] = abcd_2;
     457     4765884 :   vecs[3] = abcd_3;
     458     4765884 : }
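                     :
                     : // Effect, as a sketch: treating vecs[0..3] as the rows of a 4x4 matrix of
                     : // 32-bit words, this is a full transpose:
                     : //   {a0 a1 a2 a3}      {a0 b0 c0 d0}
                     : //   {b0 b1 b2 b3}  ->  {a1 b1 c1 d1}
                     : //   {c0 c1 c2 c3}      {a2 b2 c2 d2}
                     : //   {d0 d1 d2 d3}      {a3 b3 c3 d3}
                     : // so each output vector holds the word with the same index from all four
                     : // inputs.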
     459             : 
     460             : INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
     461     1125294 :                                 size_t block_offset, __m128i out[16]) {
     462     1125294 :   out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
     463     1125294 :   out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
     464     1125294 :   out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
     465     1125294 :   out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
     466     1125294 :   out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
     467     1125294 :   out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
     468     1125294 :   out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
     469     1125294 :   out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
     470     1125294 :   out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
     471     1125294 :   out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
     472     1125294 :   out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
     473     1125294 :   out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
     474     1125294 :   out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
     475     1125294 :   out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
     476     1125294 :   out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
     477     1125294 :   out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
     478     5626470 :   for (size_t i = 0; i < 4; ++i) {
     479     4501176 :     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
     480     4501176 :   }
     481     1125294 :   transpose_vecs_128(&out[0]);
     482     1125294 :   transpose_vecs_128(&out[4]);
     483     1125294 :   transpose_vecs_128(&out[8]);
     484     1125294 :   transpose_vecs_128(&out[12]);
     485     1125294 : }
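                     :
                     : // The prefetches above touch each input 256 bytes (4 * BLAKE3_BLOCK_LEN)
                     : // past the current block, warming the cache for upcoming iterations of the
                     : // block loop in blake3_hash4_avx512.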
     486             : 
     487             : INLINE void load_counters4(uint64_t counter, bool increment_counter,
     488      132354 :                            __m128i *out_lo, __m128i *out_hi) {
     489      132354 :   int64_t mask = (increment_counter ? ~0 : 0);
     490      132354 :   __m256i mask_vec = _mm256_set1_epi64x(mask);
     491      132354 :   __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
     492      132354 :   deltas = _mm256_and_si256(mask_vec, deltas);
     493      132354 :   __m256i counters =
     494      132354 :       _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
     495      132354 :   *out_lo = _mm256_cvtepi64_epi32(counters);
     496      132354 :   *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
     497      132354 : }
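                     :
                     : // Scalar equivalent, as a sketch: lane i of the two outputs gets the low
                     : // and high halves of counter + i (or of counter unchanged when
                     : // increment_counter is false):
                     : //   for (size_t i = 0; i < 4; i++) {
                     : //     uint64_t t = counter + (increment_counter ? (uint64_t)i : 0);
                     : //     lo[i] = (uint32_t)t;
                     : //     hi[i] = (uint32_t)(t >> 32);
                     : //   }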
     498             : 
     499             : static
     500             : void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
     501             :                          const uint32_t key[8], uint64_t counter,
     502             :                          bool increment_counter, uint8_t flags,
     503      132354 :                          uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
     504      132354 :   __m128i h_vecs[8] = {
     505      132354 :       set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
     506      132354 :       set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
     507      132354 :   };
     508      132354 :   __m128i counter_low_vec, counter_high_vec;
     509      132354 :   load_counters4(counter, increment_counter, &counter_low_vec,
     510      132354 :                  &counter_high_vec);
     511      132354 :   uint8_t block_flags = flags | flags_start;
     512             : 
     513     1257648 :   for (size_t block = 0; block < blocks; block++) {
     514     1125294 :     if (block + 1 == blocks) {
     515      132354 :       block_flags |= flags_end;
     516      132354 :     }
     517     1125294 :     __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
     518     1125294 :     __m128i block_flags_vec = set1_128(block_flags);
     519     1125294 :     __m128i msg_vecs[16];
     520     1125294 :     transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
     521             : 
     522     1125294 :     __m128i v[16] = {
     523     1125294 :         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
     524     1125294 :         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
     525     1125294 :         set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
     526     1125294 :         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     527     1125294 :     };
     528     1125294 :     round_fn4(v, msg_vecs, 0);
     529     1125294 :     round_fn4(v, msg_vecs, 1);
     530     1125294 :     round_fn4(v, msg_vecs, 2);
     531     1125294 :     round_fn4(v, msg_vecs, 3);
     532     1125294 :     round_fn4(v, msg_vecs, 4);
     533     1125294 :     round_fn4(v, msg_vecs, 5);
     534     1125294 :     round_fn4(v, msg_vecs, 6);
     535     1125294 :     h_vecs[0] = xor_128(v[0], v[8]);
     536     1125294 :     h_vecs[1] = xor_128(v[1], v[9]);
     537     1125294 :     h_vecs[2] = xor_128(v[2], v[10]);
     538     1125294 :     h_vecs[3] = xor_128(v[3], v[11]);
     539     1125294 :     h_vecs[4] = xor_128(v[4], v[12]);
     540     1125294 :     h_vecs[5] = xor_128(v[5], v[13]);
     541     1125294 :     h_vecs[6] = xor_128(v[6], v[14]);
     542     1125294 :     h_vecs[7] = xor_128(v[7], v[15]);
     543             : 
     544     1125294 :     block_flags = flags;
     545     1125294 :   }
     546             : 
     547      132354 :   transpose_vecs_128(&h_vecs[0]);
     548      132354 :   transpose_vecs_128(&h_vecs[4]);
     549             :   // The first four vecs now contain the first half of each output, and the
     550             :   // second four vecs contain the second half of each output.
     551      132354 :   storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
     552      132354 :   storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
     553      132354 :   storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
     554      132354 :   storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
     555      132354 :   storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
     556      132354 :   storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
     557      132354 :   storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
     558      132354 :   storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
     559      132354 : }
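                     :
                     : // After the final transpose-and-store sequence, `out` holds the four
                     : // 32-byte chaining values contiguously: bytes [0,32) for inputs[0],
                     : // [32,64) for inputs[1], and so on (128 bytes total).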
     560             : 
     561             : static
     562             : void blake3_xof4_avx512(const uint32_t cv[8],
     563             :                         const uint8_t block[BLAKE3_BLOCK_LEN],
     564             :                         uint8_t block_len, uint64_t counter, uint8_t flags,
     565           0 :                         uint8_t out[4 * 64]) {
     566           0 :   __m128i h_vecs[8] = {
     567           0 :       set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
     568           0 :       set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
     569           0 :   };
     570           0 :   uint32_t block_words[16];
     571           0 :   load_block_words(block, block_words);
     572           0 :   __m128i msg_vecs[16];
     573           0 :   for (size_t i = 0; i < 16; i++) {
     574           0 :       msg_vecs[i] = set1_128(block_words[i]);
     575           0 :   }
     576           0 :   __m128i counter_low_vec, counter_high_vec;
     577           0 :   load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
     578           0 :   __m128i block_len_vec = set1_128(block_len);
     579           0 :   __m128i block_flags_vec = set1_128(flags);
     580           0 :   __m128i v[16] = {
     581           0 :       h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
     582           0 :       h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
     583           0 :       set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
     584           0 :       counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     585           0 :   };
     586           0 :   round_fn4(v, msg_vecs, 0);
     587           0 :   round_fn4(v, msg_vecs, 1);
     588           0 :   round_fn4(v, msg_vecs, 2);
     589           0 :   round_fn4(v, msg_vecs, 3);
     590           0 :   round_fn4(v, msg_vecs, 4);
     591           0 :   round_fn4(v, msg_vecs, 5);
     592           0 :   round_fn4(v, msg_vecs, 6);
     593           0 :   for (size_t i = 0; i < 8; i++) {
     594           0 :       v[i] = xor_128(v[i], v[i+8]);
     595           0 :       v[i+8] = xor_128(v[i+8], h_vecs[i]);
     596           0 :   }
     597           0 :   transpose_vecs_128(&v[0]);
     598           0 :   transpose_vecs_128(&v[4]);
     599           0 :   transpose_vecs_128(&v[8]);
     600           0 :   transpose_vecs_128(&v[12]);
     601           0 :   for (size_t i = 0; i < 4; i++) {
     602           0 :       storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
     603           0 :       storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
     604           0 :       storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
     605           0 :       storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
     606           0 :   }
     607           0 : }
     608             : 
     609             : /*
     610             :  * ----------------------------------------------------------------------------
     611             :  * hash8_avx512
     612             :  * ----------------------------------------------------------------------------
     613             :  */
     614             : 
     615     3938606 : INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
     616     3938606 :   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
     617     3938606 :   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
     618     3938606 :   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
     619     3938606 :   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
     620     3938606 :   v[0] = add_256(v[0], v[4]);
     621     3938606 :   v[1] = add_256(v[1], v[5]);
     622     3938606 :   v[2] = add_256(v[2], v[6]);
     623     3938606 :   v[3] = add_256(v[3], v[7]);
     624     3938606 :   v[12] = xor_256(v[12], v[0]);
     625     3938606 :   v[13] = xor_256(v[13], v[1]);
     626     3938606 :   v[14] = xor_256(v[14], v[2]);
     627     3938606 :   v[15] = xor_256(v[15], v[3]);
     628     3938606 :   v[12] = rot16_256(v[12]);
     629     3938606 :   v[13] = rot16_256(v[13]);
     630     3938606 :   v[14] = rot16_256(v[14]);
     631     3938606 :   v[15] = rot16_256(v[15]);
     632     3938606 :   v[8] = add_256(v[8], v[12]);
     633     3938606 :   v[9] = add_256(v[9], v[13]);
     634     3938606 :   v[10] = add_256(v[10], v[14]);
     635     3938606 :   v[11] = add_256(v[11], v[15]);
     636     3938606 :   v[4] = xor_256(v[4], v[8]);
     637     3938606 :   v[5] = xor_256(v[5], v[9]);
     638     3938606 :   v[6] = xor_256(v[6], v[10]);
     639     3938606 :   v[7] = xor_256(v[7], v[11]);
     640     3938606 :   v[4] = rot12_256(v[4]);
     641     3938606 :   v[5] = rot12_256(v[5]);
     642     3938606 :   v[6] = rot12_256(v[6]);
     643     3938606 :   v[7] = rot12_256(v[7]);
     644     3938606 :   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
     645     3938606 :   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
     646     3938606 :   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
     647     3938606 :   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
     648     3938606 :   v[0] = add_256(v[0], v[4]);
     649     3938606 :   v[1] = add_256(v[1], v[5]);
     650     3938606 :   v[2] = add_256(v[2], v[6]);
     651     3938606 :   v[3] = add_256(v[3], v[7]);
     652     3938606 :   v[12] = xor_256(v[12], v[0]);
     653     3938606 :   v[13] = xor_256(v[13], v[1]);
     654     3938606 :   v[14] = xor_256(v[14], v[2]);
     655     3938606 :   v[15] = xor_256(v[15], v[3]);
     656     3938606 :   v[12] = rot8_256(v[12]);
     657     3938606 :   v[13] = rot8_256(v[13]);
     658     3938606 :   v[14] = rot8_256(v[14]);
     659     3938606 :   v[15] = rot8_256(v[15]);
     660     3938606 :   v[8] = add_256(v[8], v[12]);
     661     3938606 :   v[9] = add_256(v[9], v[13]);
     662     3938606 :   v[10] = add_256(v[10], v[14]);
     663     3938606 :   v[11] = add_256(v[11], v[15]);
     664     3938606 :   v[4] = xor_256(v[4], v[8]);
     665     3938606 :   v[5] = xor_256(v[5], v[9]);
     666     3938606 :   v[6] = xor_256(v[6], v[10]);
     667     3938606 :   v[7] = xor_256(v[7], v[11]);
     668     3938606 :   v[4] = rot7_256(v[4]);
     669     3938606 :   v[5] = rot7_256(v[5]);
     670     3938606 :   v[6] = rot7_256(v[6]);
     671     3938606 :   v[7] = rot7_256(v[7]);
     672             : 
     673     3938606 :   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
     674     3938606 :   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
     675     3938606 :   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
     676     3938606 :   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
     677     3938606 :   v[0] = add_256(v[0], v[5]);
     678     3938606 :   v[1] = add_256(v[1], v[6]);
     679     3938606 :   v[2] = add_256(v[2], v[7]);
     680     3938606 :   v[3] = add_256(v[3], v[4]);
     681     3938606 :   v[15] = xor_256(v[15], v[0]);
     682     3938606 :   v[12] = xor_256(v[12], v[1]);
     683     3938606 :   v[13] = xor_256(v[13], v[2]);
     684     3938606 :   v[14] = xor_256(v[14], v[3]);
     685     3938606 :   v[15] = rot16_256(v[15]);
     686     3938606 :   v[12] = rot16_256(v[12]);
     687     3938606 :   v[13] = rot16_256(v[13]);
     688     3938606 :   v[14] = rot16_256(v[14]);
     689     3938606 :   v[10] = add_256(v[10], v[15]);
     690     3938606 :   v[11] = add_256(v[11], v[12]);
     691     3938606 :   v[8] = add_256(v[8], v[13]);
     692     3938606 :   v[9] = add_256(v[9], v[14]);
     693     3938606 :   v[5] = xor_256(v[5], v[10]);
     694     3938606 :   v[6] = xor_256(v[6], v[11]);
     695     3938606 :   v[7] = xor_256(v[7], v[8]);
     696     3938606 :   v[4] = xor_256(v[4], v[9]);
     697     3938606 :   v[5] = rot12_256(v[5]);
     698     3938606 :   v[6] = rot12_256(v[6]);
     699     3938606 :   v[7] = rot12_256(v[7]);
     700     3938606 :   v[4] = rot12_256(v[4]);
     701     3938606 :   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
     702     3938606 :   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
     703     3938606 :   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
     704     3938606 :   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
     705     3938606 :   v[0] = add_256(v[0], v[5]);
     706     3938606 :   v[1] = add_256(v[1], v[6]);
     707     3938606 :   v[2] = add_256(v[2], v[7]);
     708     3938606 :   v[3] = add_256(v[3], v[4]);
     709     3938606 :   v[15] = xor_256(v[15], v[0]);
     710     3938606 :   v[12] = xor_256(v[12], v[1]);
     711     3938606 :   v[13] = xor_256(v[13], v[2]);
     712     3938606 :   v[14] = xor_256(v[14], v[3]);
     713     3938606 :   v[15] = rot8_256(v[15]);
     714     3938606 :   v[12] = rot8_256(v[12]);
     715     3938606 :   v[13] = rot8_256(v[13]);
     716     3938606 :   v[14] = rot8_256(v[14]);
     717     3938606 :   v[10] = add_256(v[10], v[15]);
     718     3938606 :   v[11] = add_256(v[11], v[12]);
     719     3938606 :   v[8] = add_256(v[8], v[13]);
     720     3938606 :   v[9] = add_256(v[9], v[14]);
     721     3938606 :   v[5] = xor_256(v[5], v[10]);
     722     3938606 :   v[6] = xor_256(v[6], v[11]);
     723     3938606 :   v[7] = xor_256(v[7], v[8]);
     724     3938606 :   v[4] = xor_256(v[4], v[9]);
     725     3938606 :   v[5] = rot7_256(v[5]);
     726     3938606 :   v[6] = rot7_256(v[6]);
     727     3938606 :   v[7] = rot7_256(v[7]);
     728     3938606 :   v[4] = rot7_256(v[4]);
     729     3938606 : }
     730             : 
     731     1191474 : INLINE void transpose_vecs_256(__m256i vecs[8]) {
     732             :   // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
     733             :   // is 22/33/66/77.
     734     1191474 :   __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
     735     1191474 :   __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
     736     1191474 :   __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
     737     1191474 :   __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
     738     1191474 :   __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
     739     1191474 :   __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
     740     1191474 :   __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
     741     1191474 :   __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
     742             : 
     743             :   // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
     744             :   // 11/33.
     745     1191474 :   __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
     746     1191474 :   __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
     747     1191474 :   __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
     748     1191474 :   __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
     749     1191474 :   __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
     750     1191474 :   __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
     751     1191474 :   __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
     752     1191474 :   __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
     753             : 
     754             :   // Interleave 128-bit lanes.
     755     1191474 :   vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
     756     1191474 :   vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
     757     1191474 :   vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
     758     1191474 :   vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
     759     1191474 :   vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
     760     1191474 :   vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
     761     1191474 :   vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
     762     1191474 :   vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
     763     1191474 : }
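                     :
                     : // The three interleave stages (32-bit, 64-bit, then 128-bit) implement an
                     : // 8x8 transpose of 32-bit words: afterwards vecs[i] holds word i of all
                     : // eight original vectors.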
     764             : 
     765             : INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
     766      562658 :                                 size_t block_offset, __m256i out[16]) {
     767      562658 :   out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
     768      562658 :   out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
     769      562658 :   out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
     770      562658 :   out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
     771      562658 :   out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
     772      562658 :   out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
     773      562658 :   out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
     774      562658 :   out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
     775      562658 :   out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
     776      562658 :   out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
     777      562658 :   out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
     778      562658 :   out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
     779      562658 :   out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
     780      562658 :   out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
     781      562658 :   out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
     782      562658 :   out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
     783     5063922 :   for (size_t i = 0; i < 8; ++i) {
     784     4501264 :     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
     785     4501264 :   }
     786      562658 :   transpose_vecs_256(&out[0]);
     787      562658 :   transpose_vecs_256(&out[8]);
     788      562658 : }
     789             : 
     790             : INLINE void load_counters8(uint64_t counter, bool increment_counter,
     791       66158 :                            __m256i *out_lo, __m256i *out_hi) {
     792       66158 :   int64_t mask = (increment_counter ? ~0 : 0);
     793       66158 :   __m512i mask_vec = _mm512_set1_epi64(mask);
     794       66158 :   __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
     795       66158 :   deltas = _mm512_and_si512(mask_vec, deltas);
     796       66158 :   __m512i counters =
     797       66158 :       _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
     798       66158 :   *out_lo = _mm512_cvtepi64_epi32(counters);
     799       66158 :   *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
     800       66158 : }
     801             : 
     802             : static
     803             : void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
     804             :                          const uint32_t key[8], uint64_t counter,
     805             :                          bool increment_counter, uint8_t flags,
     806       66158 :                          uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
     807       66158 :   __m256i h_vecs[8] = {
     808       66158 :       set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
     809       66158 :       set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
     810       66158 :   };
     811       66158 :   __m256i counter_low_vec, counter_high_vec;
     812       66158 :   load_counters8(counter, increment_counter, &counter_low_vec,
     813       66158 :                  &counter_high_vec);
     814       66158 :   uint8_t block_flags = flags | flags_start;
     815             : 
     816      628816 :   for (size_t block = 0; block < blocks; block++) {
     817      562658 :     if (block + 1 == blocks) {
     818       66158 :       block_flags |= flags_end;
     819       66158 :     }
     820      562658 :     __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
     821      562658 :     __m256i block_flags_vec = set1_256(block_flags);
     822      562658 :     __m256i msg_vecs[16];
     823      562658 :     transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
     824             : 
     825      562658 :     __m256i v[16] = {
     826      562658 :         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
     827      562658 :         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
     828      562658 :         set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
     829      562658 :         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     830      562658 :     };
     831      562658 :     round_fn8(v, msg_vecs, 0);
     832      562658 :     round_fn8(v, msg_vecs, 1);
     833      562658 :     round_fn8(v, msg_vecs, 2);
     834      562658 :     round_fn8(v, msg_vecs, 3);
     835      562658 :     round_fn8(v, msg_vecs, 4);
     836      562658 :     round_fn8(v, msg_vecs, 5);
     837      562658 :     round_fn8(v, msg_vecs, 6);
     838      562658 :     h_vecs[0] = xor_256(v[0], v[8]);
     839      562658 :     h_vecs[1] = xor_256(v[1], v[9]);
     840      562658 :     h_vecs[2] = xor_256(v[2], v[10]);
     841      562658 :     h_vecs[3] = xor_256(v[3], v[11]);
     842      562658 :     h_vecs[4] = xor_256(v[4], v[12]);
     843      562658 :     h_vecs[5] = xor_256(v[5], v[13]);
     844      562658 :     h_vecs[6] = xor_256(v[6], v[14]);
     845      562658 :     h_vecs[7] = xor_256(v[7], v[15]);
     846             : 
     847      562658 :     block_flags = flags;
     848      562658 :   }
     849             : 
     850       66158 :   transpose_vecs_256(h_vecs);
     851       66158 :   storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
     852       66158 :   storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
     853       66158 :   storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
     854       66158 :   storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
     855       66158 :   storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
     856       66158 :   storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
     857       66158 :   storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
     858       66158 :   storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
     859       66158 : }
     860             : 
     861             : static
     862             : void blake3_xof8_avx512(const uint32_t cv[8],
     863             :                         const uint8_t block[BLAKE3_BLOCK_LEN],
     864             :                         uint8_t block_len, uint64_t counter, uint8_t flags,
     865           0 :                         uint8_t out[8 * 64]) {
     866           0 :   __m256i h_vecs[8] = {
     867           0 :       set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]),
     868           0 :       set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]),
     869           0 :   };
     870           0 :   uint32_t block_words[16];
     871           0 :   load_block_words(block, block_words);
     872           0 :   __m256i msg_vecs[16];
     873           0 :   for (size_t i = 0; i < 16; i++) {
     874           0 :       msg_vecs[i] = set1_256(block_words[i]);
     875           0 :   }
     876           0 :   __m256i counter_low_vec, counter_high_vec;
     877           0 :   load_counters8(counter, true, &counter_low_vec, &counter_high_vec);
     878           0 :   __m256i block_len_vec = set1_256(block_len);
     879           0 :   __m256i block_flags_vec = set1_256(flags);
     880           0 :   __m256i v[16] = {
     881           0 :       h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
     882           0 :       h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
     883           0 :       set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
     884           0 :       counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     885           0 :   };
     886           0 :   round_fn8(v, msg_vecs, 0);
     887           0 :   round_fn8(v, msg_vecs, 1);
     888           0 :   round_fn8(v, msg_vecs, 2);
     889           0 :   round_fn8(v, msg_vecs, 3);
     890           0 :   round_fn8(v, msg_vecs, 4);
     891           0 :   round_fn8(v, msg_vecs, 5);
     892           0 :   round_fn8(v, msg_vecs, 6);
     893           0 :   for (size_t i = 0; i < 8; i++) {
     894           0 :       v[i] = xor_256(v[i], v[i+8]);
     895           0 :       v[i+8] = xor_256(v[i+8], h_vecs[i]);
     896           0 :   }
     897           0 :   transpose_vecs_256(&v[0]);
     898           0 :   transpose_vecs_256(&v[8]);
     899           0 :   for (size_t i = 0; i < 8; i++) {
     900           0 :       storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]);
     901           0 :       storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]);
     902           0 :   }
     903           0 : }
     904             : 
     905             : /*
     906             :  * ----------------------------------------------------------------------------
     907             :  * hash16_avx512
     908             :  * ----------------------------------------------------------------------------
     909             :  */
     910             : 
     911    19415123 : INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
     912    19415123 :   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
     913    19415123 :   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
     914    19415123 :   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
     915    19415123 :   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
     916    19415123 :   v[0] = add_512(v[0], v[4]);
     917    19415123 :   v[1] = add_512(v[1], v[5]);
     918    19415123 :   v[2] = add_512(v[2], v[6]);
     919    19415123 :   v[3] = add_512(v[3], v[7]);
     920    19415123 :   v[12] = xor_512(v[12], v[0]);
     921    19415123 :   v[13] = xor_512(v[13], v[1]);
     922    19415123 :   v[14] = xor_512(v[14], v[2]);
     923    19415123 :   v[15] = xor_512(v[15], v[3]);
     924    19415123 :   v[12] = rot16_512(v[12]);
     925    19415123 :   v[13] = rot16_512(v[13]);
     926    19415123 :   v[14] = rot16_512(v[14]);
     927    19415123 :   v[15] = rot16_512(v[15]);
     928    19415123 :   v[8] = add_512(v[8], v[12]);
     929    19415123 :   v[9] = add_512(v[9], v[13]);
     930    19415123 :   v[10] = add_512(v[10], v[14]);
     931    19415123 :   v[11] = add_512(v[11], v[15]);
     932    19415123 :   v[4] = xor_512(v[4], v[8]);
     933    19415123 :   v[5] = xor_512(v[5], v[9]);
     934    19415123 :   v[6] = xor_512(v[6], v[10]);
     935    19415123 :   v[7] = xor_512(v[7], v[11]);
     936    19415123 :   v[4] = rot12_512(v[4]);
     937    19415123 :   v[5] = rot12_512(v[5]);
     938    19415123 :   v[6] = rot12_512(v[6]);
     939    19415123 :   v[7] = rot12_512(v[7]);
     940    19415123 :   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
     941    19415123 :   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
     942    19415123 :   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
     943    19415123 :   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
     944    19415123 :   v[0] = add_512(v[0], v[4]);
     945    19415123 :   v[1] = add_512(v[1], v[5]);
     946    19415123 :   v[2] = add_512(v[2], v[6]);
     947    19415123 :   v[3] = add_512(v[3], v[7]);
     948    19415123 :   v[12] = xor_512(v[12], v[0]);
     949    19415123 :   v[13] = xor_512(v[13], v[1]);
     950    19415123 :   v[14] = xor_512(v[14], v[2]);
     951    19415123 :   v[15] = xor_512(v[15], v[3]);
     952    19415123 :   v[12] = rot8_512(v[12]);
     953    19415123 :   v[13] = rot8_512(v[13]);
     954    19415123 :   v[14] = rot8_512(v[14]);
     955    19415123 :   v[15] = rot8_512(v[15]);
     956    19415123 :   v[8] = add_512(v[8], v[12]);
     957    19415123 :   v[9] = add_512(v[9], v[13]);
     958    19415123 :   v[10] = add_512(v[10], v[14]);
     959    19415123 :   v[11] = add_512(v[11], v[15]);
     960    19415123 :   v[4] = xor_512(v[4], v[8]);
     961    19415123 :   v[5] = xor_512(v[5], v[9]);
     962    19415123 :   v[6] = xor_512(v[6], v[10]);
     963    19415123 :   v[7] = xor_512(v[7], v[11]);
     964    19415123 :   v[4] = rot7_512(v[4]);
     965    19415123 :   v[5] = rot7_512(v[5]);
     966    19415123 :   v[6] = rot7_512(v[6]);
     967    19415123 :   v[7] = rot7_512(v[7]);
     968             : 
     969    19415123 :   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
     970    19415123 :   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
     971    19415123 :   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
     972    19415123 :   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
     973    19415123 :   v[0] = add_512(v[0], v[5]);
     974    19415123 :   v[1] = add_512(v[1], v[6]);
     975    19415123 :   v[2] = add_512(v[2], v[7]);
     976    19415123 :   v[3] = add_512(v[3], v[4]);
     977    19415123 :   v[15] = xor_512(v[15], v[0]);
     978    19415123 :   v[12] = xor_512(v[12], v[1]);
     979    19415123 :   v[13] = xor_512(v[13], v[2]);
     980    19415123 :   v[14] = xor_512(v[14], v[3]);
     981    19415123 :   v[15] = rot16_512(v[15]);
     982    19415123 :   v[12] = rot16_512(v[12]);
     983    19415123 :   v[13] = rot16_512(v[13]);
     984    19415123 :   v[14] = rot16_512(v[14]);
     985    19415123 :   v[10] = add_512(v[10], v[15]);
     986    19415123 :   v[11] = add_512(v[11], v[12]);
     987    19415123 :   v[8] = add_512(v[8], v[13]);
     988    19415123 :   v[9] = add_512(v[9], v[14]);
     989    19415123 :   v[5] = xor_512(v[5], v[10]);
     990    19415123 :   v[6] = xor_512(v[6], v[11]);
     991    19415123 :   v[7] = xor_512(v[7], v[8]);
     992    19415123 :   v[4] = xor_512(v[4], v[9]);
     993    19415123 :   v[5] = rot12_512(v[5]);
     994    19415123 :   v[6] = rot12_512(v[6]);
     995    19415123 :   v[7] = rot12_512(v[7]);
     996    19415123 :   v[4] = rot12_512(v[4]);
     997    19415123 :   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
     998    19415123 :   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
     999    19415123 :   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
    1000    19415123 :   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
    1001    19415123 :   v[0] = add_512(v[0], v[5]);
    1002    19415123 :   v[1] = add_512(v[1], v[6]);
    1003    19415123 :   v[2] = add_512(v[2], v[7]);
    1004    19415123 :   v[3] = add_512(v[3], v[4]);
    1005    19415123 :   v[15] = xor_512(v[15], v[0]);
    1006    19415123 :   v[12] = xor_512(v[12], v[1]);
    1007    19415123 :   v[13] = xor_512(v[13], v[2]);
    1008    19415123 :   v[14] = xor_512(v[14], v[3]);
    1009    19415123 :   v[15] = rot8_512(v[15]);
    1010    19415123 :   v[12] = rot8_512(v[12]);
    1011    19415123 :   v[13] = rot8_512(v[13]);
    1012    19415123 :   v[14] = rot8_512(v[14]);
    1013    19415123 :   v[10] = add_512(v[10], v[15]);
    1014    19415123 :   v[11] = add_512(v[11], v[12]);
    1015    19415123 :   v[8] = add_512(v[8], v[13]);
    1016    19415123 :   v[9] = add_512(v[9], v[14]);
    1017    19415123 :   v[5] = xor_512(v[5], v[10]);
    1018    19415123 :   v[6] = xor_512(v[6], v[11]);
    1019    19415123 :   v[7] = xor_512(v[7], v[8]);
    1020    19415123 :   v[4] = xor_512(v[4], v[9]);
    1021    19415123 :   v[5] = rot7_512(v[5]);
    1022    19415123 :   v[6] = rot7_512(v[6]);
    1023    19415123 :   v[7] = rot7_512(v[7]);
    1024    19415123 :   v[4] = rot7_512(v[4]);
    1025    19415123 : }
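                      : 
                      : // Editorial sketch: one round_fn16 call applies the BLAKE3 G function eight
                      : // times (four column mixes, then four diagonal mixes) to 16 independent
                      : // states, one per 32-bit lane. For reference, a single scalar G application
                      : // is shown below; rotr32_ref and g_ref are hypothetical names used only for
                      : // illustration and are not part of this file.
                      : INLINE uint32_t rotr32_ref(uint32_t x, unsigned n) {
                      :   return (x >> n) | (x << (32 - n)); // rotate right, 0 < n < 32
                      : }
                      : INLINE void g_ref(uint32_t v[16], size_t a, size_t b, size_t c, size_t d,
                      :                   uint32_t mx, uint32_t my) {
                      :   v[a] = v[a] + v[b] + mx; v[d] = rotr32_ref(v[d] ^ v[a], 16);
                      :   v[c] = v[c] + v[d];      v[b] = rotr32_ref(v[b] ^ v[c], 12);
                      :   v[a] = v[a] + v[b] + my; v[d] = rotr32_ref(v[d] ^ v[a], 8);
                      :   v[c] = v[c] + v[d];      v[b] = rotr32_ref(v[b] ^ v[c], 7);
                      : }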
    1026             : 
    1027             : // 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
    1028             : #define LO_IMM8 0x88
    1029             : 
    1030    49130448 : INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
    1031    49130448 :   return _mm512_shuffle_i32x4(a, b, LO_IMM8);
    1032    49130448 : }
    1033             : 
    1034             : // 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
    1035             : #define HI_IMM8 0xdd
    1036             : 
    1037    49130448 : INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
    1038    49130448 :   return _mm512_shuffle_i32x4(a, b, HI_IMM8);
    1039    49130448 : }
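                      : 
                      : // Editorial note: _mm512_shuffle_i32x4 selects one 128-bit lane per 2-bit
                      : // field of the immediate; the two low fields index into a, the two high
                      : // fields into b. With a = a0/a1/a2/a3 and b = b0/b1/b2/b3:
                      : //   unpack_lo_128(a, b) -> a0/a2/b0/b2   (0x88 = fields 0,2,0,2)
                      : //   unpack_hi_128(a, b) -> a1/a3/b1/b3   (0xdd = fields 1,3,1,3)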
    1040             : 
    1041     3070653 : INLINE void transpose_vecs_512(__m512i vecs[16]) {
    1042             :   // Interleave 32-bit lanes. The _0 unpack is lanes
    1043             :   // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
    1044             :   // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
    1045     3070653 :   __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
    1046     3070653 :   __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
    1047     3070653 :   __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
    1048     3070653 :   __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
    1049     3070653 :   __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
    1050     3070653 :   __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
    1051     3070653 :   __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
    1052     3070653 :   __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
    1053     3070653 :   __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
    1054     3070653 :   __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
    1055     3070653 :   __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
    1056     3070653 :   __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
    1057     3070653 :   __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
    1058     3070653 :   __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
    1059     3070653 :   __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
    1060     3070653 :   __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
    1061             : 
    1062             :   // Interleave 64-bit lanes. The _0 unpack is lanes
    1063             :   // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
    1064             :   // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
    1065             :   // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
    1066             :   // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
    1067     3070653 :   __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
    1068     3070653 :   __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
    1069     3070653 :   __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
    1070     3070653 :   __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
    1071     3070653 :   __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
    1072     3070653 :   __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
    1073     3070653 :   __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
    1074     3070653 :   __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
    1075     3070653 :   __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
    1076     3070653 :   __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
    1077     3070653 :   __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
    1078     3070653 :   __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
    1079     3070653 :   __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
    1080     3070653 :   __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
    1081     3070653 :   __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
    1082     3070653 :   __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
    1083             : 
    1084             :   // Interleave 128-bit lanes. The _0 unpack is
    1085             :   // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
    1086             :   // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
    1087     3070653 :   __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
    1088     3070653 :   __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
    1089     3070653 :   __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
    1090     3070653 :   __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
    1091     3070653 :   __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
    1092     3070653 :   __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
    1093     3070653 :   __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
    1094     3070653 :   __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
    1095     3070653 :   __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
    1096     3070653 :   __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
    1097     3070653 :   __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
    1098     3070653 :   __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
    1099     3070653 :   __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
    1100     3070653 :   __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
    1101     3070653 :   __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
    1102     3070653 :   __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
    1103             : 
    1104             :   // Interleave 128-bit lanes again for the final outputs.
    1105     3070653 :   vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
    1106     3070653 :   vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
    1107     3070653 :   vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
    1108     3070653 :   vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
    1109     3070653 :   vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
    1110     3070653 :   vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
    1111     3070653 :   vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
    1112     3070653 :   vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
    1113     3070653 :   vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
    1114     3070653 :   vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
    1115     3070653 :   vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
    1116     3070653 :   vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
    1117     3070653 :   vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
    1118     3070653 :   vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
    1119     3070653 :   vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
    1120     3070653 :   vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
    1121     3070653 : }
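                      : 
                      : // Editorial sketch: the net effect of the three interleave passes above is a
                      : // plain 16x16 transpose of 32-bit words. A scalar reference, useful for
                      : // testing the vector path (transpose16x16_ref is a hypothetical name, not
                      : // part of this file):
                      : INLINE void transpose16x16_ref(uint32_t m[16][16]) {
                      :   for (size_t i = 0; i < 16; i++) {
                      :     for (size_t j = i + 1; j < 16; j++) {
                      :       uint32_t t = m[i][j]; // swap across the diagonal
                      :       m[i][j] = m[j][i];
                      :       m[j][i] = t;
                      :     }
                      :   }
                      : }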
    1122             : 
    1123             : INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
    1124     2772979 :                                  size_t block_offset, __m512i out[16]) {
    1125     2772979 :   out[0] = loadu_512(&inputs[0][block_offset]);
    1126     2772979 :   out[1] = loadu_512(&inputs[1][block_offset]);
    1127     2772979 :   out[2] = loadu_512(&inputs[2][block_offset]);
    1128     2772979 :   out[3] = loadu_512(&inputs[3][block_offset]);
    1129     2772979 :   out[4] = loadu_512(&inputs[4][block_offset]);
    1130     2772979 :   out[5] = loadu_512(&inputs[5][block_offset]);
    1131     2772979 :   out[6] = loadu_512(&inputs[6][block_offset]);
    1132     2772979 :   out[7] = loadu_512(&inputs[7][block_offset]);
    1133     2772979 :   out[8] = loadu_512(&inputs[8][block_offset]);
    1134     2772979 :   out[9] = loadu_512(&inputs[9][block_offset]);
    1135     2772979 :   out[10] = loadu_512(&inputs[10][block_offset]);
    1136     2772979 :   out[11] = loadu_512(&inputs[11][block_offset]);
    1137     2772979 :   out[12] = loadu_512(&inputs[12][block_offset]);
    1138     2772979 :   out[13] = loadu_512(&inputs[13][block_offset]);
    1139     2772979 :   out[14] = loadu_512(&inputs[14][block_offset]);
    1140     2772979 :   out[15] = loadu_512(&inputs[15][block_offset]);
    1141    47140643 :   for (size_t i = 0; i < 16; ++i) {
    1142    44367664 :     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
    1143    44367664 :   }
    1144     2772979 :   transpose_vecs_512(out);
    1145     2772979 : }
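                      : 
                      : // Editorial note: the loop above prefetches 256 bytes (four blocks) ahead in
                      : // each input stream before the transpose, hiding load latency for upcoming
                      : // iterations.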
    1146             : 
    1147             : INLINE void load_counters16(uint64_t counter, bool increment_counter,
    1148      297674 :                             __m512i *out_lo, __m512i *out_hi) {
    1149      297674 :   const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
    1150      297674 :   const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    1151      297674 :   const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
    1152      297674 :   const __m512i low_words = _mm512_add_epi32(
    1153      297674 :     _mm512_set1_epi32((int32_t)counter),
    1154      297674 :     masked_deltas);
     1155             :   // The carry bit is 1 exactly when the high bit of the word was 1 before the
     1156             :   // addition and 0 after it (valid since each per-lane delta is at most 15).
    1157             :   // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
    1158             :   // compute the carry bits here, and originally we did, but that intrinsic is
    1159             :   // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
    1160      297674 :   const __m512i carries = _mm512_srli_epi32(
    1161      297674 :     _mm512_andnot_si512(
    1162      297674 :         low_words, // 0 after (gets inverted by andnot)
    1163      297674 :         _mm512_set1_epi32((int32_t)counter)), // and 1 before
    1164      297674 :     31);
    1165      297674 :   const __m512i high_words = _mm512_add_epi32(
    1166      297674 :     _mm512_set1_epi32((int32_t)(counter >> 32)),
    1167      297674 :     carries);
    1168      297674 :   *out_lo = low_words;
    1169      297674 :   *out_hi = high_words;
    1170      297674 : }
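                      : 
                      : // Editorial sketch: per lane i the code above computes counter +
                      : // (increment_counter ? i : 0) split into 32-bit halves, without a 64-bit
                      : // add. A scalar equivalent of the carry trick (counter_lane_ref is a
                      : // hypothetical name, not part of this file):
                      : INLINE void counter_lane_ref(uint64_t counter, uint32_t delta,
                      :                              uint32_t *lo, uint32_t *hi) {
                      :   uint32_t before = (uint32_t)counter;
                      :   uint32_t after = before + delta;          // delta is at most 15 here
                      :   uint32_t carry = (~after & before) >> 31; // high bit went 1 -> 0: wrapped
                      :   *lo = after;
                      :   *hi = (uint32_t)(counter >> 32) + carry;
                      : }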
    1171             : 
    1172             : static
    1173             : void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
    1174             :                           const uint32_t key[8], uint64_t counter,
    1175             :                           bool increment_counter, uint8_t flags,
    1176             :                           uint8_t flags_start, uint8_t flags_end,
    1177      297064 :                           uint8_t *out) {
    1178      297064 :   __m512i h_vecs[8] = {
    1179      297064 :       set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
    1180      297064 :       set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
    1181      297064 :   };
    1182      297064 :   __m512i counter_low_vec, counter_high_vec;
    1183      297064 :   load_counters16(counter, increment_counter, &counter_low_vec,
    1184      297064 :                   &counter_high_vec);
    1185      297064 :   uint8_t block_flags = flags | flags_start;
    1186             : 
    1187     3070043 :   for (size_t block = 0; block < blocks; block++) {
    1188     2772979 :     if (block + 1 == blocks) {
    1189      297064 :       block_flags |= flags_end;
    1190      297064 :     }
    1191     2772979 :     __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
    1192     2772979 :     __m512i block_flags_vec = set1_512(block_flags);
    1193     2772979 :     __m512i msg_vecs[16];
    1194     2772979 :     transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
    1195             : 
    1196     2772979 :     __m512i v[16] = {
    1197     2772979 :         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
    1198     2772979 :         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
    1199     2772979 :         set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
    1200     2772979 :         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
    1201     2772979 :     };
    1202     2772979 :     round_fn16(v, msg_vecs, 0);
    1203     2772979 :     round_fn16(v, msg_vecs, 1);
    1204     2772979 :     round_fn16(v, msg_vecs, 2);
    1205     2772979 :     round_fn16(v, msg_vecs, 3);
    1206     2772979 :     round_fn16(v, msg_vecs, 4);
    1207     2772979 :     round_fn16(v, msg_vecs, 5);
    1208     2772979 :     round_fn16(v, msg_vecs, 6);
    1209     2772979 :     h_vecs[0] = xor_512(v[0], v[8]);
    1210     2772979 :     h_vecs[1] = xor_512(v[1], v[9]);
    1211     2772979 :     h_vecs[2] = xor_512(v[2], v[10]);
    1212     2772979 :     h_vecs[3] = xor_512(v[3], v[11]);
    1213     2772979 :     h_vecs[4] = xor_512(v[4], v[12]);
    1214     2772979 :     h_vecs[5] = xor_512(v[5], v[13]);
    1215     2772979 :     h_vecs[6] = xor_512(v[6], v[14]);
    1216     2772979 :     h_vecs[7] = xor_512(v[7], v[15]);
    1217             : 
    1218     2772979 :     block_flags = flags;
    1219     2772979 :   }
    1220             : 
    1221             :   // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
    1222             :   // state vectors. Pad the matrix with zeros. After transposition, store the
    1223             :   // lower half of each vector.
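                      :   // ((__mmask8)-1 below enables all eight 32-bit lanes, so each masked store
                      :   // is simply an unaligned 32-byte store of the low half of the vector.)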
    1224      297064 :   __m512i padded[16] = {
    1225      297064 :       h_vecs[0],   h_vecs[1],   h_vecs[2],   h_vecs[3],
    1226      297064 :       h_vecs[4],   h_vecs[5],   h_vecs[6],   h_vecs[7],
    1227      297064 :       set1_512(0), set1_512(0), set1_512(0), set1_512(0),
    1228      297064 :       set1_512(0), set1_512(0), set1_512(0), set1_512(0),
    1229      297064 :   };
    1230      297064 :   transpose_vecs_512(padded);
    1231      297064 :   _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
    1232      297064 :   _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
    1233      297064 :   _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
    1234      297064 :   _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
    1235      297064 :   _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
    1236      297064 :   _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
    1237      297064 :   _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
    1238      297064 :   _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
    1239      297064 :   _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
    1240      297064 :   _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
    1241      297064 :   _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
    1242      297064 :   _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
    1243      297064 :   _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
    1244      297064 :   _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
    1245      297064 :   _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
    1246      297064 :   _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
    1247      297064 : }
    1248             : 
    1249             : static
    1250             : void blake3_xof16_avx512(const uint32_t cv[8],
     1251             :                          const uint8_t block[BLAKE3_BLOCK_LEN],
     1252             :                          uint8_t block_len, uint64_t counter, uint8_t flags,
     1253         610 :                          uint8_t out[16 * 64]) {
    1254         610 :   __m512i h_vecs[8] = {
    1255         610 :       set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
    1256         610 :       set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
    1257         610 :   };
    1258         610 :   uint32_t block_words[16];
    1259         610 :   load_block_words(block, block_words);
    1260         610 :   __m512i msg_vecs[16];
    1261       10370 :   for (size_t i = 0; i < 16; i++) {
    1262        9760 :       msg_vecs[i] = set1_512(block_words[i]);
    1263        9760 :   }
    1264         610 :   __m512i counter_low_vec, counter_high_vec;
    1265         610 :   load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
    1266         610 :   __m512i block_len_vec = set1_512(block_len);
    1267         610 :   __m512i block_flags_vec = set1_512(flags);
    1268         610 :   __m512i v[16] = {
    1269         610 :       h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
    1270         610 :       h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
    1271         610 :       set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
    1272         610 :       counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
    1273         610 :   };
    1274         610 :   round_fn16(v, msg_vecs, 0);
    1275         610 :   round_fn16(v, msg_vecs, 1);
    1276         610 :   round_fn16(v, msg_vecs, 2);
    1277         610 :   round_fn16(v, msg_vecs, 3);
    1278         610 :   round_fn16(v, msg_vecs, 4);
    1279         610 :   round_fn16(v, msg_vecs, 5);
    1280         610 :   round_fn16(v, msg_vecs, 6);
    1281        5490 :   for (size_t i = 0; i < 8; i++) {
    1282        4880 :       v[i] = xor_512(v[i], v[i+8]);
    1283        4880 :       v[i+8] = xor_512(v[i+8], h_vecs[i]);
    1284        4880 :   }
    1285         610 :   transpose_vecs_512(&v[0]);
    1286       10370 :   for (size_t i = 0; i < 16; i++) {
    1287        9760 :       storeu_512(v[i], &out[i * sizeof(__m512i)]);
    1288        9760 :   }
    1289         610 : }
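                      : 
                      : // Editorial note: unlike the 8-wide XOF above, the 16-wide state already
                      : // forms a full 16x16 word matrix, so a single transpose_vecs_512 pass leaves
                      : // each 64-byte output block contiguous in v[i].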
    1290             : 
    1291             : /*
    1292             :  * ----------------------------------------------------------------------------
    1293             :  * hash_many_avx512
    1294             :  * ----------------------------------------------------------------------------
    1295             :  */
    1296             : 
    1297             : INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
    1298             :                             const uint32_t key[8], uint64_t counter,
    1299             :                             uint8_t flags, uint8_t flags_start,
    1300      530092 :                             uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
    1301      530092 :   uint32_t cv[8];
    1302      530092 :   memcpy(cv, key, BLAKE3_KEY_LEN);
    1303      530092 :   uint8_t block_flags = flags | flags_start;
    1304     5040944 :   while (blocks > 0) {
    1305     4510852 :     if (blocks == 1) {
    1306      530092 :       block_flags |= flags_end;
    1307      530092 :     }
    1308     4510852 :     blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
    1309     4510852 :                                     block_flags);
    1310     4510852 :     input = &input[BLAKE3_BLOCK_LEN];
    1311     4510852 :     blocks -= 1;
    1312     4510852 :     block_flags = flags;
    1313     4510852 :   }
    1314      530092 :   memcpy(out, cv, BLAKE3_OUT_LEN);
    1315      530092 : }
    1316             : 
    1317             : void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
    1318             :                              size_t blocks, const uint32_t key[8],
    1319             :                              uint64_t counter, bool increment_counter,
    1320             :                              uint8_t flags, uint8_t flags_start,
    1321      760622 :                              uint8_t flags_end, uint8_t *out) {
    1322     1057686 :   while (num_inputs >= 16) {
    1323      297064 :     blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
    1324      297064 :                          flags_start, flags_end, out);
    1325      297064 :     if (increment_counter) {
    1326      165061 :       counter += 16;
    1327      165061 :     }
    1328      297064 :     inputs += 16;
    1329      297064 :     num_inputs -= 16;
    1330      297064 :     out = &out[16 * BLAKE3_OUT_LEN];
    1331      297064 :   }
    1332      826780 :   while (num_inputs >= 8) {
    1333       66158 :     blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
    1334       66158 :                         flags_start, flags_end, out);
    1335       66158 :     if (increment_counter) {
    1336       33100 :       counter += 8;
    1337       33100 :     }
    1338       66158 :     inputs += 8;
    1339       66158 :     num_inputs -= 8;
    1340       66158 :     out = &out[8 * BLAKE3_OUT_LEN];
    1341       66158 :   }
    1342      892976 :   while (num_inputs >= 4) {
    1343      132354 :     blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
    1344      132354 :                         flags_start, flags_end, out);
    1345      132354 :     if (increment_counter) {
    1346       66196 :       counter += 4;
    1347       66196 :     }
    1348      132354 :     inputs += 4;
    1349      132354 :     num_inputs -= 4;
    1350      132354 :     out = &out[4 * BLAKE3_OUT_LEN];
    1351      132354 :   }
    1352     1290714 :   while (num_inputs > 0) {
    1353      530092 :     hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
    1354      530092 :                     flags_end, out);
    1355      530092 :     if (increment_counter) {
    1356      265384 :       counter += 1;
    1357      265384 :     }
    1358      530092 :     inputs += 1;
    1359      530092 :     num_inputs -= 1;
    1360      530092 :     out = &out[BLAKE3_OUT_LEN];
    1361      530092 :   }
    1362      760622 : }
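                      : 
                      : // Editorial sketch: a typical call shape for the dispatcher above, hashing
                      : // three full 1024-byte chunks (16 blocks each) into three 32-byte chaining
                      : // values; with increment_counter set, input i uses counter + i (c0/c1/c2 and
                      : // cvs are hypothetical buffers, for illustration only):
                      : //
                      : //   const uint8_t *chunks[3] = { c0, c1, c2 };
                      : //   uint8_t cvs[3 * BLAKE3_OUT_LEN];
                      : //   blake3_hash_many_avx512(chunks, 3, 16, key, counter, true,
                      : //                           0, CHUNK_START, CHUNK_END, cvs);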
    1363             : 
    1364             : void blake3_xof_many_avx512(const uint32_t cv[8],
    1365             :                             const uint8_t block[BLAKE3_BLOCK_LEN],
    1366             :                             uint8_t block_len, uint64_t counter, uint8_t flags,
    1367         305 :                             uint8_t* out, size_t outblocks) {
    1368         915 :   while (outblocks >= 16) {
    1369         610 :     blake3_xof16_avx512(cv, block, block_len, counter, flags, out);
    1370         610 :     counter += 16;
    1371         610 :     outblocks -= 16;
    1372         610 :     out += 16 * BLAKE3_BLOCK_LEN;
    1373         610 :   }
    1374         305 :   while (outblocks >= 8) {
    1375           0 :     blake3_xof8_avx512(cv, block, block_len, counter, flags, out);
    1376           0 :     counter += 8;
    1377           0 :     outblocks -= 8;
    1378           0 :     out += 8 * BLAKE3_BLOCK_LEN;
    1379           0 :   }
    1380         305 :   while (outblocks >= 4) {
    1381           0 :     blake3_xof4_avx512(cv, block, block_len, counter, flags, out);
    1382           0 :     counter += 4;
    1383           0 :     outblocks -= 4;
    1384           0 :     out += 4 * BLAKE3_BLOCK_LEN;
    1385           0 :   }
    1386         305 :   while (outblocks > 0) {
    1387           0 :     blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    1388           0 :     counter += 1;
    1389           0 :     outblocks -= 1;
    1390           0 :     out += BLAKE3_BLOCK_LEN;
    1391           0 :   }
    1392         305 : }
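                      : 
                      : // Editorial note: the cascade above writes outblocks * BLAKE3_BLOCK_LEN (64)
                      : // bytes in total. Every width reuses the same cv and block and varies only
                      : // the counter, which is what makes the BLAKE3 XOF output seekable at
                      : // 64-byte granularity.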

Generated by: LCOV version 1.14