LCOV - code coverage report
Current view: top level - ballet/blake3 - blake3_avx512.c (source / functions)
Test: cov.lcov
Date: 2025-07-18 05:01:12
                 Hit    Total   Coverage
Lines:           707     1016     69.6 %
Functions:        38       53     71.7 %

          Line data    Source code
       1             : 
       2             : // Source originally from https://github.com/BLAKE3-team/BLAKE3
       3             : // From commit: c0ea395cf91d242f078c23d5f8d87eb9dd5f7b78
       4             : 
       5             : #include "blake3_impl.h"
       6             : 
       7             : #include <immintrin.h>
       8             : 
       9             : #define _mm_shuffle_ps2(a, b, c)                                               \
      10   415157360 :   (_mm_castps_si128(                                                           \
      11   415157360 :       _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
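
_mm_shuffle_ps2 builds one vector from two by picking two 32-bit lanes of a
and two of b; the float casts are free bitcasts, used only because SSE has no
two-source integer shuffle. A scalar model of the selection (the helper name
is illustrative, not part of this file):

  // Model of _mm_shuffle_ps2(a, b, imm): lanes 0-1 come from a, 2-3 from b.
  static inline void shuffle_ps2_model(const uint32_t a[4], const uint32_t b[4],
                                       unsigned imm, uint32_t out[4]) {
    out[0] = a[(imm >> 0) & 3];
    out[1] = a[(imm >> 2) & 3];
    out[2] = b[(imm >> 4) & 3];
    out[3] = b[(imm >> 6) & 3];
  }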
      12             : 
      13   190632346 : INLINE __m128i loadu_128(const uint8_t src[16]) {
      14   190632346 :   return _mm_loadu_si128((const __m128i *)src);
      15   190632346 : }
      16             : 
      17    97736576 : INLINE __m256i loadu_256(const uint8_t src[32]) {
      18    97736576 :   return _mm256_loadu_si256((const __m256i *)src);
      19    97736576 : }
      20             : 
      21           0 : INLINE __m512i loadu_512(const uint8_t src[64]) {
      22           0 :   return _mm512_loadu_si512((const __m512i *)src);
      23           0 : }
      24             : 
      25    69898414 : INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
      26    69898414 :   _mm_storeu_si128((__m128i *)dest, src);
      27    69898414 : }
      28             : 
      29     5282248 : INLINE void storeu_256(__m256i src, uint8_t dest[32]) {
      30     5282248 :   _mm256_storeu_si256((__m256i *)dest, src);
      31     5282248 : }
      32             : 
      33  2557646364 : INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
      34             : 
      35  2052468096 : INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
      36             : 
      37           0 : INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }
      38             : 
      39  1782938910 : INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
      40             : 
      41  1417180352 : INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
      42             : 
      43           0 : INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }
      44             : 
      45     7810006 : INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
      46             : 
      47    41933464 : INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
      48             : 
      49           0 : INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); }
      50             : 
      51    51894670 : INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
      52    51894670 :   return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
      53    51894670 : }
      54             : 
      55   426274394 : INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); }
      56             : 
      57   342078016 : INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); }
      58             : 
      59           0 : INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); }
      60             : 
      61   426274394 : INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }
      62             : 
      63   342078016 : INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }
      64             : 
      65           0 : INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }
      66             : 
      67   426274394 : INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }
      68             : 
      69   342078016 : INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }
      70             : 
      71           0 : INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }
      72             : 
      73   426274394 : INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }
      74             : 
      75   342078016 : INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }
      76             : 
      77           0 : INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }
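
All four rotation amounts used by BLAKE3 (16, 12, 8, 7) compile to the
AVX-512VL vprord instruction via _mm_ror_epi32 and its wider variants. On
targets without AVX-512, the same rotate has to be synthesized from two
shifts, roughly as in this sketch (not part of this file):

  // Equivalent of _mm_ror_epi32(x, c) for constant 0 < c < 32 on plain SSE2.
  static inline __m128i ror32_sse2(__m128i x, int c) {
    return _mm_or_si128(_mm_srli_epi32(x, c), _mm_slli_epi32(x, 32 - c));
  }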
      78             : 
      79             : /*
      80             :  * ----------------------------------------------------------------------------
      81             :  * compress_avx512
      82             :  * ----------------------------------------------------------------------------
      83             :  */
      84             : 
      85             : INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
      86   363262690 :                __m128i m) {
      87   363262690 :   *row0 = add_128(add_128(*row0, m), *row1);
      88   363262690 :   *row3 = xor_128(*row3, *row0);
      89   363262690 :   *row3 = rot16_128(*row3);
      90   363262690 :   *row2 = add_128(*row2, *row3);
      91   363262690 :   *row1 = xor_128(*row1, *row2);
      92   363262690 :   *row1 = rot12_128(*row1);
      93   363262690 : }
      94             : 
      95             : INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
      96   363262690 :                __m128i m) {
      97   363262690 :   *row0 = add_128(add_128(*row0, m), *row1);
      98   363262690 :   *row3 = xor_128(*row3, *row0);
      99   363262690 :   *row3 = rot8_128(*row3);
     100   363262690 :   *row2 = add_128(*row2, *row3);
     101   363262690 :   *row1 = xor_128(*row1, *row2);
     102   363262690 :   *row1 = rot7_128(*row1);
     103   363262690 : }
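
g1 and g2 are the two halves of one BLAKE3 G application: g1 performs the
first add/xor pair with rotations 16 and 12, g2 the second with rotations 8
and 7. Each SIMD lane carries one of the four column (or diagonal) G's. For
reference, the scalar G from the BLAKE3 specification (helper names here are
illustrative):

  static inline uint32_t rotr32(uint32_t w, uint32_t c) {
    return (w >> c) | (w << (32 - c));
  }

  // One full scalar G; g1+g2 above compute four of these per call, one per
  // 32-bit lane of the row vectors.
  static inline void g_scalar(uint32_t state[16], size_t a, size_t b,
                              size_t c, size_t d, uint32_t x, uint32_t y) {
    state[a] = state[a] + state[b] + x;
    state[d] = rotr32(state[d] ^ state[a], 16);
    state[c] = state[c] + state[d];
    state[b] = rotr32(state[b] ^ state[c], 12);
    state[a] = state[a] + state[b] + y;
    state[d] = rotr32(state[d] ^ state[a], 8);
    state[c] = state[c] + state[d];
    state[b] = rotr32(state[b] ^ state[c], 7);
  }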
     104             : 
     105             : // Note the optimization here of leaving row1 as the unrotated row, rather than
     106             : // row0. All the message loads below are adjusted to compensate for this. See
     107             : // discussion at https://github.com/sneves/blake2-avx2/pull/4
     108   181631345 : INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
     109   181631345 :   *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
     110   181631345 :   *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
     111   181631345 :   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
     112   181631345 : }
     113             : 
     114   181631345 : INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
     115   181631345 :   *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
     116   181631345 :   *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
     117   181631345 :   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
     118   181631345 : }
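
diagonalize shuffles rows 0, 2, and 3 so that each diagonal of the 4x4 state
matrix lines up in a single lane position, letting g1/g2 mix the diagonals
with the same column-wise code; undiagonalize applies the inverse shuffles.
For instance, _MM_SHUFFLE(2, 1, 0, 3) moves lane i to lane (i + 1) & 3, as in
this scalar model (illustrative only):

  // Model of _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 1, 0, 3)).
  static inline void rotate_lanes_by1(const uint32_t in[4], uint32_t out[4]) {
    for (int i = 0; i < 4; i++) out[(i + 1) & 3] = in[i];
  }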
     119             : 
     120             : INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
     121             :                          const uint8_t block[BLAKE3_BLOCK_LEN],
     122    25947335 :                          uint8_t block_len, uint64_t counter, uint8_t flags) {
     123    25947335 :   rows[0] = loadu_128((uint8_t *)&cv[0]);
     124    25947335 :   rows[1] = loadu_128((uint8_t *)&cv[4]);
     125    25947335 :   rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
     126    25947335 :   rows[3] = set4(counter_low(counter), counter_high(counter),
     127    25947335 :                  (uint32_t)block_len, (uint32_t)flags);
     128             : 
     129    25947335 :   __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
     130    25947335 :   __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
     131    25947335 :   __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
     132    25947335 :   __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
     133             : 
     134    25947335 :   __m128i t0, t1, t2, t3, tt;
     135             : 
     136             :   // Round 1. The first round permutes the message words from the original
     137             :   // input order, into the groups that get mixed in parallel.
     138    25947335 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
     139    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     140    25947335 :   t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
     141    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     142    25947335 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     143    25947335 :   t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
     144    25947335 :   t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
     145    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     146    25947335 :   t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
     147    25947335 :   t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
     148    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     149    25947335 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     150    25947335 :   m0 = t0;
     151    25947335 :   m1 = t1;
     152    25947335 :   m2 = t2;
     153    25947335 :   m3 = t3;
     154             : 
     155             :   // Round 2. This round and all following rounds apply a fixed permutation
     156             :   // to the message words from the round before.
     157    25947335 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     158    25947335 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     159    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     160    25947335 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     161    25947335 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     162    25947335 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     163    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     164    25947335 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     165    25947335 :   t2 = _mm_unpacklo_epi64(m3, m1);
     166    25947335 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     167    25947335 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     168    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     169    25947335 :   t3 = _mm_unpackhi_epi32(m1, m3);
     170    25947335 :   tt = _mm_unpacklo_epi32(m2, t3);
     171    25947335 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     172    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     173    25947335 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     174    25947335 :   m0 = t0;
     175    25947335 :   m1 = t1;
     176    25947335 :   m2 = t2;
     177    25947335 :   m3 = t3;
     178             : 
     179             :   // Round 3
     180    25947335 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     181    25947335 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     182    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     183    25947335 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     184    25947335 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     185    25947335 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     186    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     187    25947335 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     188    25947335 :   t2 = _mm_unpacklo_epi64(m3, m1);
     189    25947335 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     190    25947335 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     191    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     192    25947335 :   t3 = _mm_unpackhi_epi32(m1, m3);
     193    25947335 :   tt = _mm_unpacklo_epi32(m2, t3);
     194    25947335 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     195    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     196    25947335 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     197    25947335 :   m0 = t0;
     198    25947335 :   m1 = t1;
     199    25947335 :   m2 = t2;
     200    25947335 :   m3 = t3;
     201             : 
     202             :   // Round 4
     203    25947335 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     204    25947335 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     205    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     206    25947335 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     207    25947335 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     208    25947335 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     209    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     210    25947335 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     211    25947335 :   t2 = _mm_unpacklo_epi64(m3, m1);
     212    25947335 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     213    25947335 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     214    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     215    25947335 :   t3 = _mm_unpackhi_epi32(m1, m3);
     216    25947335 :   tt = _mm_unpacklo_epi32(m2, t3);
     217    25947335 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     218    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     219    25947335 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     220    25947335 :   m0 = t0;
     221    25947335 :   m1 = t1;
     222    25947335 :   m2 = t2;
     223    25947335 :   m3 = t3;
     224             : 
     225             :   // Round 5
     226    25947335 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     227    25947335 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     228    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     229    25947335 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     230    25947335 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     231    25947335 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     232    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     233    25947335 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     234    25947335 :   t2 = _mm_unpacklo_epi64(m3, m1);
     235    25947335 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     236    25947335 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     237    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     238    25947335 :   t3 = _mm_unpackhi_epi32(m1, m3);
     239    25947335 :   tt = _mm_unpacklo_epi32(m2, t3);
     240    25947335 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     241    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     242    25947335 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     243    25947335 :   m0 = t0;
     244    25947335 :   m1 = t1;
     245    25947335 :   m2 = t2;
     246    25947335 :   m3 = t3;
     247             : 
     248             :   // Round 6
     249    25947335 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     250    25947335 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     251    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     252    25947335 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     253    25947335 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     254    25947335 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     255    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     256    25947335 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     257    25947335 :   t2 = _mm_unpacklo_epi64(m3, m1);
     258    25947335 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     259    25947335 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     260    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     261    25947335 :   t3 = _mm_unpackhi_epi32(m1, m3);
     262    25947335 :   tt = _mm_unpacklo_epi32(m2, t3);
     263    25947335 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     264    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     265    25947335 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     266    25947335 :   m0 = t0;
     267    25947335 :   m1 = t1;
     268    25947335 :   m2 = t2;
     269    25947335 :   m3 = t3;
     270             : 
     271             :   // Round 7
     272    25947335 :   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
     273    25947335 :   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
     274    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
     275    25947335 :   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
     276    25947335 :   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
     277    25947335 :   t1 = _mm_blend_epi16(tt, t1, 0xCC);
     278    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
     279    25947335 :   diagonalize(&rows[0], &rows[2], &rows[3]);
     280    25947335 :   t2 = _mm_unpacklo_epi64(m3, m1);
     281    25947335 :   tt = _mm_blend_epi16(t2, m2, 0xC0);
     282    25947335 :   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
     283    25947335 :   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
     284    25947335 :   t3 = _mm_unpackhi_epi32(m1, m3);
     285    25947335 :   tt = _mm_unpacklo_epi32(m2, t3);
     286    25947335 :   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
     287    25947335 :   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
     288    25947335 :   undiagonalize(&rows[0], &rows[2], &rows[3]);
     289    25947335 : }
     290             : 
     291             : void fd_blake3_compress_xof_avx512(const uint32_t cv[8],
     292             :                                    const uint8_t block[BLAKE3_BLOCK_LEN],
     293             :                                    uint8_t block_len, uint64_t counter,
     294     8472496 :                                    uint8_t flags, uint8_t out[64]) {
     295     8472496 :   __m128i rows[4];
     296     8472496 :   compress_pre(rows, cv, block, block_len, counter, flags);
     297     8472496 :   storeu_128(xor_128(rows[0], rows[2]), &out[0]);
     298     8472496 :   storeu_128(xor_128(rows[1], rows[3]), &out[16]);
     299     8472496 :   storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
     300     8472496 :   storeu_128(xor_128(rows[3], loadu_128((uint8_t *)&cv[4])), &out[48]);
     301     8472496 : }
     302             : 
     303             : void fd_blake3_compress_in_place_avx512(uint32_t cv[8],
     304             :                                         const uint8_t block[BLAKE3_BLOCK_LEN],
     305             :                                         uint8_t block_len, uint64_t counter,
     306    17474839 :                                         uint8_t flags) {
     307    17474839 :   __m128i rows[4];
     308    17474839 :   compress_pre(rows, cv, block, block_len, counter, flags);
     309    17474839 :   storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
     310    17474839 :   storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
     311    17474839 : }
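
As a usage sketch: for a message of at most 64 bytes, all of BLAKE3 collapses
to a single compression of the IV with the CHUNK_START, CHUNK_END, and ROOT
flags set (flag and IV constants as defined in blake3_impl.h in the upstream
BLAKE3 sources; the wrapper below is illustrative, not part of this file, and
needs <string.h>):

  void hash_one_block(const uint8_t *msg, uint8_t msg_len /* <= 64 */,
                      uint8_t out[32]) {
    uint32_t cv[8];
    memcpy(cv, IV, sizeof(cv));
    uint8_t block[BLAKE3_BLOCK_LEN] = {0};  // zero-padded final block
    memcpy(block, msg, msg_len);
    fd_blake3_compress_in_place_avx512(cv, block, msg_len, /*counter=*/0,
                                       CHUNK_START | CHUNK_END | ROOT);
    memcpy(out, cv, 32);  // the updated cv is the 32-byte digest
  }

fd_blake3_compress_xof_avx512 is the extended-output variant: it returns all
64 bytes of the compression state instead of folding it back into cv.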
     312             : 
     313             : /*
     314             :  * ----------------------------------------------------------------------------
     315             :  * hash4_avx512
     316             :  * ----------------------------------------------------------------------------
     317             :  */
     318             : 
     319     7876463 : INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
     320     7876463 :   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
     321     7876463 :   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
     322     7876463 :   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
     323     7876463 :   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
     324     7876463 :   v[0] = add_128(v[0], v[4]);
     325     7876463 :   v[1] = add_128(v[1], v[5]);
     326     7876463 :   v[2] = add_128(v[2], v[6]);
     327     7876463 :   v[3] = add_128(v[3], v[7]);
     328     7876463 :   v[12] = xor_128(v[12], v[0]);
     329     7876463 :   v[13] = xor_128(v[13], v[1]);
     330     7876463 :   v[14] = xor_128(v[14], v[2]);
     331     7876463 :   v[15] = xor_128(v[15], v[3]);
     332     7876463 :   v[12] = rot16_128(v[12]);
     333     7876463 :   v[13] = rot16_128(v[13]);
     334     7876463 :   v[14] = rot16_128(v[14]);
     335     7876463 :   v[15] = rot16_128(v[15]);
     336     7876463 :   v[8] = add_128(v[8], v[12]);
     337     7876463 :   v[9] = add_128(v[9], v[13]);
     338     7876463 :   v[10] = add_128(v[10], v[14]);
     339     7876463 :   v[11] = add_128(v[11], v[15]);
     340     7876463 :   v[4] = xor_128(v[4], v[8]);
     341     7876463 :   v[5] = xor_128(v[5], v[9]);
     342     7876463 :   v[6] = xor_128(v[6], v[10]);
     343     7876463 :   v[7] = xor_128(v[7], v[11]);
     344     7876463 :   v[4] = rot12_128(v[4]);
     345     7876463 :   v[5] = rot12_128(v[5]);
     346     7876463 :   v[6] = rot12_128(v[6]);
     347     7876463 :   v[7] = rot12_128(v[7]);
     348     7876463 :   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
     349     7876463 :   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
     350     7876463 :   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
     351     7876463 :   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
     352     7876463 :   v[0] = add_128(v[0], v[4]);
     353     7876463 :   v[1] = add_128(v[1], v[5]);
     354     7876463 :   v[2] = add_128(v[2], v[6]);
     355     7876463 :   v[3] = add_128(v[3], v[7]);
     356     7876463 :   v[12] = xor_128(v[12], v[0]);
     357     7876463 :   v[13] = xor_128(v[13], v[1]);
     358     7876463 :   v[14] = xor_128(v[14], v[2]);
     359     7876463 :   v[15] = xor_128(v[15], v[3]);
     360     7876463 :   v[12] = rot8_128(v[12]);
     361     7876463 :   v[13] = rot8_128(v[13]);
     362     7876463 :   v[14] = rot8_128(v[14]);
     363     7876463 :   v[15] = rot8_128(v[15]);
     364     7876463 :   v[8] = add_128(v[8], v[12]);
     365     7876463 :   v[9] = add_128(v[9], v[13]);
     366     7876463 :   v[10] = add_128(v[10], v[14]);
     367     7876463 :   v[11] = add_128(v[11], v[15]);
     368     7876463 :   v[4] = xor_128(v[4], v[8]);
     369     7876463 :   v[5] = xor_128(v[5], v[9]);
     370     7876463 :   v[6] = xor_128(v[6], v[10]);
     371     7876463 :   v[7] = xor_128(v[7], v[11]);
     372     7876463 :   v[4] = rot7_128(v[4]);
     373     7876463 :   v[5] = rot7_128(v[5]);
     374     7876463 :   v[6] = rot7_128(v[6]);
     375     7876463 :   v[7] = rot7_128(v[7]);
     376             : 
     377     7876463 :   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
     378     7876463 :   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
     379     7876463 :   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
     380     7876463 :   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
     381     7876463 :   v[0] = add_128(v[0], v[5]);
     382     7876463 :   v[1] = add_128(v[1], v[6]);
     383     7876463 :   v[2] = add_128(v[2], v[7]);
     384     7876463 :   v[3] = add_128(v[3], v[4]);
     385     7876463 :   v[15] = xor_128(v[15], v[0]);
     386     7876463 :   v[12] = xor_128(v[12], v[1]);
     387     7876463 :   v[13] = xor_128(v[13], v[2]);
     388     7876463 :   v[14] = xor_128(v[14], v[3]);
     389     7876463 :   v[15] = rot16_128(v[15]);
     390     7876463 :   v[12] = rot16_128(v[12]);
     391     7876463 :   v[13] = rot16_128(v[13]);
     392     7876463 :   v[14] = rot16_128(v[14]);
     393     7876463 :   v[10] = add_128(v[10], v[15]);
     394     7876463 :   v[11] = add_128(v[11], v[12]);
     395     7876463 :   v[8] = add_128(v[8], v[13]);
     396     7876463 :   v[9] = add_128(v[9], v[14]);
     397     7876463 :   v[5] = xor_128(v[5], v[10]);
     398     7876463 :   v[6] = xor_128(v[6], v[11]);
     399     7876463 :   v[7] = xor_128(v[7], v[8]);
     400     7876463 :   v[4] = xor_128(v[4], v[9]);
     401     7876463 :   v[5] = rot12_128(v[5]);
     402     7876463 :   v[6] = rot12_128(v[6]);
     403     7876463 :   v[7] = rot12_128(v[7]);
     404     7876463 :   v[4] = rot12_128(v[4]);
     405     7876463 :   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
     406     7876463 :   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
     407     7876463 :   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
     408     7876463 :   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
     409     7876463 :   v[0] = add_128(v[0], v[5]);
     410     7876463 :   v[1] = add_128(v[1], v[6]);
     411     7876463 :   v[2] = add_128(v[2], v[7]);
     412     7876463 :   v[3] = add_128(v[3], v[4]);
     413     7876463 :   v[15] = xor_128(v[15], v[0]);
     414     7876463 :   v[12] = xor_128(v[12], v[1]);
     415     7876463 :   v[13] = xor_128(v[13], v[2]);
     416     7876463 :   v[14] = xor_128(v[14], v[3]);
     417     7876463 :   v[15] = rot8_128(v[15]);
     418     7876463 :   v[12] = rot8_128(v[12]);
     419     7876463 :   v[13] = rot8_128(v[13]);
     420     7876463 :   v[14] = rot8_128(v[14]);
     421     7876463 :   v[10] = add_128(v[10], v[15]);
     422     7876463 :   v[11] = add_128(v[11], v[12]);
     423     7876463 :   v[8] = add_128(v[8], v[13]);
     424     7876463 :   v[9] = add_128(v[9], v[14]);
     425     7876463 :   v[5] = xor_128(v[5], v[10]);
     426     7876463 :   v[6] = xor_128(v[6], v[11]);
     427     7876463 :   v[7] = xor_128(v[7], v[8]);
     428     7876463 :   v[4] = xor_128(v[4], v[9]);
     429     7876463 :   v[5] = rot7_128(v[5]);
     430     7876463 :   v[6] = rot7_128(v[6]);
     431     7876463 :   v[7] = rot7_128(v[7]);
     432     7876463 :   v[4] = rot7_128(v[4]);
     433     7876463 : }
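
round_fn4 is one full BLAKE3 round over four states at once. Unlike
compress_pre above, it never diagonalizes registers: the first half mixes the
four columns (v0,v4,v8,v12) through (v3,v7,v11,v15), and the second half
mixes the diagonals (v0,v5,v10,v15) through (v3,v4,v9,v14) simply by rotating
the index pattern. In terms of the scalar G sketched earlier:

  // One scalar BLAKE3 round with message schedule row s = MSG_SCHEDULE[r];
  // round_fn4 runs this across four lanes in parallel.
  static inline void round_scalar(uint32_t v[16], const uint32_t m[16],
                                  const uint8_t s[16]) {
    g_scalar(v, 0, 4,  8, 12, m[s[0]],  m[s[1]]);   // columns
    g_scalar(v, 1, 5,  9, 13, m[s[2]],  m[s[3]]);
    g_scalar(v, 2, 6, 10, 14, m[s[4]],  m[s[5]]);
    g_scalar(v, 3, 7, 11, 15, m[s[6]],  m[s[7]]);
    g_scalar(v, 0, 5, 10, 15, m[s[8]],  m[s[9]]);   // diagonals
    g_scalar(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
    g_scalar(v, 2, 7,  8, 13, m[s[12]], m[s[13]]);
    g_scalar(v, 3, 4,  9, 14, m[s[14]], m[s[15]]);
  }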
     434             : 
     435     4765524 : INLINE void transpose_vecs_128(__m128i vecs[4]) {
     436             :   // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
     437             :   // 22/33. Note that this doesn't split the vector into two lanes, as the
     438             :   // AVX2 counterparts do.
     439     4765524 :   __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
     440     4765524 :   __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
     441     4765524 :   __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
     442     4765524 :   __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
     443             : 
     444             :   // Interleave 64-bit lanes.
     445     4765524 :   __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
     446     4765524 :   __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
     447     4765524 :   __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
     448     4765524 :   __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
     449             : 
     450     4765524 :   vecs[0] = abcd_0;
     451     4765524 :   vecs[1] = abcd_1;
     452     4765524 :   vecs[2] = abcd_2;
     453     4765524 :   vecs[3] = abcd_3;
     454     4765524 : }
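
Net effect: a 4x4 transpose. Lane j of input vector i becomes lane i of
output vector j, which converts four per-message word streams into four
per-word message vectors (and back again for the outputs). A scalar model
(illustrative):

  static inline void transpose4x4_model(uint32_t m[4][4]) {
    for (int i = 0; i < 4; i++)
      for (int j = i + 1; j < 4; j++) {
        uint32_t t = m[i][j]; m[i][j] = m[j][i]; m[j][i] = t;
      }
  }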
     455             : 
     456             : INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
     457     1125209 :                                 size_t block_offset, __m128i out[16]) {
     458     1125209 :   out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
     459     1125209 :   out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
     460     1125209 :   out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
     461     1125209 :   out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
     462     1125209 :   out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
     463     1125209 :   out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
     464     1125209 :   out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
     465     1125209 :   out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
     466     1125209 :   out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
     467     1125209 :   out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
     468     1125209 :   out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
     469     1125209 :   out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
     470     1125209 :   out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
     471     1125209 :   out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
     472     1125209 :   out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
     473     1125209 :   out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
     474     5626045 :   for (size_t i = 0; i < 4; ++i) {
     475     4500836 :     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
     476     4500836 :   }
     477     1125209 :   transpose_vecs_128(&out[0]);
     478     1125209 :   transpose_vecs_128(&out[4]);
     479     1125209 :   transpose_vecs_128(&out[8]);
     480     1125209 :   transpose_vecs_128(&out[12]);
     481     1125209 : }
     482             : 
     483             : INLINE void load_counters4(uint64_t counter, bool increment_counter,
     484      132344 :                            __m128i *out_lo, __m128i *out_hi) {
     485      132344 :   int64_t mask = (increment_counter ? ~0 : 0);
     486      132344 :   __m256i mask_vec = _mm256_set1_epi64x(mask);
     487      132344 :   __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
     488      132344 :   deltas = _mm256_and_si256(mask_vec, deltas);
     489      132344 :   __m256i counters =
     490      132344 :       _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
     491      132344 :   *out_lo = _mm256_cvtepi64_epi32(counters);
     492      132344 :   *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
     493      132344 : }
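
In scalar terms, lane i receives counter + i when increment_counter is set
(each lane hashes its own chunk) and a plain copy of counter otherwise, split
into 32-bit halves for the state row. A model (illustrative):

  static inline void load_counters4_model(uint64_t counter, bool inc,
                                          uint32_t lo[4], uint32_t hi[4]) {
    for (uint64_t i = 0; i < 4; i++) {
      uint64_t c = counter + (inc ? i : 0);
      lo[i] = (uint32_t)c;
      hi[i] = (uint32_t)(c >> 32);
    }
  }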
     494             : 
     495             : static
     496             : void fd_blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
     497             :                             const uint32_t key[8], uint64_t counter,
     498             :                             bool increment_counter, uint8_t flags,
     499      132344 :                             uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
     500      132344 :   __m128i h_vecs[8] = {
     501      132344 :       set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
     502      132344 :       set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
     503      132344 :   };
     504      132344 :   __m128i counter_low_vec, counter_high_vec;
     505      132344 :   load_counters4(counter, increment_counter, &counter_low_vec,
     506      132344 :                  &counter_high_vec);
     507      132344 :   uint8_t block_flags = flags | flags_start;
     508             : 
     509     1257553 :   for (size_t block = 0; block < blocks; block++) {
     510     1125209 :     if (block + 1 == blocks) {
     511      132344 :       block_flags |= flags_end;
     512      132344 :     }
     513     1125209 :     __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
     514     1125209 :     __m128i block_flags_vec = set1_128(block_flags);
     515     1125209 :     __m128i msg_vecs[16];
     516     1125209 :     transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
     517             : 
     518     1125209 :     __m128i v[16] = {
     519     1125209 :         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
     520     1125209 :         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
     521     1125209 :         set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
     522     1125209 :         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     523     1125209 :     };
     524     1125209 :     round_fn4(v, msg_vecs, 0);
     525     1125209 :     round_fn4(v, msg_vecs, 1);
     526     1125209 :     round_fn4(v, msg_vecs, 2);
     527     1125209 :     round_fn4(v, msg_vecs, 3);
     528     1125209 :     round_fn4(v, msg_vecs, 4);
     529     1125209 :     round_fn4(v, msg_vecs, 5);
     530     1125209 :     round_fn4(v, msg_vecs, 6);
     531     1125209 :     h_vecs[0] = xor_128(v[0], v[8]);
     532     1125209 :     h_vecs[1] = xor_128(v[1], v[9]);
     533     1125209 :     h_vecs[2] = xor_128(v[2], v[10]);
     534     1125209 :     h_vecs[3] = xor_128(v[3], v[11]);
     535     1125209 :     h_vecs[4] = xor_128(v[4], v[12]);
     536     1125209 :     h_vecs[5] = xor_128(v[5], v[13]);
     537     1125209 :     h_vecs[6] = xor_128(v[6], v[14]);
     538     1125209 :     h_vecs[7] = xor_128(v[7], v[15]);
     539             : 
     540     1125209 :     block_flags = flags;
     541     1125209 :   }
     542             : 
     543      132344 :   transpose_vecs_128(&h_vecs[0]);
     544      132344 :   transpose_vecs_128(&h_vecs[4]);
     545             :   // The first four vecs now contain the first half of each output, and the
     546             :   // second four vecs contain the second half of each output.
     547      132344 :   storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
     548      132344 :   storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
     549      132344 :   storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
     550      132344 :   storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
     551      132344 :   storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
     552      132344 :   storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
     553      132344 :   storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
     554      132344 :   storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
     555      132344 : }
     556             : 
     557             : /*
     558             :  * ----------------------------------------------------------------------------
     559             :  * hash8_avx512
     560             :  * ----------------------------------------------------------------------------
     561             :  */
     562             : 
     563    42759752 : INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
     564    42759752 :   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
     565    42759752 :   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
     566    42759752 :   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
     567    42759752 :   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
     568    42759752 :   v[0] = add_256(v[0], v[4]);
     569    42759752 :   v[1] = add_256(v[1], v[5]);
     570    42759752 :   v[2] = add_256(v[2], v[6]);
     571    42759752 :   v[3] = add_256(v[3], v[7]);
     572    42759752 :   v[12] = xor_256(v[12], v[0]);
     573    42759752 :   v[13] = xor_256(v[13], v[1]);
     574    42759752 :   v[14] = xor_256(v[14], v[2]);
     575    42759752 :   v[15] = xor_256(v[15], v[3]);
     576    42759752 :   v[12] = rot16_256(v[12]);
     577    42759752 :   v[13] = rot16_256(v[13]);
     578    42759752 :   v[14] = rot16_256(v[14]);
     579    42759752 :   v[15] = rot16_256(v[15]);
     580    42759752 :   v[8] = add_256(v[8], v[12]);
     581    42759752 :   v[9] = add_256(v[9], v[13]);
     582    42759752 :   v[10] = add_256(v[10], v[14]);
     583    42759752 :   v[11] = add_256(v[11], v[15]);
     584    42759752 :   v[4] = xor_256(v[4], v[8]);
     585    42759752 :   v[5] = xor_256(v[5], v[9]);
     586    42759752 :   v[6] = xor_256(v[6], v[10]);
     587    42759752 :   v[7] = xor_256(v[7], v[11]);
     588    42759752 :   v[4] = rot12_256(v[4]);
     589    42759752 :   v[5] = rot12_256(v[5]);
     590    42759752 :   v[6] = rot12_256(v[6]);
     591    42759752 :   v[7] = rot12_256(v[7]);
     592    42759752 :   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
     593    42759752 :   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
     594    42759752 :   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
     595    42759752 :   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
     596    42759752 :   v[0] = add_256(v[0], v[4]);
     597    42759752 :   v[1] = add_256(v[1], v[5]);
     598    42759752 :   v[2] = add_256(v[2], v[6]);
     599    42759752 :   v[3] = add_256(v[3], v[7]);
     600    42759752 :   v[12] = xor_256(v[12], v[0]);
     601    42759752 :   v[13] = xor_256(v[13], v[1]);
     602    42759752 :   v[14] = xor_256(v[14], v[2]);
     603    42759752 :   v[15] = xor_256(v[15], v[3]);
     604    42759752 :   v[12] = rot8_256(v[12]);
     605    42759752 :   v[13] = rot8_256(v[13]);
     606    42759752 :   v[14] = rot8_256(v[14]);
     607    42759752 :   v[15] = rot8_256(v[15]);
     608    42759752 :   v[8] = add_256(v[8], v[12]);
     609    42759752 :   v[9] = add_256(v[9], v[13]);
     610    42759752 :   v[10] = add_256(v[10], v[14]);
     611    42759752 :   v[11] = add_256(v[11], v[15]);
     612    42759752 :   v[4] = xor_256(v[4], v[8]);
     613    42759752 :   v[5] = xor_256(v[5], v[9]);
     614    42759752 :   v[6] = xor_256(v[6], v[10]);
     615    42759752 :   v[7] = xor_256(v[7], v[11]);
     616    42759752 :   v[4] = rot7_256(v[4]);
     617    42759752 :   v[5] = rot7_256(v[5]);
     618    42759752 :   v[6] = rot7_256(v[6]);
     619    42759752 :   v[7] = rot7_256(v[7]);
     620             : 
     621    42759752 :   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
     622    42759752 :   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
     623    42759752 :   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
     624    42759752 :   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
     625    42759752 :   v[0] = add_256(v[0], v[5]);
     626    42759752 :   v[1] = add_256(v[1], v[6]);
     627    42759752 :   v[2] = add_256(v[2], v[7]);
     628    42759752 :   v[3] = add_256(v[3], v[4]);
     629    42759752 :   v[15] = xor_256(v[15], v[0]);
     630    42759752 :   v[12] = xor_256(v[12], v[1]);
     631    42759752 :   v[13] = xor_256(v[13], v[2]);
     632    42759752 :   v[14] = xor_256(v[14], v[3]);
     633    42759752 :   v[15] = rot16_256(v[15]);
     634    42759752 :   v[12] = rot16_256(v[12]);
     635    42759752 :   v[13] = rot16_256(v[13]);
     636    42759752 :   v[14] = rot16_256(v[14]);
     637    42759752 :   v[10] = add_256(v[10], v[15]);
     638    42759752 :   v[11] = add_256(v[11], v[12]);
     639    42759752 :   v[8] = add_256(v[8], v[13]);
     640    42759752 :   v[9] = add_256(v[9], v[14]);
     641    42759752 :   v[5] = xor_256(v[5], v[10]);
     642    42759752 :   v[6] = xor_256(v[6], v[11]);
     643    42759752 :   v[7] = xor_256(v[7], v[8]);
     644    42759752 :   v[4] = xor_256(v[4], v[9]);
     645    42759752 :   v[5] = rot12_256(v[5]);
     646    42759752 :   v[6] = rot12_256(v[6]);
     647    42759752 :   v[7] = rot12_256(v[7]);
     648    42759752 :   v[4] = rot12_256(v[4]);
     649    42759752 :   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
     650    42759752 :   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
     651    42759752 :   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
     652    42759752 :   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
     653    42759752 :   v[0] = add_256(v[0], v[5]);
     654    42759752 :   v[1] = add_256(v[1], v[6]);
     655    42759752 :   v[2] = add_256(v[2], v[7]);
     656    42759752 :   v[3] = add_256(v[3], v[4]);
     657    42759752 :   v[15] = xor_256(v[15], v[0]);
     658    42759752 :   v[12] = xor_256(v[12], v[1]);
     659    42759752 :   v[13] = xor_256(v[13], v[2]);
     660    42759752 :   v[14] = xor_256(v[14], v[3]);
     661    42759752 :   v[15] = rot8_256(v[15]);
     662    42759752 :   v[12] = rot8_256(v[12]);
     663    42759752 :   v[13] = rot8_256(v[13]);
     664    42759752 :   v[14] = rot8_256(v[14]);
     665    42759752 :   v[10] = add_256(v[10], v[15]);
     666    42759752 :   v[11] = add_256(v[11], v[12]);
     667    42759752 :   v[8] = add_256(v[8], v[13]);
     668    42759752 :   v[9] = add_256(v[9], v[14]);
     669    42759752 :   v[5] = xor_256(v[5], v[10]);
     670    42759752 :   v[6] = xor_256(v[6], v[11]);
     671    42759752 :   v[7] = xor_256(v[7], v[8]);
     672    42759752 :   v[4] = xor_256(v[4], v[9]);
     673    42759752 :   v[5] = rot7_256(v[5]);
     674    42759752 :   v[6] = rot7_256(v[6]);
     675    42759752 :   v[7] = rot7_256(v[7]);
     676    42759752 :   v[4] = rot7_256(v[4]);
     677    42759752 : }
     678             : 
     679    12877353 : INLINE void transpose_vecs_256(__m256i vecs[8]) {
     680             :   // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
     681             :   // is 22/33/66/77.
     682    12877353 :   __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
     683    12877353 :   __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
     684    12877353 :   __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
     685    12877353 :   __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
     686    12877353 :   __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
     687    12877353 :   __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
     688    12877353 :   __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
     689    12877353 :   __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
     690             : 
     691             :   // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
     692             :   // 11/33.
     693    12877353 :   __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
     694    12877353 :   __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
     695    12877353 :   __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
     696    12877353 :   __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
     697    12877353 :   __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
     698    12877353 :   __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
     699    12877353 :   __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
     700    12877353 :   __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
     701             : 
     702             :   // Interleave 128-bit lanes.
     703    12877353 :   vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
     704    12877353 :   vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
     705    12877353 :   vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
     706    12877353 :   vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
     707    12877353 :   vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
     708    12877353 :   vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
     709    12877353 :   vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
     710    12877353 :   vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
     711    12877353 : }
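
The three stages work at doubling granularity: 32-bit unpacks, 64-bit
unpacks, then 128-bit permutes to cross the in-lane boundary that the AVX2
unpack instructions cannot. The net effect is an 8x8 transpose of 32-bit
words, i.e. (scalar model, illustrative):

  static inline void transpose8x8_model(uint32_t m[8][8]) {
    for (int i = 0; i < 8; i++)
      for (int j = i + 1; j < 8; j++) {
        uint32_t t = m[i][j]; m[i][j] = m[j][i]; m[j][i] = t;
      }
  }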
     712             : 
     713             : INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
     714     6108536 :                                 size_t block_offset, __m256i out[16]) {
     715     6108536 :   out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
     716     6108536 :   out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
     717     6108536 :   out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
     718     6108536 :   out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
     719     6108536 :   out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
     720     6108536 :   out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
     721     6108536 :   out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
     722     6108536 :   out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
     723     6108536 :   out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
     724     6108536 :   out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
     725     6108536 :   out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
     726     6108536 :   out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
     727     6108536 :   out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
     728     6108536 :   out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
     729     6108536 :   out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
     730     6108536 :   out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
     731    54976824 :   for (size_t i = 0; i < 8; ++i) {
     732    48868288 :     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
     733    48868288 :   }
     734     6108536 :   transpose_vecs_256(&out[0]);
     735     6108536 :   transpose_vecs_256(&out[8]);
     736     6108536 : }
     737             : 
     738             : INLINE void load_counters8(uint64_t counter, bool increment_counter,
     739      660281 :                            __m256i *out_lo, __m256i *out_hi) {
     740      660281 :   int64_t mask = (increment_counter ? ~0 : 0);
     741      660281 :   __m512i mask_vec = _mm512_set1_epi64(mask);
     742      660281 :   __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
     743      660281 :   deltas = _mm512_and_si512(mask_vec, deltas);
     744      660281 :   __m512i counters =
     745      660281 :       _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
     746      660281 :   *out_lo = _mm512_cvtepi64_epi32(counters);
     747      660281 :   *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
     748      660281 : }
     749             : 
     750             : static
     751             : void fd_blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
     752             :                             const uint32_t key[8], uint64_t counter,
     753             :                             bool increment_counter, uint8_t flags,
     754      660281 :                             uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
     755      660281 :   __m256i h_vecs[8] = {
     756      660281 :       set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
     757      660281 :       set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
     758      660281 :   };
     759      660281 :   __m256i counter_low_vec, counter_high_vec;
     760      660281 :   load_counters8(counter, increment_counter, &counter_low_vec,
     761      660281 :                  &counter_high_vec);
     762      660281 :   uint8_t block_flags = flags | flags_start;
     763             : 
     764     6768817 :   for (size_t block = 0; block < blocks; block++) {
     765     6108536 :     if (block + 1 == blocks) {
     766      660281 :       block_flags |= flags_end;
     767      660281 :     }
     768     6108536 :     __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
     769     6108536 :     __m256i block_flags_vec = set1_256(block_flags);
     770     6108536 :     __m256i msg_vecs[16];
     771     6108536 :     transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
     772             : 
     773     6108536 :     __m256i v[16] = {
     774     6108536 :         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
     775     6108536 :         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
     776     6108536 :         set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
     777     6108536 :         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     778     6108536 :     };
     779     6108536 :     round_fn8(v, msg_vecs, 0);
     780     6108536 :     round_fn8(v, msg_vecs, 1);
     781     6108536 :     round_fn8(v, msg_vecs, 2);
     782     6108536 :     round_fn8(v, msg_vecs, 3);
     783     6108536 :     round_fn8(v, msg_vecs, 4);
     784     6108536 :     round_fn8(v, msg_vecs, 5);
     785     6108536 :     round_fn8(v, msg_vecs, 6);
     786     6108536 :     h_vecs[0] = xor_256(v[0], v[8]);
     787     6108536 :     h_vecs[1] = xor_256(v[1], v[9]);
     788     6108536 :     h_vecs[2] = xor_256(v[2], v[10]);
     789     6108536 :     h_vecs[3] = xor_256(v[3], v[11]);
     790     6108536 :     h_vecs[4] = xor_256(v[4], v[12]);
     791     6108536 :     h_vecs[5] = xor_256(v[5], v[13]);
     792     6108536 :     h_vecs[6] = xor_256(v[6], v[14]);
     793     6108536 :     h_vecs[7] = xor_256(v[7], v[15]);
     794             : 
     795     6108536 :     block_flags = flags;
     796     6108536 :   }
     797             : 
     798      660281 :   transpose_vecs_256(h_vecs);
     799      660281 :   storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
     800      660281 :   storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
     801      660281 :   storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
     802      660281 :   storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
     803      660281 :   storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
     804      660281 :   storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
     805      660281 :   storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
     806      660281 :   storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
     807      660281 : }
     808             : 
     809             : /*
     810             :  * ----------------------------------------------------------------------------
     811             :  * hash16_avx512
     812             :  * ----------------------------------------------------------------------------
     813             :  */
     814             : 
     815           0 : INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
     816           0 :   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
     817           0 :   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
     818           0 :   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
     819           0 :   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
     820           0 :   v[0] = add_512(v[0], v[4]);
     821           0 :   v[1] = add_512(v[1], v[5]);
     822           0 :   v[2] = add_512(v[2], v[6]);
     823           0 :   v[3] = add_512(v[3], v[7]);
     824           0 :   v[12] = xor_512(v[12], v[0]);
     825           0 :   v[13] = xor_512(v[13], v[1]);
     826           0 :   v[14] = xor_512(v[14], v[2]);
     827           0 :   v[15] = xor_512(v[15], v[3]);
     828           0 :   v[12] = rot16_512(v[12]);
     829           0 :   v[13] = rot16_512(v[13]);
     830           0 :   v[14] = rot16_512(v[14]);
     831           0 :   v[15] = rot16_512(v[15]);
     832           0 :   v[8] = add_512(v[8], v[12]);
     833           0 :   v[9] = add_512(v[9], v[13]);
     834           0 :   v[10] = add_512(v[10], v[14]);
     835           0 :   v[11] = add_512(v[11], v[15]);
     836           0 :   v[4] = xor_512(v[4], v[8]);
     837           0 :   v[5] = xor_512(v[5], v[9]);
     838           0 :   v[6] = xor_512(v[6], v[10]);
     839           0 :   v[7] = xor_512(v[7], v[11]);
     840           0 :   v[4] = rot12_512(v[4]);
     841           0 :   v[5] = rot12_512(v[5]);
     842           0 :   v[6] = rot12_512(v[6]);
     843           0 :   v[7] = rot12_512(v[7]);
     844           0 :   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
     845           0 :   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
     846           0 :   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
     847           0 :   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
     848           0 :   v[0] = add_512(v[0], v[4]);
     849           0 :   v[1] = add_512(v[1], v[5]);
     850           0 :   v[2] = add_512(v[2], v[6]);
     851           0 :   v[3] = add_512(v[3], v[7]);
     852           0 :   v[12] = xor_512(v[12], v[0]);
     853           0 :   v[13] = xor_512(v[13], v[1]);
     854           0 :   v[14] = xor_512(v[14], v[2]);
     855           0 :   v[15] = xor_512(v[15], v[3]);
     856           0 :   v[12] = rot8_512(v[12]);
     857           0 :   v[13] = rot8_512(v[13]);
     858           0 :   v[14] = rot8_512(v[14]);
     859           0 :   v[15] = rot8_512(v[15]);
     860           0 :   v[8] = add_512(v[8], v[12]);
     861           0 :   v[9] = add_512(v[9], v[13]);
     862           0 :   v[10] = add_512(v[10], v[14]);
     863           0 :   v[11] = add_512(v[11], v[15]);
     864           0 :   v[4] = xor_512(v[4], v[8]);
     865           0 :   v[5] = xor_512(v[5], v[9]);
     866           0 :   v[6] = xor_512(v[6], v[10]);
     867           0 :   v[7] = xor_512(v[7], v[11]);
     868           0 :   v[4] = rot7_512(v[4]);
     869           0 :   v[5] = rot7_512(v[5]);
     870           0 :   v[6] = rot7_512(v[6]);
     871           0 :   v[7] = rot7_512(v[7]);
     872             : 
     873           0 :   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
     874           0 :   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
     875           0 :   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
     876           0 :   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
     877           0 :   v[0] = add_512(v[0], v[5]);
     878           0 :   v[1] = add_512(v[1], v[6]);
     879           0 :   v[2] = add_512(v[2], v[7]);
     880           0 :   v[3] = add_512(v[3], v[4]);
     881           0 :   v[15] = xor_512(v[15], v[0]);
     882           0 :   v[12] = xor_512(v[12], v[1]);
     883           0 :   v[13] = xor_512(v[13], v[2]);
     884           0 :   v[14] = xor_512(v[14], v[3]);
     885           0 :   v[15] = rot16_512(v[15]);
     886           0 :   v[12] = rot16_512(v[12]);
     887           0 :   v[13] = rot16_512(v[13]);
     888           0 :   v[14] = rot16_512(v[14]);
     889           0 :   v[10] = add_512(v[10], v[15]);
     890           0 :   v[11] = add_512(v[11], v[12]);
     891           0 :   v[8] = add_512(v[8], v[13]);
     892           0 :   v[9] = add_512(v[9], v[14]);
     893           0 :   v[5] = xor_512(v[5], v[10]);
     894           0 :   v[6] = xor_512(v[6], v[11]);
     895           0 :   v[7] = xor_512(v[7], v[8]);
     896           0 :   v[4] = xor_512(v[4], v[9]);
     897           0 :   v[5] = rot12_512(v[5]);
     898           0 :   v[6] = rot12_512(v[6]);
     899           0 :   v[7] = rot12_512(v[7]);
     900           0 :   v[4] = rot12_512(v[4]);
     901           0 :   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
     902           0 :   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
     903           0 :   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
     904           0 :   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
     905           0 :   v[0] = add_512(v[0], v[5]);
     906           0 :   v[1] = add_512(v[1], v[6]);
     907           0 :   v[2] = add_512(v[2], v[7]);
     908           0 :   v[3] = add_512(v[3], v[4]);
     909           0 :   v[15] = xor_512(v[15], v[0]);
     910           0 :   v[12] = xor_512(v[12], v[1]);
     911           0 :   v[13] = xor_512(v[13], v[2]);
     912           0 :   v[14] = xor_512(v[14], v[3]);
     913           0 :   v[15] = rot8_512(v[15]);
     914           0 :   v[12] = rot8_512(v[12]);
     915           0 :   v[13] = rot8_512(v[13]);
     916           0 :   v[14] = rot8_512(v[14]);
     917           0 :   v[10] = add_512(v[10], v[15]);
     918           0 :   v[11] = add_512(v[11], v[12]);
     919           0 :   v[8] = add_512(v[8], v[13]);
     920           0 :   v[9] = add_512(v[9], v[14]);
     921           0 :   v[5] = xor_512(v[5], v[10]);
     922           0 :   v[6] = xor_512(v[6], v[11]);
     923           0 :   v[7] = xor_512(v[7], v[8]);
     924           0 :   v[4] = xor_512(v[4], v[9]);
     925           0 :   v[5] = rot7_512(v[5]);
     926           0 :   v[6] = rot7_512(v[6]);
     927           0 :   v[7] = rot7_512(v[7]);
     928           0 :   v[4] = rot7_512(v[4]);
     929           0 : }
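
For reference, each add/xor/rotate group in round_fn16 is one step of BLAKE3's G mixing function, applied to four state words at a time across all 16 SIMD lanes: the first half of the round mixes the columns of the 4x4 state, and the half shown above mixes its diagonals. A minimal scalar sketch of G (hypothetical helpers, not part of this file):

  static inline uint32_t rotr32(uint32_t w, unsigned c) {
    return (w >> c) | (w << (32 - c));
  }

  /* One G step on a single (a,b,c,d) quadruple with message words mx
     and my; round_fn16 performs this on four quadruples at once, with
     each __m512i holding one state word from 16 inputs. */
  static inline void g_ref(uint32_t *a, uint32_t *b, uint32_t *c,
                           uint32_t *d, uint32_t mx, uint32_t my) {
    *a = *a + *b + mx;
    *d = rotr32(*d ^ *a, 16);
    *c = *c + *d;
    *b = rotr32(*b ^ *c, 12);
    *a = *a + *b + my;
    *d = rotr32(*d ^ *a, 8);
    *c = *c + *d;
    *b = rotr32(*b ^ *c, 7);
  }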
     930             : 
     931             : // 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
     932             : #define LO_IMM8 0x88
     933             : 
     934           0 : INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
     935           0 :   return _mm512_shuffle_i32x4(a, b, LO_IMM8);
     936           0 : }
     937             : 
     938             : // 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
     939             : #define HI_IMM8 0xdd
     940             : 
     941           0 : INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
     942           0 :   return _mm512_shuffle_i32x4(a, b, HI_IMM8);
     943           0 : }
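
As a sanity check on the immediates: _mm512_shuffle_i32x4 selects two 128-bit lanes from a (the low two 2-bit fields of imm8) and two from b (the high two fields). A hedged illustration, not in the original file, that makes the selection visible by filling each lane with its index:

  static void demo_lane_shuffle(void) {
    /* lane0 holds 0s, lane1 holds 1s, ... (set_epi32 lists e15..e0) */
    __m512i a = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
    __m512i b = _mm512_set_epi32(7,7,7,7, 6,6,6,6, 5,5,5,5, 4,4,4,4);
    __m512i lo = unpack_lo_128(a, b); /* lanes, low to high: 0, 2, 4, 6 */
    __m512i hi = unpack_hi_128(a, b); /* lanes, low to high: 1, 3, 5, 7 */
    (void)lo; (void)hi;
  }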
     944             : 
     945           0 : INLINE void transpose_vecs_512(__m512i vecs[16]) {
     946             :   // Interleave 32-bit lanes. The _0 unpack is lanes
     947             :   // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
     948             :   // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
     949           0 :   __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
     950           0 :   __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
     951           0 :   __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
     952           0 :   __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
     953           0 :   __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
     954           0 :   __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
     955           0 :   __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
     956           0 :   __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
     957           0 :   __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
     958           0 :   __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
     959           0 :   __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
     960           0 :   __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
     961           0 :   __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
     962           0 :   __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
     963           0 :   __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
     964           0 :   __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
     965             : 
     966             :   // Interleave 64-bit lanes. The _0 unpack is lanes
     967             :   // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
     968             :   // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
     969             :   // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
     970             :   // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
     971           0 :   __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
     972           0 :   __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
     973           0 :   __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
     974           0 :   __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
     975           0 :   __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
     976           0 :   __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
     977           0 :   __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
     978           0 :   __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
     979           0 :   __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
     980           0 :   __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
     981           0 :   __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
     982           0 :   __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
     983           0 :   __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
     984           0 :   __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
     985           0 :   __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
     986           0 :   __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
     987             : 
     988             :   // Interleave 128-bit lanes. The _0 unpack is
     989             :   // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
     990             :   // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
     991           0 :   __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
     992           0 :   __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
     993           0 :   __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
     994           0 :   __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
     995           0 :   __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
     996           0 :   __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
     997           0 :   __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
     998           0 :   __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
     999           0 :   __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
    1000           0 :   __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
    1001           0 :   __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
    1002           0 :   __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
    1003           0 :   __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
    1004           0 :   __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
    1005           0 :   __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
    1006           0 :   __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
    1007             : 
    1008             :   // Interleave 128-bit lanes again for the final outputs.
    1009           0 :   vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
    1010           0 :   vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
    1011           0 :   vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
    1012           0 :   vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
    1013           0 :   vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
    1014           0 :   vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
    1015           0 :   vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
    1016           0 :   vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
    1017           0 :   vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
    1018           0 :   vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
    1019           0 :   vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
    1020           0 :   vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
    1021           0 :   vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
    1022           0 :   vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
    1023           0 :   vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
    1024           0 :   vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
    1025           0 : }
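
The net effect of the three interleave stages above is a plain transpose of a 16x16 matrix of 32-bit words. A scalar reference (hypothetical test helper, not part of this file) against which the vector version could be checked:

  /* Treat the 16 vectors as a row-major 16x16 matrix of uint32_t and
     swap words[r][c] with words[c][r]. */
  static void transpose_16x16_ref(uint32_t words[16][16]) {
    for (size_t r = 0; r < 16; r++) {
      for (size_t c = r + 1; c < 16; c++) {
        uint32_t tmp = words[r][c];
        words[r][c] = words[c][r];
        words[c][r] = tmp;
      }
    }
  }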
    1026             : 
    1027             : INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
    1028           0 :                                  size_t block_offset, __m512i out[16]) {
    1029           0 :   out[0] = loadu_512(&inputs[0][block_offset]);
    1030           0 :   out[1] = loadu_512(&inputs[1][block_offset]);
    1031           0 :   out[2] = loadu_512(&inputs[2][block_offset]);
    1032           0 :   out[3] = loadu_512(&inputs[3][block_offset]);
    1033           0 :   out[4] = loadu_512(&inputs[4][block_offset]);
    1034           0 :   out[5] = loadu_512(&inputs[5][block_offset]);
    1035           0 :   out[6] = loadu_512(&inputs[6][block_offset]);
    1036           0 :   out[7] = loadu_512(&inputs[7][block_offset]);
    1037           0 :   out[8] = loadu_512(&inputs[8][block_offset]);
    1038           0 :   out[9] = loadu_512(&inputs[9][block_offset]);
    1039           0 :   out[10] = loadu_512(&inputs[10][block_offset]);
    1040           0 :   out[11] = loadu_512(&inputs[11][block_offset]);
    1041           0 :   out[12] = loadu_512(&inputs[12][block_offset]);
    1042           0 :   out[13] = loadu_512(&inputs[13][block_offset]);
    1043           0 :   out[14] = loadu_512(&inputs[14][block_offset]);
    1044           0 :   out[15] = loadu_512(&inputs[15][block_offset]);
    1045           0 :   for (size_t i = 0; i < 16; ++i) {
    1046           0 :     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
    1047           0 :   }
    1048           0 :   transpose_vecs_512(out);
    1049           0 : }
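
After the transpose, out[w] holds message word w of the current 64-byte block for all 16 inputs, one input per 32-bit lane, which is exactly the layout the m[] argument of round_fn16 consumes. The prefetch loop touches each input 256 bytes (four blocks) past the current offset to warm the cache for upcoming iterations.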
    1050             : 
    1051             : INLINE void load_counters16(uint64_t counter, bool increment_counter,
    1052           0 :                             __m512i *out_lo, __m512i *out_hi) {
    1053           0 :   const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
    1054           0 :   const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    1055           0 :   const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
    1056           0 :   const __m512i low_words = _mm512_add_epi32(
    1057           0 :     _mm512_set1_epi32((int32_t)counter),
    1058           0 :     masked_deltas);
     1059             :   // A carry out of each low word occurred iff its high bit was 1 before
     1060             :   // the addition and 0 after it.
    1061             :   // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
    1062             :   // compute the carry bits here, and originally we did, but that intrinsic is
    1063             :   // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
    1064           0 :   const __m512i carries = _mm512_srli_epi32(
    1065           0 :     _mm512_andnot_si512(
    1066           0 :         low_words, // 0 after (gets inverted by andnot)
    1067           0 :         _mm512_set1_epi32((int32_t)counter)), // and 1 before
    1068           0 :     31);
    1069           0 :   const __m512i high_words = _mm512_add_epi32(
    1070           0 :     _mm512_set1_epi32((int32_t)(counter >> 32)),
    1071           0 :     carries);
    1072           0 :   *out_lo = low_words;
    1073           0 :   *out_hi = high_words;
    1074           0 : }
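
The andnot/srli pair implements, per 32-bit lane, the carry test described in the comment above. A scalar equivalent (hypothetical helper, illustrative only; valid here because each per-lane delta is at most 15, far below 2^31):

  static uint32_t counter_high_ref(uint64_t counter, uint32_t delta) {
    uint32_t lo_before = (uint32_t)counter;
    uint32_t lo_after  = lo_before + delta;
    uint32_t carry     = (lo_before & ~lo_after) >> 31; /* 1 before, 0 after */
    return (uint32_t)(counter >> 32) + carry;
  }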
    1075             : 
    1076             : static
    1077             : void fd_blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
    1078             :                              const uint32_t key[8], uint64_t counter,
    1079             :                              bool increment_counter, uint8_t flags,
    1080             :                              uint8_t flags_start, uint8_t flags_end,
    1081           0 :                              uint8_t *out) {
    1082           0 :   __m512i h_vecs[8] = {
    1083           0 :       set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
    1084           0 :       set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
    1085           0 :   };
    1086           0 :   __m512i counter_low_vec, counter_high_vec;
    1087           0 :   load_counters16(counter, increment_counter, &counter_low_vec,
    1088           0 :                   &counter_high_vec);
    1089           0 :   uint8_t block_flags = flags | flags_start;
    1090             : 
    1091           0 :   for (size_t block = 0; block < blocks; block++) {
    1092           0 :     if (block + 1 == blocks) {
    1093           0 :       block_flags |= flags_end;
    1094           0 :     }
    1095           0 :     __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
    1096           0 :     __m512i block_flags_vec = set1_512(block_flags);
    1097           0 :     __m512i msg_vecs[16];
    1098           0 :     transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
    1099             : 
    1100           0 :     __m512i v[16] = {
    1101           0 :         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
    1102           0 :         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
    1103           0 :         set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
    1104           0 :         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
    1105           0 :     };
    1106           0 :     round_fn16(v, msg_vecs, 0);
    1107           0 :     round_fn16(v, msg_vecs, 1);
    1108           0 :     round_fn16(v, msg_vecs, 2);
    1109           0 :     round_fn16(v, msg_vecs, 3);
    1110           0 :     round_fn16(v, msg_vecs, 4);
    1111           0 :     round_fn16(v, msg_vecs, 5);
    1112           0 :     round_fn16(v, msg_vecs, 6);
    1113           0 :     h_vecs[0] = xor_512(v[0], v[8]);
    1114           0 :     h_vecs[1] = xor_512(v[1], v[9]);
    1115           0 :     h_vecs[2] = xor_512(v[2], v[10]);
    1116           0 :     h_vecs[3] = xor_512(v[3], v[11]);
    1117           0 :     h_vecs[4] = xor_512(v[4], v[12]);
    1118           0 :     h_vecs[5] = xor_512(v[5], v[13]);
    1119           0 :     h_vecs[6] = xor_512(v[6], v[14]);
    1120           0 :     h_vecs[7] = xor_512(v[7], v[15]);
    1121             : 
    1122           0 :     block_flags = flags;
    1123           0 :   }
    1124             : 
    1125             :   // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
    1126             :   // state vectors. Pad the matrix with zeros. After transposition, store the
    1127             :   // lower half of each vector.
    1128           0 :   __m512i padded[16] = {
    1129           0 :       h_vecs[0],   h_vecs[1],   h_vecs[2],   h_vecs[3],
    1130           0 :       h_vecs[4],   h_vecs[5],   h_vecs[6],   h_vecs[7],
    1131           0 :       set1_512(0), set1_512(0), set1_512(0), set1_512(0),
    1132           0 :       set1_512(0), set1_512(0), set1_512(0), set1_512(0),
    1133           0 :   };
    1134           0 :   transpose_vecs_512(padded);
    1135           0 :   _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
    1136           0 :   _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
    1137           0 :   _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
    1138           0 :   _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
    1139           0 :   _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
    1140           0 :   _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
    1141           0 :   _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
    1142           0 :   _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
    1143           0 :   _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
    1144           0 :   _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
    1145           0 :   _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
    1146           0 :   _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
    1147           0 :   _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
    1148           0 :   _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
    1149           0 :   _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
    1150           0 :   _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
    1151           0 : }
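
After the padded transpose, each padded[i] carries the eight chaining-value words of input i in its low 256 bits, so the masked 256-bit stores lay the results out contiguously (assuming BLAKE3_OUT_LEN is 32, as in the reference implementation):

  /* Illustrative output layout:
       &out[ 0 * BLAKE3_OUT_LEN] -> CV of inputs[0]
       &out[ 1 * BLAKE3_OUT_LEN] -> CV of inputs[1]
       ...
       &out[15 * BLAKE3_OUT_LEN] -> CV of inputs[15] */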
    1152             : 
    1153             : /*
    1154             :  * ----------------------------------------------------------------------------
    1155             :  * hash_many_avx512
    1156             :  * ----------------------------------------------------------------------------
    1157             :  */
    1158             : 
    1159             : INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
    1160             :                             const uint32_t key[8], uint64_t counter,
    1161             :                             uint8_t flags, uint8_t flags_start,
    1162      529462 :                             uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
    1163      529462 :   uint32_t cv[8];
    1164      529462 :   memcpy(cv, key, BLAKE3_KEY_LEN);
    1165      529462 :   uint8_t block_flags = flags | flags_start;
    1166     5030534 :   while (blocks > 0) {
    1167     4501072 :     if (blocks == 1) {
    1168      529462 :       block_flags |= flags_end;
    1169      529462 :     }
    1170     4501072 :     fd_blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
    1171     4501072 :                                        block_flags);
    1172     4501072 :     input = &input[BLAKE3_BLOCK_LEN];
    1173     4501072 :     blocks -= 1;
    1174     4501072 :     block_flags = flags;
    1175     4501072 :   }
    1176      529462 :   memcpy(out, cv, BLAKE3_OUT_LEN);
    1177      529462 : }
    1178             : 
    1179             : void fd_blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
    1180             :                                 size_t blocks, const uint32_t key[8],
    1181             :                                 uint64_t counter, bool increment_counter,
    1182             :                                 uint8_t flags, uint8_t flags_start,
    1183     1057356 :                                 uint8_t flags_end, uint8_t *out) {
    1184     1057356 :   while (num_inputs >= 16) {
    1185           0 :     fd_blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
    1186           0 :                             flags_start, flags_end, out);
    1187           0 :     if (increment_counter) {
    1188           0 :       counter += 16;
    1189           0 :     }
    1190           0 :     inputs += 16;
    1191           0 :     num_inputs -= 16;
    1192           0 :     out = &out[16 * BLAKE3_OUT_LEN];
    1193           0 :   }
    1194     1717637 :   while (num_inputs >= 8) {
    1195      660281 :     fd_blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
    1196      660281 :                            flags_start, flags_end, out);
    1197      660281 :     if (increment_counter) {
    1198      363217 :       counter += 8;
    1199      363217 :     }
    1200      660281 :     inputs += 8;
    1201      660281 :     num_inputs -= 8;
    1202      660281 :     out = &out[8 * BLAKE3_OUT_LEN];
    1203      660281 :   }
    1204     1189700 :   while (num_inputs >= 4) {
    1205      132344 :     fd_blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
    1206      132344 :                            flags_start, flags_end, out);
    1207      132344 :     if (increment_counter) {
    1208       66191 :       counter += 4;
    1209       66191 :     }
    1210      132344 :     inputs += 4;
    1211      132344 :     num_inputs -= 4;
    1212      132344 :     out = &out[4 * BLAKE3_OUT_LEN];
    1213      132344 :   }
    1214     1586818 :   while (num_inputs > 0) {
    1215      529462 :     hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
    1216      529462 :                     flags_end, out);
    1217      529462 :     if (increment_counter) {
    1218      264774 :       counter += 1;
    1219      264774 :     }
    1220      529462 :     inputs += 1;
    1221      529462 :     num_inputs -= 1;
    1222      529462 :     out = &out[BLAKE3_OUT_LEN];
    1223      529462 :   }
    1224     1057356 : }
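
fd_blake3_hash_many_avx512 greedily peels off batches of 16, 8, and 4 inputs before falling back to one-at-a-time hashing, bumping the chunk counter by the batch width whenever increment_counter is set. A hedged call-site sketch (buffer names hypothetical; CHUNK_START/CHUNK_END are the block-flag constants from the reference implementation):

  /* Hash 21 single-block chunks in parallel: with num_inputs = 21 the
     loops above dispatch one 16-wide batch, one 4-wide batch, and one
     scalar call for the final input. */
  const uint8_t *chunks[21];           /* each assumed to point at 64 bytes */
  const uint32_t key_words[8] = { 0 }; /* placeholder; real callers pass the key or IV */
  uint8_t cvs[21 * BLAKE3_OUT_LEN];    /* one 32-byte chaining value per input */
  fd_blake3_hash_many_avx512(chunks, 21, /*blocks=*/1, key_words,
                             /*counter=*/0, /*increment_counter=*/true,
                             /*flags=*/0, CHUNK_START, CHUNK_END, cvs);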

Generated by: LCOV version 1.14