LCOV - code coverage report
Current view: top level - ballet/chacha - fd_chacha_rng_avx.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 84 84 100.0 %
Date: 2025-09-19 04:41:14 Functions: 4 4 100.0 %

          Line data    Source code
       1             : #include "fd_chacha_rng.h"
       2             : #include "../../util/simd/fd_avx.h"
       3             : #include <assert.h>
       4             : 
       5  2243438976 : #define wu_rol16(a) wb_exch_adj_pair( (a) )
       6  2243438976 : #define wu_rol12(a) wu_rol( (a), 12 )
       7  2243438976 : #define wu_rol7(a)  wu_rol( (a),  7 )
       8             : 
       9             : static inline __attribute__((always_inline)) wu_t
      10  2243438976 : wu_rol8( wu_t x ) {
      11  2243438976 :   wb_t const mask =
      12  2243438976 :     wb_bcast_hex( 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 );
      13  2243438976 :   return _mm256_shuffle_epi8( x, mask );
      14  2243438976 : }
      15             : 
      16             : __attribute__((always_inline)) static inline void
      17             : fd_chacha_rng_refill_avx( fd_chacha_rng_t * rng,
      18    30229236 :                           ulong             rnd2_cnt ) {
      19             : 
      20    30229236 :   wu_t iv0  = wu_bcast( 0x61707865U );
      21    30229236 :   wu_t iv1  = wu_bcast( 0x3320646eU );
      22    30229236 :   wu_t iv2  = wu_bcast( 0x79622d32U );
      23    30229236 :   wu_t iv3  = wu_bcast( 0x6b206574U );
      24    30229236 :   wb_t key  = wb_ld( rng->key );
      25    30229236 :   wu_t zero = wu_zero();
      26             : 
      27             :   /* Unpack key equivalent to:
      28             : 
      29             :        c4 = wu_bcast( (uint const *)(rng->key)[0] );
      30             :        c5 = wu_bcast( (uint const *)(rng->key)[1] );
      31             :        ...
      32             :        cB = wu_bcast( (uint const *)(rng->key)[7] ); */
      33             : 
      34    30229236 :   wu_t key_lo = _mm256_permute2x128_si256( key, key, 0x00 );  /* [0,1,2,3,0,1,2,3] */
      35    30229236 :   wu_t key_hi = _mm256_permute2x128_si256( key, key, 0x11 );  /* [4,5,6,7,4,5,6,7] */
      36    30229236 :   wu_t k0 = _mm256_shuffle_epi32( key_lo, 0x00 );
      37    30229236 :   wu_t k1 = _mm256_shuffle_epi32( key_lo, 0x55 );
      38    30229236 :   wu_t k2 = _mm256_shuffle_epi32( key_lo, 0xaa );
      39    30229236 :   wu_t k3 = _mm256_shuffle_epi32( key_lo, 0xff );
      40    30229236 :   wu_t k4 = _mm256_shuffle_epi32( key_hi, 0x00 );
      41    30229236 :   wu_t k5 = _mm256_shuffle_epi32( key_hi, 0x55 );
      42    30229236 :   wu_t k6 = _mm256_shuffle_epi32( key_hi, 0xaa );
      43    30229236 :   wu_t k7 = _mm256_shuffle_epi32( key_hi, 0xff );
      44             : 
      45             :   /* Derive block index */
      46             : 
      47    30229236 :   ulong idx = rng->buf_fill / FD_CHACHA_BLOCK_SZ;  /* really a right shift */
      48    30229236 :   wu_t idxs = wu_add( wu_bcast( idx ), wu( 0, 1, 2, 3, 4, 5, 6, 7 ) );
      49             : 
      50             :   /* Run through the round function */
      51             : 
      52    30229236 :   wu_t c0 = iv0;   wu_t c1 = iv1;   wu_t c2 = iv2;   wu_t c3 = iv3;
      53    30229236 :   wu_t c4 = k0;    wu_t c5 = k1;    wu_t c6 = k2;    wu_t c7 = k3;
      54    30229236 :   wu_t c8 = k4;    wu_t c9 = k5;    wu_t cA = k6;    wu_t cB = k7;
      55    30229236 :   wu_t cC = idxs;  wu_t cD = zero;  wu_t cE = zero;  wu_t cF = zero;
      56             : 
      57    30229236 : # define QUARTER_ROUND(a,b,c,d)                                        \
      58  2243438976 :   do {                                                                 \
      59  2243438976 :     a = wu_add( a, b ); d = wu_xor( d, a ); d = wu_rol16( d );         \
      60  2243438976 :     c = wu_add( c, d ); b = wu_xor( b, c ); b = wu_rol12( b );         \
      61  2243438976 :     a = wu_add( a, b ); d = wu_xor( d, a ); d = wu_rol8( d );          \
      62  2243438976 :     c = wu_add( c, d ); b = wu_xor( b, c ); b = wu_rol7( b );          \
      63  2243438976 :   } while(0)
      64             : 
      65   310659108 :   for( ulong i=0UL; i<rnd2_cnt; i++ ) {
      66   280429872 :     QUARTER_ROUND( c0, c4, c8, cC );
      67   280429872 :     QUARTER_ROUND( c1, c5, c9, cD );
      68   280429872 :     QUARTER_ROUND( c2, c6, cA, cE );
      69   280429872 :     QUARTER_ROUND( c3, c7, cB, cF );
      70   280429872 :     QUARTER_ROUND( c0, c5, cA, cF );
      71   280429872 :     QUARTER_ROUND( c1, c6, cB, cC );
      72   280429872 :     QUARTER_ROUND( c2, c7, c8, cD );
      73   280429872 :     QUARTER_ROUND( c3, c4, c9, cE );
      74   280429872 :   }
      75    30229236 : # undef QUARTER_ROUND
      76             : 
      77             :   /* Finalize */
      78             : 
      79    30229236 :   c0 = wu_add( c0, iv0  );
      80    30229236 :   c1 = wu_add( c1, iv1  );
      81    30229236 :   c2 = wu_add( c2, iv2  );
      82    30229236 :   c3 = wu_add( c3, iv3  );
      83    30229236 :   c4 = wu_add( c4, k0   );
      84    30229236 :   c5 = wu_add( c5, k1   );
      85    30229236 :   c6 = wu_add( c6, k2   );
      86    30229236 :   c7 = wu_add( c7, k3   );
      87    30229236 :   c8 = wu_add( c8, k4   );
      88    30229236 :   c9 = wu_add( c9, k5   );
      89    30229236 :   cA = wu_add( cA, k6   );
      90    30229236 :   cB = wu_add( cB, k7   );
      91    30229236 :   cC = wu_add( cC, idxs );
      92             :   //cD = wu_add( cD, zero );
      93             :   //cE = wu_add( cE, zero );
      94             :   //cF = wu_add( cF, zero );
      95             : 
      96             :   /* Transpose matrix to get output vector */
      97             : 
      98    30229236 :   wu_transpose_8x8( c0, c1, c2, c3, c4, c5, c6, c7,
      99    30229236 :                     c0, c1, c2, c3, c4, c5, c6, c7 );
     100    30229236 :   wu_transpose_8x8( c8, c9, cA, cB, cC, cD, cE, cF,
     101    30229236 :                     c8, c9, cA, cB, cC, cD, cE, cF );
     102             : 
     103             :   /* Update ring buffer */
     104             : 
     105    30229236 :   ulong  slot = rng->buf_fill % (8*FD_CHACHA_BLOCK_SZ);
     106    30229236 :   uint * out  = (uint *)rng->buf + (slot*2*FD_CHACHA_BLOCK_SZ);
     107    30229236 :   wu_st( out+0x00, c0 ); wu_st( out+0x08, c8 );
     108    30229236 :   wu_st( out+0x10, c1 ); wu_st( out+0x18, c9 );
     109    30229236 :   wu_st( out+0x20, c2 ); wu_st( out+0x28, cA );
     110    30229236 :   wu_st( out+0x30, c3 ); wu_st( out+0x38, cB );
     111    30229236 :   wu_st( out+0x40, c4 ); wu_st( out+0x48, cC );
     112    30229236 :   wu_st( out+0x50, c5 ); wu_st( out+0x58, cD );
     113    30229236 :   wu_st( out+0x60, c6 ); wu_st( out+0x68, cE );
     114    30229236 :   wu_st( out+0x70, c7 ); wu_st( out+0x78, cF );
     115             : 
     116             :   /* Update ring descriptor */
     117             : 
     118    30229236 :   rng->buf_fill += 8*FD_CHACHA_BLOCK_SZ;
     119    30229236 : }
     120             : 
     121             : void
     122     3643748 : fd_chacha8_rng_refill_avx( fd_chacha_rng_t * rng ) {
     123     3643748 :   fd_chacha_rng_refill_avx( rng, 4UL );
     124     3643748 : }
     125             : 
     126             : void
     127    26585488 : fd_chacha20_rng_refill_avx( fd_chacha_rng_t * rng ) {
     128    26585488 :   fd_chacha_rng_refill_avx( rng, 10UL );
     129    26585488 : }

Generated by: LCOV version 1.14