LCOV - code coverage report
Current view: top level - ballet/chacha - fd_chacha_rng_avx512.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 85 87 97.7 %
Date: 2025-09-19 04:41:14 Functions: 4 4 100.0 %

          Line data    Source code
       1             : #include "fd_chacha_rng.h"
       2             : #include "../../util/simd/fd_avx512.h"
       3             : #include <assert.h>
       4             : 
       5   637316944 : #define wwu_rol16(a) wwb_exch_adj_pair( (a) ) /* rotate-by-16 helper; presumably swaps the two 16-bit halves of every 32-bit lane -- TODO confirm against fd_avx512.h */
       6   637316944 : #define wwu_rol12(a) wwu_rol( (a), 12 ) /* wwu_rol with a count of 12 */
       7   637316944 : #define wwu_rol7(a)  wwu_rol( (a),  7 ) /* wwu_rol with a count of 7 */
       8             : 
       9             : static inline __attribute__((always_inline)) wwu_t
      10   637316944 : wwu_rol8( wwu_t x ) { /* rotate-by-8 helper implemented as a byte shuffle instead of shift/or */
      11   637316944 :   wwb_t const mask =
      12   637316944 :     wwb_bcast_hex( 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 ); /* per 4-byte group the source indices are 3,0,1,2; assumes arguments are listed lowest byte first -- TODO confirm */
      13   637316944 :   return _mm512_shuffle_epi8( x, mask ); /* byte shuffle; operates independently within each 128-bit lane */
      14   637316944 : }
      15             : 
      16             : static void /* Generates 16 blocks at once into rng->buf and advances rng->buf_fill; rnd2_cnt is the number of double rounds (iterations of the round loop below). */
      17             : fd_chacha_rng_refill_avx512( fd_chacha_rng_t * rng,
      18     8678024 :                              ulong             rnd2_cnt ) {
      19             : 
      20             :   /* This function should only be called if the buffer is empty, i.e. the read offset has caught up with the fill offset. */
      21     8678024 :   if( FD_UNLIKELY( rng->buf_off != rng->buf_fill ) ) {
      22           0 :     FD_LOG_CRIT(( "refill out of sync: buf_off=%lu buf_fill=%lu", rng->buf_off, rng->buf_fill ));
      23           0 :   }
      24             : 
      25     8678024 :   wwu_t iv0  = wwu_bcast( 0x61707865U ); /* iv0..iv3 spell "expand 32-byte k" when read as little-endian ASCII */
      26     8678024 :   wwu_t iv1  = wwu_bcast( 0x3320646eU );
      27     8678024 :   wwu_t iv2  = wwu_bcast( 0x79622d32U );
      28     8678024 :   wwu_t iv3  = wwu_bcast( 0x6b206574U );
      29     8678024 :   wwu_t zero = wwu_zero();
      30             : 
      31             :   /* Unpack key.  Equivalent to:
      32             : 
      33             :        k0 = wwu_bcast( ((uint const *)rng->key)[0] );
      34             :        k1 = wwu_bcast( ((uint const *)rng->key)[1] );
      35             :        ...
      36             :        k7 = wwu_bcast( ((uint const *)rng->key)[7] ); */
      37             : 
      38     8678024 :   __m128i key_lo_v = _mm_load_si128( (__m128i const *)rng->key   ); /* [0,1,2,3] (aligned load: needs a 16-byte aligned key) */
      39     8678024 :   __m128i key_hi_v = _mm_load_si128( (__m128i const *)rng->key+1 ); /* [4,5,6,7] (+1 steps one __m128i, i.e. 16 bytes) */
      40     8678024 :   wwu_t key_lo = _mm512_broadcast_i32x4( key_lo_v );  /* [0,1,2,3] repeated in all four 128-bit lanes */
      41     8678024 :   wwu_t key_hi = _mm512_broadcast_i32x4( key_hi_v );  /* [4,5,6,7] repeated in all four 128-bit lanes */
      42     8678024 :   wwu_t k0 = _mm512_shuffle_epi32( key_lo, 0x00 ); /* imm 0x00/0x55/0xaa/0xff replicates dword 0/1/2/3 of each 128-bit lane */
      43     8678024 :   wwu_t k1 = _mm512_shuffle_epi32( key_lo, 0x55 );
      44     8678024 :   wwu_t k2 = _mm512_shuffle_epi32( key_lo, 0xaa );
      45     8678024 :   wwu_t k3 = _mm512_shuffle_epi32( key_lo, 0xff );
      46     8678024 :   wwu_t k4 = _mm512_shuffle_epi32( key_hi, 0x00 );
      47     8678024 :   wwu_t k5 = _mm512_shuffle_epi32( key_hi, 0x55 );
      48     8678024 :   wwu_t k6 = _mm512_shuffle_epi32( key_hi, 0xaa );
      49     8678024 :   wwu_t k7 = _mm512_shuffle_epi32( key_hi, 0xff );
      50             : 
      51             :   /* Derive block index: lane j of idxs holds idx+j, so each lane works on a different block */
      52             : 
      53     8678024 :   ulong idx = rng->buf_fill / FD_CHACHA_BLOCK_SZ;  /* really a right shift */
      54     8678024 :   wwu_t idxs = wwu_add( wwu_bcast( idx ), wwu( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ) ); /* NOTE(review): idx is a ulong but is broadcast into what look like 32-bit lanes, and cD starts at zero below, so only the low 32 bits of the block index appear to enter the state -- confirm this is the intended counter width */
      55             : 
      56             :   /* Set up the 16-word state (one vector per state word, one lane per block) and run through the round function */
      57             : 
      58     8678024 :   wwu_t c0 = iv0;   wwu_t c1 = iv1;   wwu_t c2 = iv2;   wwu_t c3 = iv3;
      59     8678024 :   wwu_t c4 = k0;    wwu_t c5 = k1;    wwu_t c6 = k2;    wwu_t c7 = k3;
      60     8678024 :   wwu_t c8 = k4;    wwu_t c9 = k5;    wwu_t cA = k6;    wwu_t cB = k7;
      61     8678024 :   wwu_t cC = idxs;  wwu_t cD = zero;  wwu_t cE = zero;  wwu_t cF = zero;
      62             : 
      63     8678024 : # define QUARTER_ROUND(a,b,c,d)                                   \
      64   637316944 :   do {                                                            \
      65   637316944 :     a = wwu_add( a, b ); d = wwu_xor( d, a ); d = wwu_rol16( d ); \
      66   637316944 :     c = wwu_add( c, d ); b = wwu_xor( b, c ); b = wwu_rol12( b ); \
      67   637316944 :     a = wwu_add( a, b ); d = wwu_xor( d, a ); d = wwu_rol8( d );  \
      68   637316944 :     c = wwu_add( c, d ); b = wwu_xor( b, c ); b = wwu_rol7( b );  \
      69   637316944 :   } while(0)
      70             : 
      71    88342642 :   for( ulong i=0UL; i<rnd2_cnt; i++ ) { /* one iteration = four column quarter rounds followed by four diagonal quarter rounds */
      72    79664618 :     QUARTER_ROUND( c0, c4, c8, cC );
      73    79664618 :     QUARTER_ROUND( c1, c5, c9, cD );
      74    79664618 :     QUARTER_ROUND( c2, c6, cA, cE );
      75    79664618 :     QUARTER_ROUND( c3, c7, cB, cF );
      76    79664618 :     QUARTER_ROUND( c0, c5, cA, cF );
      77    79664618 :     QUARTER_ROUND( c1, c6, cB, cC );
      78    79664618 :     QUARTER_ROUND( c2, c7, c8, cD );
      79    79664618 :     QUARTER_ROUND( c3, c4, c9, cE );
      80    79664618 :   }
      81     8678024 : # undef QUARTER_ROUND
      82             : 
      83             :   /* Finalize: add the initial state back into the working state */
      84             : 
      85     8678024 :   c0 = wwu_add( c0, iv0  );
      86     8678024 :   c1 = wwu_add( c1, iv1  );
      87     8678024 :   c2 = wwu_add( c2, iv2  );
      88     8678024 :   c3 = wwu_add( c3, iv3  );
      89     8678024 :   c4 = wwu_add( c4, k0   );
      90     8678024 :   c5 = wwu_add( c5, k1   );
      91     8678024 :   c6 = wwu_add( c6, k2   );
      92     8678024 :   c7 = wwu_add( c7, k3   );
      93     8678024 :   c8 = wwu_add( c8, k4   );
      94     8678024 :   c9 = wwu_add( c9, k5   );
      95     8678024 :   cA = wwu_add( cA, k6   );
      96     8678024 :   cB = wwu_add( cB, k7   );
      97     8678024 :   cC = wwu_add( cC, idxs );
      98             :   /* cD, cE and cF were initialized to zero above, so adding the
      99             :      initial state back into them would be a no-op; those three
     100             :      adds are intentionally omitted. */
     101             : 
     102             :   /* Transpose matrix to get output vector: before, cN holds state word N of 16 different blocks; afterwards each vector is expected to hold the 16 words of one block */
     103             : 
     104     8678024 :   wwu_transpose_16x16( c0, c1, c2, c3, c4, c5, c6, c7,
     105     8678024 :                        c8, c9, cA, cB, cC, cD, cE, cF,
     106     8678024 :                        c0, c1, c2, c3, c4, c5, c6, c7,
     107     8678024 :                        c8, c9, cA, cB, cC, cD, cE, cF );
     108             : 
     109             :   /* Update ring buffer: the 16 vectors are stored back to back (0x10 uints apart) starting at rng->buf */
     110             : 
     111     8678024 :   uint * out = (uint *)rng->buf;
     112     8678024 :   wwu_st( out+0x00, c0 ); wwu_st( out+0x10, c1 );
     113     8678024 :   wwu_st( out+0x20, c2 ); wwu_st( out+0x30, c3 );
     114     8678024 :   wwu_st( out+0x40, c4 ); wwu_st( out+0x50, c5 );
     115     8678024 :   wwu_st( out+0x60, c6 ); wwu_st( out+0x70, c7 );
     116     8678024 :   wwu_st( out+0x80, c8 ); wwu_st( out+0x90, c9 );
     117     8678024 :   wwu_st( out+0xa0, cA ); wwu_st( out+0xb0, cB );
     118     8678024 :   wwu_st( out+0xc0, cC ); wwu_st( out+0xd0, cD );
     119     8678024 :   wwu_st( out+0xe0, cE ); wwu_st( out+0xf0, cF );
     120             : 
     121             :   /* Update ring descriptor: account for the 16 blocks just written */
     122             : 
     123     8678024 :   rng->buf_fill += 16*FD_CHACHA_BLOCK_SZ;
     124     8678024 : }
     125             : 
     126             : void /* Refill entry point for the 8-round variant */
     127     1185937 : fd_chacha8_rng_refill_avx512( fd_chacha_rng_t * rng ) {
     128     1185937 :   fd_chacha_rng_refill_avx512( rng, 4UL ); /* 4 double rounds = 8 rounds */
     129     1185937 : }
     130             : 
     131             : void /* Refill entry point for the 20-round variant */
     132     7492087 : fd_chacha20_rng_refill_avx512( fd_chacha_rng_t * rng ) {
     133     7492087 :   fd_chacha_rng_refill_avx512( rng, 10UL ); /* 10 double rounds = 20 rounds */
     134     7492087 : }

Generated by: LCOV version 1.14