LCOV - code coverage report
Current view: top level - ballet/chacha20 - fd_chacha20_sse.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 46 46 100.0 %
Date: 2025-01-08 12:08:44 Functions: 1 1 100.0 %

          Line data    Source code
       1             : #include "fd_chacha20.h"
       2             : #include "../../util/simd/fd_sse.h"
       3             : 
       4             : /* fd_chacha20_block computes one 64-byte ChaCha20 block from _key and _idx_nonce, stores it at _block and returns _block. */
       5             : void *
       6             : fd_chacha20_block( void *       _block,
       7             :                    void const * _key,
       8    33000003 :                    void const * _idx_nonce ) {
       9             :   /* Alignment hints: the compiler is told to assume 64, 32 and 16 byte aligned pointers, so callers must provide them. */
      10    33000003 :   uint *       block     = __builtin_assume_aligned( _block,     64UL );
      11    33000003 :   uint const * key       = __builtin_assume_aligned( _key,       32UL );
      12    33000003 :   uint const * idx_nonce = __builtin_assume_aligned( _idx_nonce, 16UL );
      13             : 
      14             :   /* Construct the input ChaCha20 block state as the following 4x4
      15             :      matrix of little endian uint entries (one vector per row):
      16             : 
      17             :      cccccccc  cccccccc  cccccccc  cccccccc
      18             :      kkkkkkkk  kkkkkkkk  kkkkkkkk  kkkkkkkk
      19             :      kkkkkkkk  kkkkkkkk  kkkkkkkk  kkkkkkkk
      20             :      bbbbbbbb  nnnnnnnn  nnnnnnnn  nnnnnnnn
      21             : 
      22             :      Where
      23             :        c are the constants 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
      24             :        k is the input key (8 words read from _key)
      25             :        b is the block index (first word read from _idx_nonce)
      26             :        n is the nonce (remaining 3 words read from _idx_nonce) */
      27             : 
      28             :   /* Keep a copy of the input state; it is added back into the result after the rounds */
      29    33000003 :   vu_t row0_init = vu( 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U );
      30    33000003 :   vu_t row1_init = vu_ld( key       );
      31    33000003 :   vu_t row2_init = vu_ld( key+4     );
      32    33000003 :   vu_t row3_init = vu_ld( idx_nonce );
      33             :   /* Working copy of the state; the rounds below update these four rows in place */
      34    33000003 :   vu_t row0 = row0_init;
      35    33000003 :   vu_t row1 = row1_init;
      36    33000003 :   vu_t row2 = row2_init;
      37    33000003 :   vu_t row3 = row3_init;
      38             : 
      39             :   /* Rotating by 16 or 8 bits only permutes bytes, so those two use a byte shuffle.
      40             :      These rotates are a bit faster and on the critical path, so this makes a difference. */
      41   660000060 : #define ROTATE_LEFT_16( x ) _mm_shuffle_epi8( (x), vb( 2,3,0,1, 6,7,4,5, 10,11,8,9,  14,15,12,13 ) )
      42   660000060 : #define ROTATE_LEFT_08( x ) _mm_shuffle_epi8( (x), vb( 3,0,1,2, 7,4,5,6, 11,8,9,10,  15,12,13,14 ) )
      43   660000060 : #define ROTATE_LEFT_12( x ) vu_rol( (x), 12 )
      44   660000060 : #define ROTATE_LEFT_07( x ) vu_rol( (x),  7 )
      45             : 
      46             :   /* Run the ChaCha round function 20 times: 10 loop iterations,
      47             :      each doing a column round followed by a diagonal round. */
      48   363000033 :   for( ulong i=0UL; i<10UL; i++ ) {
      49             :     /* Column round: one quarter round per lane, i.e. on all four columns at once */
      50   330000030 :     row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_16( row3 );
      51   330000030 :     row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_12( row1 );
      52   330000030 :     row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_08( row3 );
      53   330000030 :     row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_07( row1 );
      54             :     /* Rotate rows 1, 2, 3 left by 1, 2, 3 lanes so that each diagonal lines up in one column */
      55   330000030 :     row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE( 0, 3, 2, 1 ) );
      56   330000030 :     row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
      57   330000030 :     row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE( 2, 1, 0, 3 ) );
      58             : 
      59             :     /* Diagonal round: the same quarter round, now acting on the diagonals */
      60   330000030 :     row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_16( row3 );
      61   330000030 :     row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_12( row1 );
      62   330000030 :     row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_08( row3 );
      63   330000030 :     row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_07( row1 );
      64             :     /* Rotate rows 1, 2, 3 back (by 3, 2, 1 lanes) to restore the column layout */
      65   330000030 :     row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
      66   330000030 :     row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
      67   330000030 :     row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
      68   330000030 :   }
      69    33000003 : #undef ROTATE_LEFT_07
      70    33000003 : #undef ROTATE_LEFT_12
      71    33000003 : #undef ROTATE_LEFT_08
      72    33000003 : #undef ROTATE_LEFT_16
      73             : 
      74             : 
      75             :   /* Complete the block by adding the saved input state to the permuted state */
      76    33000003 :   row0 = vu_add( row0, row0_init );
      77    33000003 :   row1 = vu_add( row1, row1_init );
      78    33000003 :   row2 = vu_add( row2, row2_init );
      79    33000003 :   row3 = vu_add( row3, row3_init );
      80             :   /* Store the four rows as 16 consecutive uints starting at block */
      81    33000003 :   vu_st( block,    row0 );
      82    33000003 :   vu_st( block+ 4, row1 );
      83    33000003 :   vu_st( block+ 8, row2 );
      84    33000003 :   vu_st( block+12, row3 );
      85             : 
      86    33000003 :   return _block;
      87    33000003 : }
      88             : 

Generated by: LCOV version 1.14