Line data Source code
1 : #include "fd_chacha.h"
2 : #include "../../util/simd/fd_sse.h"
3 :
4 :
5 : static void *
6 : fd_chacha_block_sse( void * _block,
7 : void const * _key,
8 : void const * _idx_nonce,
9 39599997 : ulong rnd2_cnt ) {
10 :
11 39599997 : uint * block = __builtin_assume_aligned( _block, 64UL );
12 39599997 : uint const * key = __builtin_assume_aligned( _key, 32UL );
13 39599997 : uint const * idx_nonce = __builtin_assume_aligned( _idx_nonce, 16UL );
14 :
15 : /* Construct the input ChaCha20 block state as the following
16 : matrix of little endian uint entries:
17 :
18 : cccccccc cccccccc cccccccc cccccccc
19 : kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk
20 : kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk
21 : bbbbbbbb nnnnnnnn nnnnnnnn nnnnnnnn
22 :
23 : Where
24 : c are the constants 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
25 : k is the input key
26 : b is the block index
27 : n is the nonce */
28 :
29 : /* Remember the input state for later use */
30 39599997 : vu_t row0_init = vu( 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U );
31 39599997 : vu_t row1_init = vu_ld( key );
32 39599997 : vu_t row2_init = vu_ld( key+4 );
33 39599997 : vu_t row3_init = vu_ld( idx_nonce );
34 :
35 39599997 : vu_t row0 = row0_init;
36 39599997 : vu_t row1 = row1_init;
37 39599997 : vu_t row2 = row2_init;
38 39599997 : vu_t row3 = row3_init;
39 :
40 : /* These rotates are a bit faster, and they're on the critical path,
41 : so this makes a difference. */
42 752399976 : #define ROTATE_LEFT_16( x ) _mm_shuffle_epi8( (x), vb( 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 ) )
43 752399976 : #define ROTATE_LEFT_08( x ) _mm_shuffle_epi8( (x), vb( 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 ) )
44 752399976 : #define ROTATE_LEFT_12( x ) vu_rol( (x), 12 )
45 752399976 : #define ROTATE_LEFT_07( x ) vu_rol( (x), 7 )
46 :
47 : /* Run the ChaCha round function 20 times.
48 : (Each iteration does a column round and a diagonal round.) */
49 415799985 : for( ulong i=0UL; i<rnd2_cnt; i++ ) {
50 : /* Column round */
51 376199988 : row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_16( row3 );
52 376199988 : row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_12( row1 );
53 376199988 : row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_08( row3 );
54 376199988 : row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_07( row1 );
55 :
56 376199988 : row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE( 0, 3, 2, 1 ) );
57 376199988 : row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
58 376199988 : row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE( 2, 1, 0, 3 ) );
59 :
60 : /* Diagonal round */
61 376199988 : row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_16( row3 );
62 376199988 : row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_12( row1 );
63 376199988 : row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_08( row3 );
64 376199988 : row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_07( row1 );
65 :
66 376199988 : row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
67 376199988 : row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
68 376199988 : row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
69 376199988 : }
70 39599997 : #undef ROTATE_LEFT_07
71 39599997 : #undef ROTATE_LEFT_12
72 39599997 : #undef ROTATE_LEFT_08
73 39599997 : #undef ROTATE_LEFT_16
74 :
75 :
76 : /* Complete the block by adding the input state */
77 39599997 : row0 = vu_add( row0, row0_init );
78 39599997 : row1 = vu_add( row1, row1_init );
79 39599997 : row2 = vu_add( row2, row2_init );
80 39599997 : row3 = vu_add( row3, row3_init );
81 :
82 39599997 : vu_st( block, row0 );
83 39599997 : vu_st( block+ 4, row1 );
84 39599997 : vu_st( block+ 8, row2 );
85 39599997 : vu_st( block+12, row3 );
86 :
87 39599997 : return _block;
88 39599997 : }
89 :
90 : void *
91 : fd_chacha8_block( void * _block,
92 : void const * _key,
93 3299997 : void const * _idx_nonce ) {
94 3299997 : return fd_chacha_block_sse( _block, _key, _idx_nonce, 4UL );
95 3299997 : }
96 :
97 : void *
98 : fd_chacha20_block( void * _block,
99 : void const * _key,
100 36300000 : void const * _idx_nonce ) {
101 36300000 : return fd_chacha_block_sse( _block, _key, _idx_nonce, 10UL );
102 36300000 : }
|