#include "fd_chacha20.h"
#include "../../util/simd/fd_sse.h"

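/* Note on alignment (implied by the __builtin_assume_aligned calls
   below): callers must pass a 64 byte aligned block, a 32 byte aligned
   key and a 16 byte aligned idx_nonce. */
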
void *
fd_chacha20_block( void *       _block,
                   void const * _key,
                   void const * _idx_nonce ) {

  uint *       block     = __builtin_assume_aligned( _block,     64UL );
  uint const * key       = __builtin_assume_aligned( _key,       32UL );
  uint const * idx_nonce = __builtin_assume_aligned( _idx_nonce, 16UL );

  /* Construct the input ChaCha20 block state as the following
     matrix of little endian uint entries:

       cccccccc cccccccc cccccccc cccccccc
       kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk
       kkkkkkkk kkkkkkkk kkkkkkkk kkkkkkkk
       bbbbbbbb nnnnnnnn nnnnnnnn nnnnnnnn

     Where
       c are the constants 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
       k is the input key
       b is the block index
       n is the nonce */
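
  /* (This is the standard ChaCha20 state layout of RFC 8439: a 128 bit
     constant, a 256 bit key, a 32 bit block counter and a 96 bit nonce,
     all as little endian words.) */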

  /* Remember the input state for later use */
  vu_t row0_init = vu( 0x61707865U, 0x3320646eU, 0x79622d32U, 0x6b206574U );
  vu_t row1_init = vu_ld( key   );
  vu_t row2_init = vu_ld( key+4 );
  vu_t row3_init = vu_ld( idx_nonce );

  vu_t row0 = row0_init;
  vu_t row1 = row1_init;
  vu_t row2 = row2_init;
  vu_t row3 = row3_init;

  /* These rotates are a bit faster, and they're on the critical path,
     so this makes a difference. */
#define ROTATE_LEFT_16( x ) _mm_shuffle_epi8( (x), vb( 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 ) )
#define ROTATE_LEFT_08( x ) _mm_shuffle_epi8( (x), vb( 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 ) )
#define ROTATE_LEFT_12( x ) vu_rol( (x), 12 )
#define ROTATE_LEFT_07( x ) vu_rol( (x), 7 )
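  /* Note: ROTATE_LEFT_16/08 compute the same 32 bit lane rotate as
     vu_rol( x, 16 ) / vu_rol( x, 8 ) but use a single pshufb byte
     shuffle instead of a shift based rotate; 12 and 7 are not byte
     multiples, so those fall back to the generic vu_rol. */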

  /* Run the 20 ChaCha rounds as 10 double rounds.
     (Each loop iteration does a column round and a diagonal round.) */
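  /* For reference: the four statements of each round below are the four
     steps of the scalar ChaCha quarter round,
       a += b; d ^= a; d <<<= 16;
       c += d; b ^= c; b <<<= 12;
       a += b; d ^= a; d <<<=  8;
       c += d; b ^= c; b <<<=  7;
     applied to all four columns (or diagonals) of the state at once. */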
  for( ulong i=0UL; i<10UL; i++ ) {
    /* Column round */
    row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_16( row3 );
    row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_12( row1 );
    row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_08( row3 );
    row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_07( row1 );

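    /* Shuffle rows 1-3 so that each diagonal of the state lines up in a
       column for the diagonal round below */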
    row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
    row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE( 2, 1, 0, 3 ) );

    /* Diagonal round */
    row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_16( row3 );
    row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_12( row1 );
    row0 = vu_add( row0, row1 ); row3 = vu_xor( row3, row0 ); row3 = ROTATE_LEFT_08( row3 );
    row2 = vu_add( row2, row3 ); row1 = vu_xor( row1, row2 ); row1 = ROTATE_LEFT_07( row1 );

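    /* Undo the shuffles, restoring the column layout for the next
       iteration */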
    row1 = _mm_shuffle_epi32( row1, _MM_SHUFFLE( 2, 1, 0, 3 ) );
    row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE( 1, 0, 3, 2 ) );
    row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE( 0, 3, 2, 1 ) );
  }
#undef ROTATE_LEFT_07
#undef ROTATE_LEFT_12
#undef ROTATE_LEFT_08
#undef ROTATE_LEFT_16

  /* Complete the block by adding the input state */
  row0 = vu_add( row0, row0_init );
  row1 = vu_add( row1, row1_init );
  row2 = vu_add( row2, row2_init );
  row3 = vu_add( row3, row3_init );

  vu_st( block,    row0 );
  vu_st( block+ 4, row1 );
  vu_st( block+ 8, row2 );
  vu_st( block+12, row3 );
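  /* block now holds the 64 byte ChaCha20 output block as 16 little
     endian uints */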

  return _block;
}
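
/* Usage sketch (illustrative only, not part of the original file; the
   example function name and the toy key/nonce values are hypothetical).
   It derives one 64 byte keystream block for block index 0 while
   honoring the alignment requirements noted above. */

static inline void
fd_chacha20_block_example( void ) {
  uint block    [ 16 ] __attribute__((aligned(64))); /* output block, 64 byte aligned  */
  uint key      [  8 ] __attribute__((aligned(32))); /* 256 bit key,  32 byte aligned  */
  uint idx_nonce[  4 ] __attribute__((aligned(16))); /* counter+nonce, 16 byte aligned */

  for( ulong i=0UL; i<8UL; i++ ) key[ i ] = (uint)i; /* toy key, for illustration only */
  idx_nonce[ 0 ] = 0U;                               /* block index 0                  */
  idx_nonce[ 1 ] = 0U;                               /* 96 bit nonce (all zero here)   */
  idx_nonce[ 2 ] = 0U;
  idx_nonce[ 3 ] = 0U;

  fd_chacha20_block( block, key, idx_nonce );        /* 64 bytes of keystream in block */
}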