Line data Source code
1 : #include "fd_chacha_rng.h"
2 : #include "../../util/simd/fd_avx.h"
3 : #include <assert.h>
4 :
/* Per-lane 32-bit rotate-left helpers used by the ChaCha quarter round.
   The 7 and 12 bit rotates go through the generic wu_rol primitive.
   The 16 bit rotate is expressed as wb_exch_adj_pair (presumably a
   swap of adjacent byte pairs, i.e. of the two 16-bit halves of each
   32-bit lane -- confirm against fd_avx.h). */

#define wu_rol7(a)  wu_rol( (a),  7 )
#define wu_rol12(a) wu_rol( (a), 12 )
#define wu_rol16(a) wb_exch_adj_pair( (a) )
8 :
9 : static inline __attribute__((always_inline)) wu_t
10 1348923040 : wu_rol8( wu_t x ) {
11 1348923040 : wb_t const mask =
12 1348923040 : wb_bcast_hex( 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 );
13 1348923040 : return _mm256_shuffle_epi8( x, mask );
14 1348923040 : }
15 :
16 : __attribute__((always_inline)) static inline void
17 : fd_chacha_rng_refill_avx( fd_chacha_rng_t * rng,
18 21789248 : ulong rnd2_cnt ) {
19 :
20 21789248 : wu_t iv0 = wu_bcast( 0x61707865U );
21 21789248 : wu_t iv1 = wu_bcast( 0x3320646eU );
22 21789248 : wu_t iv2 = wu_bcast( 0x79622d32U );
23 21789248 : wu_t iv3 = wu_bcast( 0x6b206574U );
24 21789248 : wb_t key = wb_ld( rng->key );
25 21789248 : wu_t zero = wu_zero();
26 :
27 : /* Unpack key equivalent to:
28 :
29 : c4 = wu_bcast( (uint const *)(rng->key)[0] );
30 : c5 = wu_bcast( (uint const *)(rng->key)[1] );
31 : ...
32 : cB = wu_bcast( (uint const *)(rng->key)[7] ); */
33 :
34 21789248 : wu_t key_lo = _mm256_permute2x128_si256( key, key, 0x00 ); /* [0,1,2,3,0,1,2,3] */
35 21789248 : wu_t key_hi = _mm256_permute2x128_si256( key, key, 0x11 ); /* [4,5,6,7,4,5,6,7] */
36 21789248 : wu_t k0 = _mm256_shuffle_epi32( key_lo, 0x00 );
37 21789248 : wu_t k1 = _mm256_shuffle_epi32( key_lo, 0x55 );
38 21789248 : wu_t k2 = _mm256_shuffle_epi32( key_lo, 0xaa );
39 21789248 : wu_t k3 = _mm256_shuffle_epi32( key_lo, 0xff );
40 21789248 : wu_t k4 = _mm256_shuffle_epi32( key_hi, 0x00 );
41 21789248 : wu_t k5 = _mm256_shuffle_epi32( key_hi, 0x55 );
42 21789248 : wu_t k6 = _mm256_shuffle_epi32( key_hi, 0xaa );
43 21789248 : wu_t k7 = _mm256_shuffle_epi32( key_hi, 0xff );
44 :
45 : /* Derive block index */
46 :
47 21789248 : ulong idx = rng->buf_fill / FD_CHACHA_BLOCK_SZ; /* really a right shift */
48 21789248 : wu_t idxs = wu_add( wu_bcast( idx ), wu( 0, 1, 2, 3, 4, 5, 6, 7 ) );
49 :
50 : /* Run through the round function */
51 :
52 21789248 : wu_t c0 = iv0; wu_t c1 = iv1; wu_t c2 = iv2; wu_t c3 = iv3;
53 21789248 : wu_t c4 = k0; wu_t c5 = k1; wu_t c6 = k2; wu_t c7 = k3;
54 21789248 : wu_t c8 = k4; wu_t c9 = k5; wu_t cA = k6; wu_t cB = k7;
55 21789248 : wu_t cC = idxs; wu_t cD = zero; wu_t cE = zero; wu_t cF = zero;
56 :
57 21789248 : # define QUARTER_ROUND(a,b,c,d) \
58 1348923040 : do { \
59 1348923040 : a = wu_add( a, b ); d = wu_xor( d, a ); d = wu_rol16( d ); \
60 1348923040 : c = wu_add( c, d ); b = wu_xor( b, c ); b = wu_rol12( b ); \
61 1348923040 : a = wu_add( a, b ); d = wu_xor( d, a ); d = wu_rol8( d ); \
62 1348923040 : c = wu_add( c, d ); b = wu_xor( b, c ); b = wu_rol7( b ); \
63 1348923040 : } while(0)
64 :
65 190404628 : for( ulong i=0UL; i<rnd2_cnt; i++ ) {
66 168615380 : QUARTER_ROUND( c0, c4, c8, cC );
67 168615380 : QUARTER_ROUND( c1, c5, c9, cD );
68 168615380 : QUARTER_ROUND( c2, c6, cA, cE );
69 168615380 : QUARTER_ROUND( c3, c7, cB, cF );
70 168615380 : QUARTER_ROUND( c0, c5, cA, cF );
71 168615380 : QUARTER_ROUND( c1, c6, cB, cC );
72 168615380 : QUARTER_ROUND( c2, c7, c8, cD );
73 168615380 : QUARTER_ROUND( c3, c4, c9, cE );
74 168615380 : }
75 21789248 : # undef QUARTER_ROUND
76 :
77 : /* Finalize */
78 :
79 21789248 : c0 = wu_add( c0, iv0 );
80 21789248 : c1 = wu_add( c1, iv1 );
81 21789248 : c2 = wu_add( c2, iv2 );
82 21789248 : c3 = wu_add( c3, iv3 );
83 21789248 : c4 = wu_add( c4, k0 );
84 21789248 : c5 = wu_add( c5, k1 );
85 21789248 : c6 = wu_add( c6, k2 );
86 21789248 : c7 = wu_add( c7, k3 );
87 21789248 : c8 = wu_add( c8, k4 );
88 21789248 : c9 = wu_add( c9, k5 );
89 21789248 : cA = wu_add( cA, k6 );
90 21789248 : cB = wu_add( cB, k7 );
91 21789248 : cC = wu_add( cC, idxs );
92 : //cD = wu_add( cD, zero );
93 : //cE = wu_add( cE, zero );
94 : //cF = wu_add( cF, zero );
95 :
96 : /* Transpose matrix to get output vector */
97 :
98 21789248 : wu_transpose_8x8( c0, c1, c2, c3, c4, c5, c6, c7,
99 21789248 : c0, c1, c2, c3, c4, c5, c6, c7 );
100 21789248 : wu_transpose_8x8( c8, c9, cA, cB, cC, cD, cE, cF,
101 21789248 : c8, c9, cA, cB, cC, cD, cE, cF );
102 :
103 : /* Update ring buffer */
104 :
105 21789248 : ulong slot = rng->buf_fill % (8*FD_CHACHA_BLOCK_SZ);
106 21789248 : uint * out = (uint *)rng->buf + (slot*2*FD_CHACHA_BLOCK_SZ);
107 21789248 : wu_st( out+0x00, c0 ); wu_st( out+0x08, c8 );
108 21789248 : wu_st( out+0x10, c1 ); wu_st( out+0x18, c9 );
109 21789248 : wu_st( out+0x20, c2 ); wu_st( out+0x28, cA );
110 21789248 : wu_st( out+0x30, c3 ); wu_st( out+0x38, cB );
111 21789248 : wu_st( out+0x40, c4 ); wu_st( out+0x48, cC );
112 21789248 : wu_st( out+0x50, c5 ); wu_st( out+0x58, cD );
113 21789248 : wu_st( out+0x60, c6 ); wu_st( out+0x68, cE );
114 21789248 : wu_st( out+0x70, c7 ); wu_st( out+0x78, cF );
115 :
116 : /* Update ring descriptor */
117 :
118 21789248 : rng->buf_fill += 8*FD_CHACHA_BLOCK_SZ;
119 21789248 : }
120 :
121 : void
122 8212850 : fd_chacha8_rng_refill_avx( fd_chacha_rng_t * rng ) {
123 8212850 : fd_chacha_rng_refill_avx( rng, 4UL );
124 8212850 : }
125 :
126 : void
127 13576398 : fd_chacha20_rng_refill_avx( fd_chacha_rng_t * rng ) {
128 13576398 : fd_chacha_rng_refill_avx( rng, 10UL );
129 13576398 : }
|