Line data Source code
1 : #include "fd_chacha_rng.h"
2 : #include "../../util/simd/fd_avx512.h"
3 : #include <assert.h>
4 :
5 465953696 : #define wwu_rol16(a) wwb_exch_adj_pair( (a) ) /* 16-bit rotate step of QUARTER_ROUND; presumably swaps the 16-bit halves of each 32-bit lane -- confirm against wwb_exch_adj_pair in fd_avx512.h */
6 465953696 : #define wwu_rol12(a) wwu_rol( (a), 12 ) /* 12-bit rotate step of QUARTER_ROUND, delegated to wwu_rol */
7 465953696 : #define wwu_rol7(a) wwu_rol( (a), 7 ) /* 7-bit rotate step of QUARTER_ROUND, delegated to wwu_rol */
8 :
9 : static inline __attribute__((always_inline)) wwu_t
10 465953696 : wwu_rol8( wwu_t x ) { /* 8-bit rotate step of QUARTER_ROUND, implemented as a byte shuffle of x instead of shifts */
11 465953696 : wwb_t const mask =
12 465953696 : wwb_bcast_hex( 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 ); /* per group of 4 bytes, destination byte i takes source byte (i+3)%4; presumably the 16-byte pattern is repeated across the whole vector -- confirm wwb_bcast_hex */
13 465953696 : return _mm512_shuffle_epi8( x, mask ); /* shuffles bytes within each 128-bit lane using the indices in mask */
14 465953696 : }
15 :
16 : static void
17 : fd_chacha_rng_refill_avx512( fd_chacha_rng_t * rng,
18 6959086 : ulong rnd2_cnt ) {
19 : /* Computes 16 ChaCha blocks at once (one block per vector lane) using rnd2_cnt double rounds keyed by rng->key, stores them at the start of rng->buf and advances rng->buf_fill by 16 blocks. */
20 : /* This function should only be called if the buffer is empty (buf_off==buf_fill); any other state is reported through FD_LOG_CRIT. */
21 6959086 : if( FD_UNLIKELY( rng->buf_off != rng->buf_fill ) ) {
22 0 : FD_LOG_CRIT(( "refill out of sync: buf_off=%lu buf_fill=%lu", rng->buf_off, rng->buf_fill ));
23 0 : }
24 :
25 6959086 : wwu_t iv0 = wwu_bcast( 0x61707865U ); /* "expa" (little endian ASCII) */
26 6959086 : wwu_t iv1 = wwu_bcast( 0x3320646eU ); /* "nd 3" */
27 6959086 : wwu_t iv2 = wwu_bcast( 0x79622d32U ); /* "2-by" */
28 6959086 : wwu_t iv3 = wwu_bcast( 0x6b206574U ); /* "te k" */
29 6959086 : wwu_t zero = wwu_zero();
30 :
31 : /* Unpack key equivalent to:
32 :
33 : k0 = wwu_bcast( ((uint const *)rng->key)[0] );
34 : k1 = wwu_bcast( ((uint const *)rng->key)[1] );
35 : ...
36 : k7 = wwu_bcast( ((uint const *)rng->key)[7] ); */
37 :
38 6959086 : __m128i key_lo_v = _mm_load_si128( (__m128i const *)rng->key ); /* [0,1,2,3]; aligned load, assumes rng->key is 16-byte aligned -- TODO confirm against fd_chacha_rng_t */
39 6959086 : __m128i key_hi_v = _mm_load_si128( (__m128i const *)rng->key+1 ); /* [4,5,6,7] */
40 6959086 : wwu_t key_lo = _mm512_broadcast_i32x4( key_lo_v ); /* [0,1,2,3] repeated in each of the four 128-bit lanes */
41 6959086 : wwu_t key_hi = _mm512_broadcast_i32x4( key_hi_v ); /* [4,5,6,7] repeated in each of the four 128-bit lanes */
42 6959086 : wwu_t k0 = _mm512_shuffle_epi32( key_lo, 0x00 ); /* each shuffle picks one word per 128-bit lane, so kN holds key word N in every element */
43 6959086 : wwu_t k1 = _mm512_shuffle_epi32( key_lo, 0x55 );
44 6959086 : wwu_t k2 = _mm512_shuffle_epi32( key_lo, 0xaa );
45 6959086 : wwu_t k3 = _mm512_shuffle_epi32( key_lo, 0xff );
46 6959086 : wwu_t k4 = _mm512_shuffle_epi32( key_hi, 0x00 );
47 6959086 : wwu_t k5 = _mm512_shuffle_epi32( key_hi, 0x55 );
48 6959086 : wwu_t k6 = _mm512_shuffle_epi32( key_hi, 0xaa );
49 6959086 : wwu_t k7 = _mm512_shuffle_epi32( key_hi, 0xff );
50 :
51 : /* Derive block index: element j of idxs is idx+j, so the 16 elements cover 16 consecutive blocks */
52 :
53 6959086 : ulong idx = rng->buf_fill / FD_CHACHA_BLOCK_SZ; /* really a right shift */
54 6959086 : wwu_t idxs = wwu_add( wwu_bcast( idx ), wwu( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ) ); /* NOTE(review): idx is a ulong but is broadcast into what look like 32-bit elements -- presumably only the low 32 bits are kept; confirm this is intended */
55 :
56 : /* Run through the round function. State word N lives in vector cN: c0..c3 constants, c4..cB key, cC block index, cD..cF zero. */
57 :
58 6959086 : wwu_t c0 = iv0; wwu_t c1 = iv1; wwu_t c2 = iv2; wwu_t c3 = iv3;
59 6959086 : wwu_t c4 = k0; wwu_t c5 = k1; wwu_t c6 = k2; wwu_t c7 = k3;
60 6959086 : wwu_t c8 = k4; wwu_t c9 = k5; wwu_t cA = k6; wwu_t cB = k7;
61 6959086 : wwu_t cC = idxs; wwu_t cD = zero; wwu_t cE = zero; wwu_t cF = zero;
62 :
63 6959086 : # define QUARTER_ROUND(a,b,c,d) \
64 465953696 : do { \
65 465953696 : a = wwu_add( a, b ); d = wwu_xor( d, a ); d = wwu_rol16( d ); \
66 465953696 : c = wwu_add( c, d ); b = wwu_xor( b, c ); b = wwu_rol12( b ); \
67 465953696 : a = wwu_add( a, b ); d = wwu_xor( d, a ); d = wwu_rol8( d ); \
68 465953696 : c = wwu_add( c, d ); b = wwu_xor( b, c ); b = wwu_rol7( b ); \
69 465953696 : } while(0)
70 :
71 65203298 : for( ulong i=0UL; i<rnd2_cnt; i++ ) { /* each iteration is one double round: 4 column + 4 diagonal quarter rounds */
72 58244212 : QUARTER_ROUND( c0, c4, c8, cC ); /* columns of the 4x4 state */
73 58244212 : QUARTER_ROUND( c1, c5, c9, cD );
74 58244212 : QUARTER_ROUND( c2, c6, cA, cE );
75 58244212 : QUARTER_ROUND( c3, c7, cB, cF );
76 58244212 : QUARTER_ROUND( c0, c5, cA, cF ); /* diagonals of the 4x4 state */
77 58244212 : QUARTER_ROUND( c1, c6, cB, cC );
78 58244212 : QUARTER_ROUND( c2, c7, c8, cD );
79 58244212 : QUARTER_ROUND( c3, c4, c9, cE );
80 58244212 : }
81 6959086 : # undef QUARTER_ROUND
82 :
83 : /* Finalize: add the initial state back into the permuted state, word by word */
84 :
85 6959086 : c0 = wwu_add( c0, iv0 );
86 6959086 : c1 = wwu_add( c1, iv1 );
87 6959086 : c2 = wwu_add( c2, iv2 );
88 6959086 : c3 = wwu_add( c3, iv3 );
89 6959086 : c4 = wwu_add( c4, k0 );
90 6959086 : c5 = wwu_add( c5, k1 );
91 6959086 : c6 = wwu_add( c6, k2 );
92 6959086 : c7 = wwu_add( c7, k3 );
93 6959086 : c8 = wwu_add( c8, k4 );
94 6959086 : c9 = wwu_add( c9, k5 );
95 6959086 : cA = wwu_add( cA, k6 );
96 6959086 : cB = wwu_add( cB, k7 );
97 6959086 : cC = wwu_add( cC, idxs );
98 : //cD = wwu_add( cD, zero ); (left disabled: the initial value of cD..cF is zero, so the add would change nothing)
99 : //cE = wwu_add( cE, zero );
100 : //cF = wwu_add( cF, zero );
101 :
102 : /* Transpose matrix to get output vector (in place: inputs and outputs are the same 16 vectors) */
103 :
104 6959086 : wwu_transpose_16x16( c0, c1, c2, c3, c4, c5, c6, c7,
105 6959086 : c8, c9, cA, cB, cC, cD, cE, cF,
106 6959086 : c0, c1, c2, c3, c4, c5, c6, c7,
107 6959086 : c8, c9, cA, cB, cC, cD, cE, cF );
108 :
109 : /* Update ring buffer: 16 stores of 16 uints each, written back to back from the start of rng->buf */
110 :
111 6959086 : uint * out = (uint *)rng->buf;
112 6959086 : wwu_st( out+0x00, c0 ); wwu_st( out+0x10, c1 );
113 6959086 : wwu_st( out+0x20, c2 ); wwu_st( out+0x30, c3 );
114 6959086 : wwu_st( out+0x40, c4 ); wwu_st( out+0x50, c5 );
115 6959086 : wwu_st( out+0x60, c6 ); wwu_st( out+0x70, c7 );
116 6959086 : wwu_st( out+0x80, c8 ); wwu_st( out+0x90, c9 );
117 6959086 : wwu_st( out+0xa0, cA ); wwu_st( out+0xb0, cB );
118 6959086 : wwu_st( out+0xc0, cC ); wwu_st( out+0xd0, cD );
119 6959086 : wwu_st( out+0xe0, cE ); wwu_st( out+0xf0, cF );
120 :
121 : /* Update ring descriptor */
122 :
123 6959086 : rng->buf_fill += 16*FD_CHACHA_BLOCK_SZ; /* 16 blocks were produced, one per vector element */
124 6959086 : }
125 :
126 : void
127 1891108 : fd_chacha8_rng_refill_avx512( fd_chacha_rng_t * rng ) { /* Refills rng via fd_chacha_rng_refill_avx512 with 4 double rounds (8 rounds). */
128 1891108 : fd_chacha_rng_refill_avx512( rng, 4UL );
129 1891108 : }
130 :
131 : void
132 5067978 : fd_chacha20_rng_refill_avx512( fd_chacha_rng_t * rng ) { /* Refills rng via fd_chacha_rng_refill_avx512 with 10 double rounds (20 rounds). */
133 5067978 : fd_chacha_rng_refill_avx512( rng, 10UL );
134 5067978 : }
|