Line data Source code
1 : #define FD_SHA256_BATCH_IMPL 1
2 :
3 : #include "fd_sha256.h"
4 : #include "../../util/simd/fd_avx.h"
5 :
6 : FD_STATIC_ASSERT( FD_SHA256_BATCH_MAX==8UL, compat );
7 :
8 : void
9 : fd_sha256_private_batch_avx( ulong batch_cnt,
10 : void const * _batch_data,
11 : ulong const * batch_sz,
12 20559531 : void * const * _batch_hash ) {
13 :
14 : /* If the batch is too small, it's faster to hash each message in
15 : the batch sequentially. When SHA-NI instructions are available,
16 : the sequential implementation is itself faster, so a larger batch
17 : size is needed to justify the batched implementation. */
18 :
19 1545805 : # if FD_HAS_SHANI
20 1545805 : # define MIN_BATCH_CNT (6UL)
21 : # else
22 19013726 : # define MIN_BATCH_CNT (2UL)
23 19013726 : # endif
24 :
25 20559531 : if( FD_UNLIKELY( batch_cnt<MIN_BATCH_CNT ) ) {
26 4291599 : void const * const * batch_data = (void const * const *)_batch_data;
27 9089645 : for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ )
28 4798046 : fd_sha256_hash( batch_data[ batch_idx ], batch_sz[ batch_idx ], _batch_hash[ batch_idx ] );
29 4291599 : return;
30 4291599 : }
31 :
32 16267932 : # undef MIN_BATCH_CNT
33 :
34 : /* SHA appends to the end of each message 9 bytes of additional data
35 : (a message terminator byte and the big endian ulong with the
36 : message size in bits) and enough zero padding to make the message
37 : an integer number of blocks long. We compute the 1 or 2 tail
38 : blocks of each message here. We then process complete blocks of
39 : the original messages in place, switching to processing these tail
40 : blocks in the same pass toward the end. TODO: This code could
41 : probably be SIMD optimized slightly more (this is where all the
42 : really performance-suboptimal parts of SHA's design live, so it is
43 : just inherently gross). The main optimization would probably be to
44 : allow tail reading to use a faster memcpy and then maybe some
45 : vectorization of the bswap. */
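/* Worked example (illustrative numbers, assuming the usual 64 byte
   FD_SHA256_PRIVATE_BUF_MAX block size): a 100 byte message leaves
   36 straggler bytes, and 36+9 rounds up to one 64 byte tail block;
   a 120 byte message leaves 56 straggler bytes, and 56+9 = 65 rounds
   up to two tail blocks.  This is why the scratch area below reserves
   2*FD_SHA256_PRIVATE_BUF_MAX bytes per batch lane. */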
46 :
47 16267932 : ulong const * batch_data = (ulong const *)_batch_data;
48 :
49 16267932 : ulong batch_tail_data[ FD_SHA256_BATCH_MAX ] __attribute__((aligned(32)));
50 16267932 : ulong batch_tail_rem [ FD_SHA256_BATCH_MAX ] __attribute__((aligned(32)));
51 :
52 16267932 : uchar scratch[ FD_SHA256_BATCH_MAX*2UL*FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
53 16267932 : do {
54 16267932 : ulong scratch_free = (ulong)scratch;
55 :
56 16267932 : wv_t zero = wv_zero();
57 :
58 139311056 : for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {
59 :
60 : /* Allocate the tail blocks for this message */
61 :
62 123043124 : ulong data = batch_data[ batch_idx ];
63 123043124 : ulong sz = batch_sz [ batch_idx ];
64 :
65 123043124 : ulong tail_data = scratch_free;
66 123043124 : ulong tail_data_sz = sz & (FD_SHA256_PRIVATE_BUF_MAX-1UL);
67 123043124 : ulong tail_data_off = fd_ulong_align_dn( sz, FD_SHA256_PRIVATE_BUF_MAX );
68 123043124 : ulong tail_sz = fd_ulong_align_up( tail_data_sz+9UL, FD_SHA256_PRIVATE_BUF_MAX );
69 :
70 123043124 : batch_tail_data[ batch_idx ] = tail_data;
71 123043124 : batch_tail_rem [ batch_idx ] = tail_sz >> FD_SHA256_PRIVATE_LG_BUF_MAX;
72 :
73 123043124 : scratch_free += tail_sz;
74 :
75 : /* Populate the tail blocks. We first clear the blocks (note that
76 : it is okay to clobber bytes 64:127 if tail_sz is only 64, saving a
77 : nasty branch). Then we copy any straggler data bytes into the
78 : tail, terminate the message, and finally record the size of the
79 : message in bits at the end as a big endian ulong. */
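/* For example, for the 100 byte message above (36 straggler bytes,
   one 64 byte tail block), the tail ends up laid out as:

     bytes [ 0,36)  straggler message bytes
     byte  36       0x80 terminator
     bytes [37,56)  zero padding
     bytes [56,64)  message bit length 800 as a big endian ulong

   (illustrative layout; it follows directly from the stores below). */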
80 :
81 123043124 : wv_st( (ulong *) tail_data, zero );
82 123043124 : wv_st( (ulong *)(tail_data+32), zero );
83 123043124 : wv_st( (ulong *)(tail_data+64), zero );
84 123043124 : wv_st( (ulong *)(tail_data+96), zero );
85 :
86 123043124 : # if 1
87 : /* Quick experiments found that, once again, a straight memcpy is
88 : much slower here and that fd_memcpy is slightly slower than a
89 : site-optimized handrolled memcpy (though fd_memcpy would have a
90 : smaller L1I cache footprint). They also found that doing the
91 : below in a branchless way is slightly worse and that an ILP
92 : optimized version of the conditional calculation is about the
93 : same. They also found that vectorizing the overall loop and/or
94 : Duffing the vectorized loop did not provide noticeable
95 : performance improvements under various styles of memcpy. */
96 123043124 : ulong src = data + tail_data_off;
97 123043124 : ulong dst = tail_data;
98 123043124 : ulong rem = tail_data_sz;
99 153385816 : while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
100 258455602 : while( rem>= 8UL ) { *(ulong *)dst = FD_LOAD( ulong, src ); dst += 8UL; src += 8UL; rem -= 8UL; }
101 123043124 : if ( rem>= 4UL ) { *(uint *)dst = FD_LOAD( uint, src ); dst += 4UL; src += 4UL; rem -= 4UL; }
102 123043124 : if ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src ); dst += 2UL; src += 2UL; rem -= 2UL; }
103 123043124 : if ( rem ) { *(uchar *)dst = FD_LOAD( uchar, src ); dst++; }
104 123043124 : *(uchar *)dst = (uchar)0x80;
105 : # else
106 : fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
107 : *((uchar *)(tail_data+tail_data_sz)) = (uchar)0x80;
108 : # endif
109 :
110 123043124 : *((ulong *)(tail_data+tail_sz-8UL )) = fd_ulong_bswap( sz<<3 );
111 123043124 : }
112 16267932 : } while(0);
113 :
114 16267932 : wu_t s0 = wu_bcast( 0x6a09e667U );
115 16267932 : wu_t s1 = wu_bcast( 0xbb67ae85U );
116 16267932 : wu_t s2 = wu_bcast( 0x3c6ef372U );
117 16267932 : wu_t s3 = wu_bcast( 0xa54ff53aU );
118 16267932 : wu_t s4 = wu_bcast( 0x510e527fU );
119 16267932 : wu_t s5 = wu_bcast( 0x9b05688cU );
120 16267932 : wu_t s6 = wu_bcast( 0x1f83d9abU );
121 16267932 : wu_t s7 = wu_bcast( 0x5be0cd19U );
122 :
123 16267932 : wv_t wv_64 = wv_bcast( FD_SHA256_PRIVATE_BUF_MAX );
124 16267932 : wv_t W_sentinel = wv_bcast( (ulong)scratch );
125 16267932 : wc_t batch_lane = wc_unpack( (1<<batch_cnt)-1 );
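/* batch_lane marks lanes [0,batch_cnt) as active; e.g. a batch_cnt of
   6 unpacks mask 0x3f so lanes 6 and 7 are ignored throughout. */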
126 :
127 16267932 : wv_t tail_lo = wv_ld( batch_tail_data );
128 16267932 : wv_t tail_hi = wv_ld( batch_tail_data+4 );
129 :
130 16267932 : wv_t tail_rem_lo = wv_ld( batch_tail_rem );
131 16267932 : wv_t tail_rem_hi = wv_ld( batch_tail_rem+4 );
132 :
133 16267932 : wv_t W_lo = wv_ld( batch_data );
134 16267932 : wv_t W_hi = wv_ld( batch_data+4 );
135 :
136 16267932 : wv_t block_rem_lo = wv_notczero( wc_expand( batch_lane, 0 ),
137 16267932 : wv_add( wv_shr( wv_ld( batch_sz ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_lo ) );
138 16267932 : wv_t block_rem_hi = wv_notczero( wc_expand( batch_lane, 1 ),
139 16267932 : wv_add( wv_shr( wv_ld( batch_sz+4 ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_hi ) );
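/* block_rem is the per-lane count of 64 byte blocks left to process:
   the complete in-place blocks of the original message (sz>>6) plus
   the 1 or 2 scratch tail blocks built above.  Lanes at or beyond
   batch_cnt are forced to zero so they never become active. */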
140 246206218 : for(;;) {
141 246206218 : wc_t active_lane_lo = wv_to_wc( block_rem_lo );
142 246206218 : wc_t active_lane_hi = wv_to_wc( block_rem_hi );
143 246206218 : if( FD_UNLIKELY( !wc_any( wc_or( active_lane_lo, active_lane_hi ) ) ) ) break;
144 :
145 : /* Switch lanes that have hit the end of their in-place bulk
146 : processing to their out-of-place scratch tail regions as
147 : necessary. */
148 :
149 229938286 : W_lo = wv_if( wv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
150 229938286 : W_hi = wv_if( wv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );
151 :
152 : /* At this point, we have at least 1 block in this message segment
153 : pass that has not been processed. Load the next 64 bytes of
154 : each unprocessed block. Inactive lanes (e.g. message segments
155 : in this pass for which we've already processed all the blocks)
156 : will load garbage from a sentinel location (and the result of
157 : the state computations for the inactive lane will be ignored). */
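/* Note that the sentinel is the base of the scratch area above, so
   these dummy loads always read validly mapped memory; the
   active_lane masks decide further below which lanes' results are
   actually folded into the state. */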
158 :
159 229938286 : wv_t W03 = wv_if( active_lane_lo, W_lo, W_sentinel );
160 229938286 : uchar const * W0 = (uchar const *)wv_extract( W03, 0 );
161 229938286 : uchar const * W1 = (uchar const *)wv_extract( W03, 1 );
162 229938286 : uchar const * W2 = (uchar const *)wv_extract( W03, 2 );
163 229938286 : uchar const * W3 = (uchar const *)wv_extract( W03, 3 );
164 :
165 229938286 : wv_t W47 = wv_if( active_lane_hi, W_hi, W_sentinel );
166 229938286 : uchar const * W4 = (uchar const *)wv_extract( W47, 0 );
167 229938286 : uchar const * W5 = (uchar const *)wv_extract( W47, 1 );
168 229938286 : uchar const * W6 = (uchar const *)wv_extract( W47, 2 );
169 229938286 : uchar const * W7 = (uchar const *)wv_extract( W47, 3 );
170 :
171 229938286 : wu_t x0; wu_t x1; wu_t x2; wu_t x3; wu_t x4; wu_t x5; wu_t x6; wu_t x7;
172 229938286 : wu_transpose_8x8( wu_bswap( wu_ldu(W0 ) ), wu_bswap( wu_ldu(W1 ) ), wu_bswap( wu_ldu(W2 ) ), wu_bswap( wu_ldu(W3 ) ),
173 229938286 : wu_bswap( wu_ldu(W4 ) ), wu_bswap( wu_ldu(W5 ) ), wu_bswap( wu_ldu(W6 ) ), wu_bswap( wu_ldu(W7 ) ),
174 229938286 : x0, x1, x2, x3, x4, x5, x6, x7 );
175 :
176 229938286 : wu_t x8; wu_t x9; wu_t xa; wu_t xb; wu_t xc; wu_t xd; wu_t xe; wu_t xf;
177 229938286 : wu_transpose_8x8( wu_bswap( wu_ldu(W0+32) ), wu_bswap( wu_ldu(W1+32) ), wu_bswap( wu_ldu(W2+32) ), wu_bswap( wu_ldu(W3+32) ),
178 229938286 : wu_bswap( wu_ldu(W4+32) ), wu_bswap( wu_ldu(W5+32) ), wu_bswap( wu_ldu(W6+32) ), wu_bswap( wu_ldu(W7+32) ),
179 229938286 : x8, x9, xa, xb, xc, xd, xe, xf );
180 :
181 : /* Compute the SHA-256 state updates */
182 :
183 229938286 : wu_t a = s0; wu_t b = s1; wu_t c = s2; wu_t d = s3; wu_t e = s4; wu_t f = s5; wu_t g = s6; wu_t h = s7;
184 :
185 229938286 : static uint const K[64] = { /* FIXME: Reuse with other functions */
186 229938286 : 0x428a2f98U, 0x71374491U, 0xb5c0fbcfU, 0xe9b5dba5U, 0x3956c25bU, 0x59f111f1U, 0x923f82a4U, 0xab1c5ed5U,
187 229938286 : 0xd807aa98U, 0x12835b01U, 0x243185beU, 0x550c7dc3U, 0x72be5d74U, 0x80deb1feU, 0x9bdc06a7U, 0xc19bf174U,
188 229938286 : 0xe49b69c1U, 0xefbe4786U, 0x0fc19dc6U, 0x240ca1ccU, 0x2de92c6fU, 0x4a7484aaU, 0x5cb0a9dcU, 0x76f988daU,
189 229938286 : 0x983e5152U, 0xa831c66dU, 0xb00327c8U, 0xbf597fc7U, 0xc6e00bf3U, 0xd5a79147U, 0x06ca6351U, 0x14292967U,
190 229938286 : 0x27b70a85U, 0x2e1b2138U, 0x4d2c6dfcU, 0x53380d13U, 0x650a7354U, 0x766a0abbU, 0x81c2c92eU, 0x92722c85U,
191 229938286 : 0xa2bfe8a1U, 0xa81a664bU, 0xc24b8b70U, 0xc76c51a3U, 0xd192e819U, 0xd6990624U, 0xf40e3585U, 0x106aa070U,
192 229938286 : 0x19a4c116U, 0x1e376c08U, 0x2748774cU, 0x34b0bcb5U, 0x391c0cb3U, 0x4ed8aa4aU, 0x5b9cca4fU, 0x682e6ff3U,
193 229938286 : 0x748f82eeU, 0x78a5636fU, 0x84c87814U, 0x8cc70208U, 0x90befffaU, 0xa4506cebU, 0xbef9a3f7U, 0xc67178f2U,
194 229938286 : };
195 :
196 229938286 : # define Sigma0(x) wu_xor( wu_rol(x,30), wu_xor( wu_rol(x,19), wu_rol(x,10) ) )
197 229938286 : # define Sigma1(x) wu_xor( wu_rol(x,26), wu_xor( wu_rol(x,21), wu_rol(x, 7) ) )
198 229938286 : # define sigma0(x) wu_xor( wu_rol(x,25), wu_xor( wu_rol(x,14), wu_shr(x, 3) ) )
199 229938286 : # define sigma1(x) wu_xor( wu_rol(x,15), wu_xor( wu_rol(x,13), wu_shr(x,10) ) )
200 229938286 : # define Ch(x,y,z) wu_xor( wu_and(x,y), wu_andnot(x,z) )
201 229938286 : # define Maj(x,y,z) wu_xor( wu_and(x,y), wu_xor( wu_and(x,z), wu_and(y,z) ) )
202 229938286 : # define SHA_CORE(xi,ki) \
203 14716050304 : T1 = wu_add( wu_add(xi,ki), wu_add( wu_add( h, Sigma1(e) ), Ch(e, f, g) ) ); \
204 14716050304 : T2 = wu_add( Sigma0(a), Maj(a, b, c) ); \
205 14716050304 : h = g; \
206 14716050304 : g = f; \
207 14716050304 : f = e; \
208 14716050304 : e = wu_add( d, T1 ); \
209 14716050304 : d = c; \
210 14716050304 : c = b; \
211 14716050304 : b = a; \
212 14716050304 : a = wu_add( T1, T2 )
213 :
214 229938286 : wu_t T1;
215 229938286 : wu_t T2;
216 :
217 229938286 : SHA_CORE( x0, wu_bcast( K[ 0] ) );
218 229938286 : SHA_CORE( x1, wu_bcast( K[ 1] ) );
219 229938286 : SHA_CORE( x2, wu_bcast( K[ 2] ) );
220 229938286 : SHA_CORE( x3, wu_bcast( K[ 3] ) );
221 229938286 : SHA_CORE( x4, wu_bcast( K[ 4] ) );
222 229938286 : SHA_CORE( x5, wu_bcast( K[ 5] ) );
223 229938286 : SHA_CORE( x6, wu_bcast( K[ 6] ) );
224 229938286 : SHA_CORE( x7, wu_bcast( K[ 7] ) );
225 229938286 : SHA_CORE( x8, wu_bcast( K[ 8] ) );
226 229938286 : SHA_CORE( x9, wu_bcast( K[ 9] ) );
227 229938286 : SHA_CORE( xa, wu_bcast( K[10] ) );
228 229938286 : SHA_CORE( xb, wu_bcast( K[11] ) );
229 229938286 : SHA_CORE( xc, wu_bcast( K[12] ) );
230 229938286 : SHA_CORE( xd, wu_bcast( K[13] ) );
231 229938286 : SHA_CORE( xe, wu_bcast( K[14] ) );
232 229938286 : SHA_CORE( xf, wu_bcast( K[15] ) );
233 919753144 : for( ulong i=16UL; i<64UL; i+=16UL ) {
234 689814858 : x0 = wu_add( wu_add( x0, sigma0(x1) ), wu_add( sigma1(xe), x9 ) ); SHA_CORE( x0, wu_bcast( K[i ] ) );
235 689814858 : x1 = wu_add( wu_add( x1, sigma0(x2) ), wu_add( sigma1(xf), xa ) ); SHA_CORE( x1, wu_bcast( K[i+ 1UL] ) );
236 689814858 : x2 = wu_add( wu_add( x2, sigma0(x3) ), wu_add( sigma1(x0), xb ) ); SHA_CORE( x2, wu_bcast( K[i+ 2UL] ) );
237 689814858 : x3 = wu_add( wu_add( x3, sigma0(x4) ), wu_add( sigma1(x1), xc ) ); SHA_CORE( x3, wu_bcast( K[i+ 3UL] ) );
238 689814858 : x4 = wu_add( wu_add( x4, sigma0(x5) ), wu_add( sigma1(x2), xd ) ); SHA_CORE( x4, wu_bcast( K[i+ 4UL] ) );
239 689814858 : x5 = wu_add( wu_add( x5, sigma0(x6) ), wu_add( sigma1(x3), xe ) ); SHA_CORE( x5, wu_bcast( K[i+ 5UL] ) );
240 689814858 : x6 = wu_add( wu_add( x6, sigma0(x7) ), wu_add( sigma1(x4), xf ) ); SHA_CORE( x6, wu_bcast( K[i+ 6UL] ) );
241 689814858 : x7 = wu_add( wu_add( x7, sigma0(x8) ), wu_add( sigma1(x5), x0 ) ); SHA_CORE( x7, wu_bcast( K[i+ 7UL] ) );
242 689814858 : x8 = wu_add( wu_add( x8, sigma0(x9) ), wu_add( sigma1(x6), x1 ) ); SHA_CORE( x8, wu_bcast( K[i+ 8UL] ) );
243 689814858 : x9 = wu_add( wu_add( x9, sigma0(xa) ), wu_add( sigma1(x7), x2 ) ); SHA_CORE( x9, wu_bcast( K[i+ 9UL] ) );
244 689814858 : xa = wu_add( wu_add( xa, sigma0(xb) ), wu_add( sigma1(x8), x3 ) ); SHA_CORE( xa, wu_bcast( K[i+10UL] ) );
245 689814858 : xb = wu_add( wu_add( xb, sigma0(xc) ), wu_add( sigma1(x9), x4 ) ); SHA_CORE( xb, wu_bcast( K[i+11UL] ) );
246 689814858 : xc = wu_add( wu_add( xc, sigma0(xd) ), wu_add( sigma1(xa), x5 ) ); SHA_CORE( xc, wu_bcast( K[i+12UL] ) );
247 689814858 : xd = wu_add( wu_add( xd, sigma0(xe) ), wu_add( sigma1(xb), x6 ) ); SHA_CORE( xd, wu_bcast( K[i+13UL] ) );
248 689814858 : xe = wu_add( wu_add( xe, sigma0(xf) ), wu_add( sigma1(xc), x7 ) ); SHA_CORE( xe, wu_bcast( K[i+14UL] ) );
249 689814858 : xf = wu_add( wu_add( xf, sigma0(x0) ), wu_add( sigma1(xd), x8 ) ); SHA_CORE( xf, wu_bcast( K[i+15UL] ) );
250 689814858 : }
251 :
252 229938286 : # undef SHA_CORE
253 229938286 : # undef Sigma0
254 229938286 : # undef Sigma1
255 229938286 : # undef sigma0
256 229938286 : # undef sigma1
257 229938286 : # undef Ch
258 229938286 : # undef Maj
259 :
260 : /* Apply the state updates to the active lanes */
261 :
262 229938286 : wc_t active_lane = wc_narrow( active_lane_lo, active_lane_hi );
263 229938286 : s0 = wu_add( s0, wu_notczero( active_lane, a ) );
264 229938286 : s1 = wu_add( s1, wu_notczero( active_lane, b ) );
265 229938286 : s2 = wu_add( s2, wu_notczero( active_lane, c ) );
266 229938286 : s3 = wu_add( s3, wu_notczero( active_lane, d ) );
267 229938286 : s4 = wu_add( s4, wu_notczero( active_lane, e ) );
268 229938286 : s5 = wu_add( s5, wu_notczero( active_lane, f ) );
269 229938286 : s6 = wu_add( s6, wu_notczero( active_lane, g ) );
270 229938286 : s7 = wu_add( s7, wu_notczero( active_lane, h ) );
271 :
272 : /* Advance to the next message segment blocks. In pseudo code,
273 : the below is:
274 :
275 : W += 64; if( block_rem ) block_rem--;
276 :
277 : Since wc_to_wv_raw(false/true) is 0UL/~0UL, we can use wv_add /
278 : wc_to_wv_raw instead of wv_sub / wc_to_wv to save some ops.
279 : (Consider conditional increment / decrement operations?)
280 :
281 : Also since we do not load anything at W(lane) above unless
282 : block_rem(lane) is non-zero, we can omit vector conditional
283 : operations for W(lane) below to save some additional ops. */
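/* Concretely, an active lane's raw conditional is ~0UL, so adding it
   decrements that lane's block_rem by 1, while an inactive lane adds
   0UL and is left unchanged. */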
284 :
285 229938286 : W_lo = wv_add( W_lo, wv_64 );
286 229938286 : W_hi = wv_add( W_hi, wv_64 );
287 :
288 229938286 : block_rem_lo = wv_add( block_rem_lo, wc_to_wv_raw( active_lane_lo ) );
289 229938286 : block_rem_hi = wv_add( block_rem_hi, wc_to_wv_raw( active_lane_hi ) );
290 229938286 : }
291 :
292 : /* Store the results. FIXME: Probably could optimize the transpose
293 : further by taking into account needed stores (and then maybe go
294 : direct into memory ... would need a family of such transposed
295 : stores). */
296 :
297 16267932 : wu_transpose_8x8( s0,s1,s2,s3,s4,s5,s6,s7, s0,s1,s2,s3,s4,s5,s6,s7 );
298 :
299 16267932 : uint * const * batch_hash = (uint * const *)_batch_hash;
300 16267932 : switch( batch_cnt ) { /* application dependent prob */
301 14244050 : case 8UL: wu_stu( batch_hash[7], wu_bswap( s7 ) ); __attribute__((fallthrough));
302 14555700 : case 7UL: wu_stu( batch_hash[6], wu_bswap( s6 ) ); __attribute__((fallthrough));
303 14904828 : case 6UL: wu_stu( batch_hash[5], wu_bswap( s5 ) ); __attribute__((fallthrough));
304 15262964 : case 5UL: wu_stu( batch_hash[4], wu_bswap( s4 ) ); __attribute__((fallthrough));
305 15612846 : case 4UL: wu_stu( batch_hash[3], wu_bswap( s3 ) ); __attribute__((fallthrough));
306 15926872 : case 3UL: wu_stu( batch_hash[2], wu_bswap( s2 ) ); __attribute__((fallthrough));
307 16267932 : case 2UL: wu_stu( batch_hash[1], wu_bswap( s1 ) ); __attribute__((fallthrough));
308 16267932 : case 1UL: wu_stu( batch_hash[0], wu_bswap( s0 ) ); __attribute__((fallthrough));
309 16267932 : default: break;
310 16267932 : }
311 16267932 : }
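/* Usage sketch (illustrative; callers would normally drive this
   private kernel through the higher level batch API in fd_sha256.h,
   fd_sha256_batch_init / _add / _fini, rather than calling it
   directly).  The vectorized path loads all 8 lanes of the size and
   data arrays with aligned vector loads, so the sketch assumes
   FD_SHA256_BATCH_MAX sized, 32 byte aligned arrays even when fewer
   messages are hashed:

     ulong  data[ FD_SHA256_BATCH_MAX ] __attribute__((aligned(32)));
     ulong  sz  [ FD_SHA256_BATCH_MAX ] __attribute__((aligned(32)));
     void * hash[ FD_SHA256_BATCH_MAX ];

     data[0] = (ulong)msg0; sz[0] = msg0_sz; hash[0] = out0;   // out0 is 32 bytes
     data[1] = (ulong)msg1; sz[1] = msg1_sz; hash[1] = out1;   // out1 is 32 bytes

     fd_sha256_private_batch_avx( 2UL, data, sz, hash );

   msg0/msg1/out0/out1 and their sizes are hypothetical placeholders.
   _batch_data is the address of an array of message start addresses;
   the small-batch path reads it as void const * const * and the
   vectorized path as ulong const *, which alias the same bits here. */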