#define FD_SHA512_BATCH_IMPL 1

#include "fd_sha512.h"
#include "../../util/simd/fd_avx.h"

FD_STATIC_ASSERT( FD_SHA512_BATCH_MAX==4UL, compat );

/* TODO: CONSIDER SSE IMPL FOR BATCH_CNT==2 CASE? */

void
fd_sha512_private_batch_avx( ulong          batch_cnt,
                             void const *   _batch_data,
                             ulong const *  batch_sz,
                             void * const * _batch_hash ) {

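  /* Batches of fewer than 2 messages have no lane level parallelism to
     exploit; delegate them directly to the serial implementation. */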
  if( FD_UNLIKELY( batch_cnt<2UL ) ) {
    void const * const * batch_data = (void const * const *)_batch_data;
    for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ )
      fd_sha512_hash( batch_data[ batch_idx ], batch_sz[ batch_idx ], _batch_hash[ batch_idx ] );
    return;
  }

  /* SHA appends to the end of each message 17 bytes of additional data
     (a message terminator byte and the big endian uint128 with the
     message size in bits) and enough zero padding to make the message
     an integer number of blocks long.  We compute the 1 or 2 tail
     blocks of each message here.  We then process complete blocks of
     the original messages in place, switching to processing these tail
     blocks in the same pass toward the end.  TODO: This code could
     probably be SIMD optimized slightly more (this is where all the
     really suboptimally designed parts of SHA live, so it is just
     inherently gross).  The main optimization would probably be to
     allow tail reading to use a faster memcpy and then maybe some
     vectorization of the bswap. */

  ulong const * batch_data = (ulong const *)_batch_data;

  ulong batch_tail_data[ FD_SHA512_BATCH_MAX ] __attribute__((aligned(32)));
  ulong batch_tail_rem [ FD_SHA512_BATCH_MAX ] __attribute__((aligned(32)));

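  /* Scratch holds the zero padded tail blocks: at most 2 blocks of
     FD_SHA512_PRIVATE_BUF_MAX bytes for each of the up to
     FD_SHA512_BATCH_MAX messages. */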
  uchar scratch[ FD_SHA512_BATCH_MAX*2UL*FD_SHA512_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
  do {
    ulong scratch_free = (ulong)scratch;

    wv_t zero = wv_zero();

    for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {

      /* Allocate the tail blocks for this message */

      ulong data = batch_data[ batch_idx ];
      ulong sz   = batch_sz  [ batch_idx ];

      ulong tail_data     = scratch_free;
      ulong tail_data_sz  = sz & (FD_SHA512_PRIVATE_BUF_MAX-1UL);
      ulong tail_data_off = fd_ulong_align_dn( sz, FD_SHA512_PRIVATE_BUF_MAX );
      ulong tail_sz       = fd_ulong_align_up( tail_data_sz+17UL, FD_SHA512_PRIVATE_BUF_MAX );
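
      /* For example, sz==130 gives tail_data_sz==2, tail_data_off==128
         and tail_sz==128 (1 tail block), while sz==120 gives
         tail_data_sz==120, tail_data_off==0 and tail_sz==256 (2 tail
         blocks, as 120+17 bytes do not fit in one 128 byte block). */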

      batch_tail_data[ batch_idx ] = tail_data;
      batch_tail_rem [ batch_idx ] = tail_sz >> FD_SHA512_PRIVATE_LG_BUF_MAX;

      scratch_free += tail_sz;

      /* Populate the tail blocks.  We first clear the blocks (note that
         it is okay to clobber bytes 128:255 if tail_sz is only 128,
         saving a nasty branch).  Then we copy any straggler data bytes
         into the tail, terminate the message, and finally record the
         size of the message in bits at the end as a big endian
         uint128. */

      wv_st( (ulong *) tail_data,      zero ); wv_st( (ulong *)(tail_data+ 32), zero );
      wv_st( (ulong *)(tail_data+ 64), zero ); wv_st( (ulong *)(tail_data+ 96), zero );
      wv_st( (ulong *)(tail_data+128), zero ); wv_st( (ulong *)(tail_data+160), zero );
      wv_st( (ulong *)(tail_data+192), zero ); wv_st( (ulong *)(tail_data+224), zero );

# if 1 /* See notes in fd_sha256_batch_avx.c for more details here */
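      /* Copy the 0 to 127 straggler bytes into the tail with descending
         power-of-2 sized moves (32B vector, then 8B / 4B / 2B / 1B)
         instead of a general memcpy. */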
      ulong src = data + tail_data_off;
      ulong dst = tail_data;
      ulong rem = tail_data_sz;
      while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
      while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src ); dst += 8UL; src += 8UL; rem -= 8UL; }
      if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src ); dst += 4UL; src += 4UL; rem -= 4UL; }
      if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src ); dst += 2UL; src += 2UL; rem -= 2UL; }
      if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src ); dst++; }
      *(uchar *)dst = (uchar)0x80;
# else
      fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
      *((uchar *)(tail_data+tail_data_sz)) = (uchar)0x80;
# endif

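      /* Record the message length in bits as a big endian uint128 in
         the last 16 bytes of the tail: the high word is sz>>61 and the
         low word is sz<<3 (the two halves of 8*sz). */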
      *((ulong *)(tail_data+tail_sz-16UL)) = fd_ulong_bswap( sz>>61 );
      *((ulong *)(tail_data+tail_sz- 8UL)) = fd_ulong_bswap( sz<< 3 );
    }
  } while(0);

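  /* Initialize each lane's state with the SHA-512 initial hash values
     (FIPS 180-4 section 5.3.5). */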
  wv_t s0 = wv_bcast( 0x6a09e667f3bcc908UL );
  wv_t s1 = wv_bcast( 0xbb67ae8584caa73bUL );
  wv_t s2 = wv_bcast( 0x3c6ef372fe94f82bUL );
  wv_t s3 = wv_bcast( 0xa54ff53a5f1d36f1UL );
  wv_t s4 = wv_bcast( 0x510e527fade682d1UL );
  wv_t s5 = wv_bcast( 0x9b05688c2b3e6c1fUL );
  wv_t s6 = wv_bcast( 0x1f83d9abfb41bd6bUL );
  wv_t s7 = wv_bcast( 0x5be0cd19137e2179UL );

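  /* Set up the per lane streaming state: batch_lane masks the active
     lanes (wc_t lanes are 32-bit wide, so each 64-bit lane takes two
     mask bits), W points at the next block to read for each lane, and
     block_rem counts the blocks remaining per lane (bulk blocks of the
     original message plus tail blocks), zeroed for inactive lanes. */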
  wv_t wv_128     = wv_bcast( FD_SHA512_PRIVATE_BUF_MAX );
  wv_t W_sentinel = wv_bcast( (ulong)scratch );
  wc_t batch_lane = wc_unpack( (1<<(2*batch_cnt))-1 );
  wv_t tail       = wv_ld( batch_tail_data );
  wv_t tail_rem   = wv_ld( batch_tail_rem  );
  wv_t W          = wv_ld( batch_data      );
  wv_t block_rem  = wv_notczero( batch_lane, wv_add( wv_shr( wv_ld( batch_sz ), FD_SHA512_PRIVATE_LG_BUF_MAX ), tail_rem ) );
  for(;;) {
    wc_t active_lane = wv_to_wc( block_rem );
    if( FD_UNLIKELY( !wc_any( active_lane ) ) ) break;

    /* Switch lanes that have hit the end of their in-place bulk
       processing to their out-of-place scratch tail regions as
       necessary. */

    W = wv_if( wv_eq( block_rem, tail_rem ), tail, W );

    /* At this point, we have at least 1 block in this message segment
       pass that has not been processed.  Load the next 128 bytes of
       each unprocessed block.  Inactive lanes (e.g. message segments
       in this pass for which we've already processed all the blocks)
       will load garbage from a sentinel location (and the result of
       the state computations for the inactive lane will be ignored). */

    wv_t W03 = wv_if( active_lane, W, W_sentinel );
    uchar const * W0 = (uchar const *)wv_extract( W03, 0 );
    uchar const * W1 = (uchar const *)wv_extract( W03, 1 );
    uchar const * W2 = (uchar const *)wv_extract( W03, 2 );
    uchar const * W3 = (uchar const *)wv_extract( W03, 3 );

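    /* wv_bswap converts the big endian message words to host byte order
       and wv_transpose_4x4 rearranges them so that each xi holds
       message schedule word i for all 4 lanes. */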
    wv_t x0; wv_t x1; wv_t x2; wv_t x3;
    wv_transpose_4x4( wv_bswap( wv_ldu(W0   ) ), wv_bswap( wv_ldu(W1   ) ), wv_bswap( wv_ldu(W2   ) ), wv_bswap( wv_ldu(W3   ) ),
                      x0, x1, x2, x3 );

    wv_t x4; wv_t x5; wv_t x6; wv_t x7;
    wv_transpose_4x4( wv_bswap( wv_ldu(W0+32) ), wv_bswap( wv_ldu(W1+32) ), wv_bswap( wv_ldu(W2+32) ), wv_bswap( wv_ldu(W3+32) ),
                      x4, x5, x6, x7 );

    wv_t x8; wv_t x9; wv_t xa; wv_t xb;
    wv_transpose_4x4( wv_bswap( wv_ldu(W0+64) ), wv_bswap( wv_ldu(W1+64) ), wv_bswap( wv_ldu(W2+64) ), wv_bswap( wv_ldu(W3+64) ),
                      x8, x9, xa, xb );

    wv_t xc; wv_t xd; wv_t xe; wv_t xf;
    wv_transpose_4x4( wv_bswap( wv_ldu(W0+96) ), wv_bswap( wv_ldu(W1+96) ), wv_bswap( wv_ldu(W2+96) ), wv_bswap( wv_ldu(W3+96) ),
                      xc, xd, xe, xf );

    /* Compute the SHA-512 state updates */

    wv_t a = s0; wv_t b = s1; wv_t c = s2; wv_t d = s3; wv_t e = s4; wv_t f = s5; wv_t g = s6; wv_t h = s7;

    static ulong const K[80] = { /* FIXME: Reuse with other functions */
      0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL,
      0x3956c25bf348b538UL, 0x59f111f1b605d019UL, 0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL,
      0xd807aa98a3030242UL, 0x12835b0145706fbeUL, 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL,
      0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, 0x9bdc06a725c71235UL, 0xc19bf174cf692694UL,
      0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL, 0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL,
      0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL, 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL,
      0x983e5152ee66dfabUL, 0xa831c66d2db43210UL, 0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL,
      0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL, 0x06ca6351e003826fUL, 0x142929670a0e6e70UL,
      0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL, 0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL,
      0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, 0x81c2c92e47edaee6UL, 0x92722c851482353bUL,
      0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL, 0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL,
      0xd192e819d6ef5218UL, 0xd69906245565a910UL, 0xf40e35855771202aUL, 0x106aa07032bbd1b8UL,
      0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, 0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL,
      0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL, 0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL,
      0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL, 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL,
      0x90befffa23631e28UL, 0xa4506cebde82bde9UL, 0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL,
      0xca273eceea26619cUL, 0xd186b8c721c0c207UL, 0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL,
      0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL, 0x113f9804bef90daeUL, 0x1b710b35131c471bUL,
      0x28db77f523047d84UL, 0x32caab7b40c72493UL, 0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL,
      0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL
    };

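    /* FIPS 180-4 section 4.1.3 logical functions, vectorized so that
       each 64-bit lane advances an independent message. */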
# define Sigma0(x)  wv_xor( wv_ror(x,28), wv_xor( wv_ror(x,34), wv_ror(x,39) ) )
# define Sigma1(x)  wv_xor( wv_ror(x,14), wv_xor( wv_ror(x,18), wv_ror(x,41) ) )
# define sigma0(x)  wv_xor( wv_ror(x, 1), wv_xor( wv_ror(x, 8), wv_shr(x, 7) ) )
# define sigma1(x)  wv_xor( wv_ror(x,19), wv_xor( wv_ror(x,61), wv_shr(x, 6) ) )
# define Ch(x,y,z)  wv_xor( wv_and(x,y), wv_andnot(x,z) )
# define Maj(x,y,z) wv_xor( wv_and(x,y), wv_xor( wv_and(x,z), wv_and(y,z) ) )
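    /* SHA_CORE applies one SHA-512 round to the working variables a..h
       of all lanes, where xi is the round's message schedule word and
       ki the round constant. */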
# define SHA_CORE(xi,ki)                                                         \
    T1 = wv_add( wv_add(xi,ki), wv_add( wv_add( h, Sigma1(e) ), Ch(e, f, g) ) ); \
    T2 = wv_add( Sigma0(a), Maj(a, b, c) );                                      \
    h = g;                                                                       \
    g = f;                                                                       \
    f = e;                                                                       \
    e = wv_add( d, T1 );                                                         \
    d = c;                                                                       \
    c = b;                                                                       \
    b = a;                                                                       \
    a = wv_add( T1, T2 )

    wv_t T1;
    wv_t T2;

    SHA_CORE( x0, wv_bcast( K[ 0] ) );
    SHA_CORE( x1, wv_bcast( K[ 1] ) );
    SHA_CORE( x2, wv_bcast( K[ 2] ) );
    SHA_CORE( x3, wv_bcast( K[ 3] ) );
    SHA_CORE( x4, wv_bcast( K[ 4] ) );
    SHA_CORE( x5, wv_bcast( K[ 5] ) );
    SHA_CORE( x6, wv_bcast( K[ 6] ) );
    SHA_CORE( x7, wv_bcast( K[ 7] ) );
    SHA_CORE( x8, wv_bcast( K[ 8] ) );
    SHA_CORE( x9, wv_bcast( K[ 9] ) );
    SHA_CORE( xa, wv_bcast( K[10] ) );
    SHA_CORE( xb, wv_bcast( K[11] ) );
    SHA_CORE( xc, wv_bcast( K[12] ) );
    SHA_CORE( xd, wv_bcast( K[13] ) );
    SHA_CORE( xe, wv_bcast( K[14] ) );
    SHA_CORE( xf, wv_bcast( K[15] ) );
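
    /* Rounds 16 to 79: extend the message schedule in place, 16 words
       at a time, per W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15])
       + W[t-16]. */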
    for( ulong i=16UL; i<80UL; i+=16UL ) {
      x0 = wv_add( wv_add( x0, sigma0(x1) ), wv_add( sigma1(xe), x9 ) ); SHA_CORE( x0, wv_bcast( K[i     ] ) );
      x1 = wv_add( wv_add( x1, sigma0(x2) ), wv_add( sigma1(xf), xa ) ); SHA_CORE( x1, wv_bcast( K[i+ 1UL] ) );
      x2 = wv_add( wv_add( x2, sigma0(x3) ), wv_add( sigma1(x0), xb ) ); SHA_CORE( x2, wv_bcast( K[i+ 2UL] ) );
      x3 = wv_add( wv_add( x3, sigma0(x4) ), wv_add( sigma1(x1), xc ) ); SHA_CORE( x3, wv_bcast( K[i+ 3UL] ) );
      x4 = wv_add( wv_add( x4, sigma0(x5) ), wv_add( sigma1(x2), xd ) ); SHA_CORE( x4, wv_bcast( K[i+ 4UL] ) );
      x5 = wv_add( wv_add( x5, sigma0(x6) ), wv_add( sigma1(x3), xe ) ); SHA_CORE( x5, wv_bcast( K[i+ 5UL] ) );
      x6 = wv_add( wv_add( x6, sigma0(x7) ), wv_add( sigma1(x4), xf ) ); SHA_CORE( x6, wv_bcast( K[i+ 6UL] ) );
      x7 = wv_add( wv_add( x7, sigma0(x8) ), wv_add( sigma1(x5), x0 ) ); SHA_CORE( x7, wv_bcast( K[i+ 7UL] ) );
      x8 = wv_add( wv_add( x8, sigma0(x9) ), wv_add( sigma1(x6), x1 ) ); SHA_CORE( x8, wv_bcast( K[i+ 8UL] ) );
      x9 = wv_add( wv_add( x9, sigma0(xa) ), wv_add( sigma1(x7), x2 ) ); SHA_CORE( x9, wv_bcast( K[i+ 9UL] ) );
      xa = wv_add( wv_add( xa, sigma0(xb) ), wv_add( sigma1(x8), x3 ) ); SHA_CORE( xa, wv_bcast( K[i+10UL] ) );
      xb = wv_add( wv_add( xb, sigma0(xc) ), wv_add( sigma1(x9), x4 ) ); SHA_CORE( xb, wv_bcast( K[i+11UL] ) );
      xc = wv_add( wv_add( xc, sigma0(xd) ), wv_add( sigma1(xa), x5 ) ); SHA_CORE( xc, wv_bcast( K[i+12UL] ) );
      xd = wv_add( wv_add( xd, sigma0(xe) ), wv_add( sigma1(xb), x6 ) ); SHA_CORE( xd, wv_bcast( K[i+13UL] ) );
      xe = wv_add( wv_add( xe, sigma0(xf) ), wv_add( sigma1(xc), x7 ) ); SHA_CORE( xe, wv_bcast( K[i+14UL] ) );
      xf = wv_add( wv_add( xf, sigma0(x0) ), wv_add( sigma1(xd), x8 ) ); SHA_CORE( xf, wv_bcast( K[i+15UL] ) );
    }

# undef SHA_CORE
# undef Sigma0
# undef Sigma1
# undef sigma0
# undef sigma1
# undef Ch
# undef Maj

    /* Apply the state updates to the active lanes */

    s0 = wv_add( s0, wv_notczero( active_lane, a ) );
    s1 = wv_add( s1, wv_notczero( active_lane, b ) );
    s2 = wv_add( s2, wv_notczero( active_lane, c ) );
    s3 = wv_add( s3, wv_notczero( active_lane, d ) );
    s4 = wv_add( s4, wv_notczero( active_lane, e ) );
    s5 = wv_add( s5, wv_notczero( active_lane, f ) );
    s6 = wv_add( s6, wv_notczero( active_lane, g ) );
    s7 = wv_add( s7, wv_notczero( active_lane, h ) );

    /* Advance to the next message segment blocks.  In pseudo code,
       the below is:

         W += 128; if( block_rem ) block_rem--;

       Since wc_to_wv_raw(false/true) is 0UL/~0UL, we can use wv_add /
       wc_to_wv_raw instead of wv_sub / wc_to_wv to save some ops.
       (Consider conditional increment / decrement operations?)

       Also, since we do not load anything at W(lane) above unless
       block_rem(lane) is non-zero, we can omit vector conditional
       operations for W(lane) below to save some additional ops. */

    W = wv_add( W, wv_128 );

    block_rem = wv_add( block_rem, wc_to_wv_raw( active_lane ) );
  }

  /* Store the results.  FIXME: Probably could optimize the transpose
     further by taking into account needed stores (and then maybe go
     direct into memory ... would need a family of such transposed
     stores). */

  wv_transpose_4x4( s0,s1,s2,s3, s0,s1,s2,s3 );
  wv_transpose_4x4( s4,s5,s6,s7, s4,s5,s6,s7 );

  ulong * const * batch_hash = (ulong * const *)_batch_hash;
  switch( batch_cnt ) { /* application dependent probabilities */
  case 4UL: wv_stu( batch_hash[3], wv_bswap( s3 ) ); wv_stu( batch_hash[3]+4, wv_bswap( s7 ) ); __attribute__((fallthrough));
  case 3UL: wv_stu( batch_hash[2], wv_bswap( s2 ) ); wv_stu( batch_hash[2]+4, wv_bswap( s6 ) ); __attribute__((fallthrough));
  case 2UL: wv_stu( batch_hash[1], wv_bswap( s1 ) ); wv_stu( batch_hash[1]+4, wv_bswap( s5 ) ); __attribute__((fallthrough));
  case 1UL: wv_stu( batch_hash[0], wv_bswap( s0 ) ); wv_stu( batch_hash[0]+4, wv_bswap( s4 ) ); __attribute__((fallthrough));
  default: break;
  }
}
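
/* Illustrative usage sketch (hypothetical variable names; callers would
   normally reach this function through the higher level fd_sha512
   batch API rather than invoking it directly):

     void const * data[4] = { msg0, msg1, msg2, msg3 };
     ulong        sz  [4] = { sz0,  sz1,  sz2,  sz3  };
     void *       hash[4] = { out0, out1, out2, out3 };
     fd_sha512_private_batch_avx( 4UL, data, sz, hash );

   where each out* points to a 64 byte buffer that receives the SHA-512
   hash of the corresponding message. */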