#define FD_SHA256_BATCH_IMPL 2

#include "fd_sha256.h"
#include "../../util/simd/fd_avx512.h"
#include "../../util/simd/fd_avx.h"

FD_STATIC_ASSERT( FD_SHA256_BATCH_MAX==16UL, compat );

void
fd_sha256_private_batch_avx( ulong          batch_cnt,
                             void const *   batch_data,
                             ulong const *  batch_sz,
                             void * const * batch_hash );

void
fd_sha256_private_batch_avx512( ulong          batch_cnt,
                                void const *   _batch_data,
                                ulong const *  batch_sz,
                                void * const * _batch_hash ) {

  /* If the batch is small enough, it is more efficient to use the
     narrower batched implementation.  The fallback threshold depends
     on whether that narrower implementation itself uses SHA-NI
     acceleration for very small batches. */

# if FD_HAS_SHANI
# define MIN_BATCH_CNT (5UL)
# else
# define MIN_BATCH_CNT (2UL)
# endif

  if( FD_UNLIKELY( batch_cnt<MIN_BATCH_CNT ) ) {
    fd_sha256_private_batch_avx( batch_cnt, _batch_data, batch_sz, _batch_hash );
    return;
  }

# undef MIN_BATCH_CNT

  /* SHA appends 9 bytes of additional data to the end of each message
     (a message terminator byte and the big endian ulong with the
     message size in bits) plus enough zero padding to make the message
     an integer number of blocks long.  We compute the 1 or 2 tail
     blocks of each message here.  We then process complete blocks of
     the original messages in place, switching to these tail blocks in
     the same pass toward the end.  TODO: This code could probably be
     SIMD optimized slightly more (this is where the most performance
     unfriendly parts of SHA's design live, so it is inherently a bit
     gross).  The main optimization would probably be to let the tail
     read use a faster memcpy and then maybe vectorize the bswap. */

  ulong const * batch_data = (ulong const *)_batch_data;

  ulong batch_tail_data[ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));
  ulong batch_tail_rem [ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));

  uchar scratch[ FD_SHA256_BATCH_MAX*2UL*FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
  do {
    ulong scratch_free = (ulong)scratch;

    wwv_t zero = wwv_zero();

    for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {

      /* Allocate the tail blocks for this message */

      ulong data = batch_data[ batch_idx ];
      ulong sz   = batch_sz  [ batch_idx ];

      ulong tail_data     = scratch_free;
      ulong tail_data_sz  = sz & (FD_SHA256_PRIVATE_BUF_MAX-1UL);
      ulong tail_data_off = fd_ulong_align_dn( sz, FD_SHA256_PRIVATE_BUF_MAX );
      ulong tail_sz       = fd_ulong_align_up( tail_data_sz+9UL, FD_SHA256_PRIVATE_BUF_MAX );
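
      /* Worked example (illustrative only; here FD_SHA256_PRIVATE_BUF_MAX
         is the 64 byte SHA-256 block size): for sz==100, tail_data_sz==36,
         tail_data_off==64 and tail_sz==64, i.e. one full block is hashed
         in place and one tail block holds the 36 straggler bytes plus
         padding.  For sz==120, tail_data_sz==56 and tail_sz==128 (56+9
         bytes no longer fit in one block), so that message needs two tail
         blocks. */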

      batch_tail_data[ batch_idx ] = tail_data;
      batch_tail_rem [ batch_idx ] = tail_sz >> FD_SHA256_PRIVATE_LG_BUF_MAX;

      scratch_free += tail_sz;

      /* Populate the tail blocks.  We first clear the blocks (note that
         it is okay to clobber bytes 64:127 if tail_sz is only 64, saving
         a nasty branch).  Then we copy any straggler data bytes into the
         tail, terminate the message, and finally record the size of the
         message in bits at the end as a big endian ulong. */
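
      /* The finished tail region for a message thus looks like (sketch,
         sizes in bytes):

           [ straggler data (tail_data_sz) | 0x80 | zero pad | message bit count (8, big endian) ]

         rounded up to tail_sz (64 or 128) bytes total. */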

      wwv_st( (ulong *) tail_data,     zero );
      wwv_st( (ulong *)(tail_data+64), zero );

# if 1
      /* Quick experiments found that, once again, straight memcpy is
         much slower than fd_memcpy, which in turn is slightly slower
         than a site-optimized handrolled memcpy (though fd_memcpy would
         have a smaller L1I cache footprint).  They also found that doing
         the below in a branchless way is slightly worse and that an ILP
         optimized version of the conditional calculation is about the
         same.  They further found that vectorizing the overall loop
         and/or Duffing the vectorized loop did not provide noticeable
         performance improvements under various styles of memcpy. */
      ulong src = data + tail_data_off;
      ulong dst = tail_data;
      ulong rem = tail_data_sz;
      while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
      while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src ); dst += 8UL; src += 8UL; rem -= 8UL; }
      if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src ); dst += 4UL; src += 4UL; rem -= 4UL; }
      if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src ); dst += 2UL; src += 2UL; rem -= 2UL; }
      if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src ); dst++; }
      *(uchar *)dst = (uchar)0x80;
# else
      fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
      *((uchar *)(tail_data+tail_data_sz)) = (uchar)0x80;
# endif

      *((ulong *)(tail_data+tail_sz-8UL)) = fd_ulong_bswap( sz<<3 );
    }
  } while(0);

  wwu_t s0 = wwu_bcast( 0x6a09e667U );
  wwu_t s1 = wwu_bcast( 0xbb67ae85U );
  wwu_t s2 = wwu_bcast( 0x3c6ef372U );
  wwu_t s3 = wwu_bcast( 0xa54ff53aU );
  wwu_t s4 = wwu_bcast( 0x510e527fU );
  wwu_t s5 = wwu_bcast( 0x9b05688cU );
  wwu_t s6 = wwu_bcast( 0x1f83d9abU );
  wwu_t s7 = wwu_bcast( 0x5be0cd19U );

  wwv_t zero       = wwv_zero();
  wwv_t one        = wwv_one();
  wwv_t wwv_64     = wwv_bcast( FD_SHA256_PRIVATE_BUF_MAX );
  wwv_t W_sentinel = wwv_bcast( (ulong)scratch );

  wwv_t tail_lo     = wwv_ld( batch_tail_data ); wwv_t tail_hi     = wwv_ld( batch_tail_data+8 );
  wwv_t tail_rem_lo = wwv_ld( batch_tail_rem  ); wwv_t tail_rem_hi = wwv_ld( batch_tail_rem +8 );
  wwv_t W_lo        = wwv_ld( batch_data      ); wwv_t W_hi        = wwv_ld( batch_data +8     );

  wwv_t block_rem_lo = wwv_if( ((1<<batch_cnt)-1) & 0xff,
                               wwv_add( wwv_shr( wwv_ld( batch_sz   ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_lo ), zero );
  wwv_t block_rem_hi = wwv_if( ((1<<batch_cnt)-1) >> 8,
                               wwv_add( wwv_shr( wwv_ld( batch_sz+8 ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_hi ), zero );
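
  /* Note for the reader: block_rem(lane) is thus the total number of
     64 byte blocks left to process for that lane, i.e. the full blocks
     hashed in place (sz>>FD_SHA256_PRIVATE_LG_BUF_MAX) plus the 1 or 2
     tail blocks prepared above, with lanes at or beyond batch_cnt
     forced to 0 so they start (and stay) inactive. */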

  for(;;) {
    int active_lane_lo = wwv_ne( block_rem_lo, zero );
    int active_lane_hi = wwv_ne( block_rem_hi, zero );
    if( FD_UNLIKELY( !(active_lane_lo | active_lane_hi) ) ) break;

    /* Switch lanes that have hit the end of their in-place bulk
       processing to their out-of-place scratch tail regions as
       necessary. */
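
    /* (Each lane is redirected exactly once: when its remaining block
       count first equals its tail block count, all of its original data
       blocks have been consumed, so subsequent loads come from the tail
       region prepared above.) */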

    W_lo = wwv_if( wwv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
    W_hi = wwv_if( wwv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );

    /* At this point, we have at least 1 block in this message segment
       pass that has not been processed.  Load the next 64 bytes of
       each unprocessed block.  Inactive lanes (e.g. message segments
       in this pass for which we've already processed all the blocks)
       will load garbage from a sentinel location (and the result of
       the state computations for the inactive lane will be ignored). */
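
    /* (The sentinel is the base of the scratch region above, so these
       dummy loads for inactive lanes always read memory this function
       owns.) */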

    ulong _W0; ulong _W1; ulong _W2; ulong _W3; ulong _W4; ulong _W5; ulong _W6; ulong _W7;
    ulong _W8; ulong _W9; ulong _Wa; ulong _Wb; ulong _Wc; ulong _Wd; ulong _We; ulong _Wf;
    wwv_unpack( wwv_if( active_lane_lo, W_lo, W_sentinel ), _W0, _W1, _W2, _W3, _W4, _W5, _W6, _W7 );
    wwv_unpack( wwv_if( active_lane_hi, W_hi, W_sentinel ), _W8, _W9, _Wa, _Wb, _Wc, _Wd, _We, _Wf );
    uchar const * W0 = (uchar const *)_W0; uchar const * W1 = (uchar const *)_W1;
    uchar const * W2 = (uchar const *)_W2; uchar const * W3 = (uchar const *)_W3;
    uchar const * W4 = (uchar const *)_W4; uchar const * W5 = (uchar const *)_W5;
    uchar const * W6 = (uchar const *)_W6; uchar const * W7 = (uchar const *)_W7;
    uchar const * W8 = (uchar const *)_W8; uchar const * W9 = (uchar const *)_W9;
    uchar const * Wa = (uchar const *)_Wa; uchar const * Wb = (uchar const *)_Wb;
    uchar const * Wc = (uchar const *)_Wc; uchar const * Wd = (uchar const *)_Wd;
    uchar const * We = (uchar const *)_We; uchar const * Wf = (uchar const *)_Wf;

    wwu_t x0; wwu_t x1; wwu_t x2; wwu_t x3; wwu_t x4; wwu_t x5; wwu_t x6; wwu_t x7;
    wwu_t x8; wwu_t x9; wwu_t xa; wwu_t xb; wwu_t xc; wwu_t xd; wwu_t xe; wwu_t xf;
    wwu_transpose_16x16( wwu_bswap( wwu_ldu( W0 ) ), wwu_bswap( wwu_ldu( W1 ) ),
                         wwu_bswap( wwu_ldu( W2 ) ), wwu_bswap( wwu_ldu( W3 ) ),
                         wwu_bswap( wwu_ldu( W4 ) ), wwu_bswap( wwu_ldu( W5 ) ),
                         wwu_bswap( wwu_ldu( W6 ) ), wwu_bswap( wwu_ldu( W7 ) ),
                         wwu_bswap( wwu_ldu( W8 ) ), wwu_bswap( wwu_ldu( W9 ) ),
                         wwu_bswap( wwu_ldu( Wa ) ), wwu_bswap( wwu_ldu( Wb ) ),
                         wwu_bswap( wwu_ldu( Wc ) ), wwu_bswap( wwu_ldu( Wd ) ),
                         wwu_bswap( wwu_ldu( We ) ), wwu_bswap( wwu_ldu( Wf ) ),
                         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc, xd, xe, xf );

    /* Compute the SHA-256 state updates */

    wwu_t a = s0; wwu_t b = s1; wwu_t c = s2; wwu_t d = s3; wwu_t e = s4; wwu_t f = s5; wwu_t g = s6; wwu_t h = s7;

    static uint const K[64] = { /* FIXME: Reuse with other functions */
      0x428a2f98U, 0x71374491U, 0xb5c0fbcfU, 0xe9b5dba5U, 0x3956c25bU, 0x59f111f1U, 0x923f82a4U, 0xab1c5ed5U,
      0xd807aa98U, 0x12835b01U, 0x243185beU, 0x550c7dc3U, 0x72be5d74U, 0x80deb1feU, 0x9bdc06a7U, 0xc19bf174U,
      0xe49b69c1U, 0xefbe4786U, 0x0fc19dc6U, 0x240ca1ccU, 0x2de92c6fU, 0x4a7484aaU, 0x5cb0a9dcU, 0x76f988daU,
      0x983e5152U, 0xa831c66dU, 0xb00327c8U, 0xbf597fc7U, 0xc6e00bf3U, 0xd5a79147U, 0x06ca6351U, 0x14292967U,
      0x27b70a85U, 0x2e1b2138U, 0x4d2c6dfcU, 0x53380d13U, 0x650a7354U, 0x766a0abbU, 0x81c2c92eU, 0x92722c85U,
      0xa2bfe8a1U, 0xa81a664bU, 0xc24b8b70U, 0xc76c51a3U, 0xd192e819U, 0xd6990624U, 0xf40e3585U, 0x106aa070U,
      0x19a4c116U, 0x1e376c08U, 0x2748774cU, 0x34b0bcb5U, 0x391c0cb3U, 0x4ed8aa4aU, 0x5b9cca4fU, 0x682e6ff3U,
      0x748f82eeU, 0x78a5636fU, 0x84c87814U, 0x8cc70208U, 0x90befffaU, 0xa4506cebU, 0xbef9a3f7U, 0xc67178f2U,
    };

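    /* The helpers below are the standard FIPS 180-4 SHA-256 round
       functions with the usual rotate-rights written as rotate-lefts
       (rotr(x,n) == rotl(x,32-n)): e.g. Sigma0(x) = rotr(x,2) ^
       rotr(x,13) ^ rotr(x,22) appears as rol by 30/19/10 below. */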
# define Sigma0(x)  wwu_xor( wwu_rol(x,30), wwu_xor( wwu_rol(x,19), wwu_rol(x,10) ) )
# define Sigma1(x)  wwu_xor( wwu_rol(x,26), wwu_xor( wwu_rol(x,21), wwu_rol(x, 7) ) )
# define sigma0(x)  wwu_xor( wwu_rol(x,25), wwu_xor( wwu_rol(x,14), wwu_shr(x, 3) ) )
# define sigma1(x)  wwu_xor( wwu_rol(x,15), wwu_xor( wwu_rol(x,13), wwu_shr(x,10) ) )
# define Ch(x,y,z)  wwu_xor( wwu_and(x,y), wwu_andnot(x,z) )
# define Maj(x,y,z) wwu_xor( wwu_and(x,y), wwu_xor( wwu_and(x,z), wwu_and(y,z) ) )
# define SHA_CORE(xi,ki)                                                              \
    T1 = wwu_add( wwu_add(xi,ki), wwu_add( wwu_add( h, Sigma1(e) ), Ch(e, f, g) ) );  \
    T2 = wwu_add( Sigma0(a), Maj(a, b, c) );                                          \
    h = g;                                                                            \
    g = f;                                                                            \
    f = e;                                                                            \
    e = wwu_add( d, T1 );                                                             \
    d = c;                                                                            \
    c = b;                                                                            \
    b = a;                                                                            \
    a = wwu_add( T1, T2 )

    wwu_t T1;
    wwu_t T2;

    SHA_CORE( x0, wwu_bcast( K[ 0] ) );
    SHA_CORE( x1, wwu_bcast( K[ 1] ) );
    SHA_CORE( x2, wwu_bcast( K[ 2] ) );
    SHA_CORE( x3, wwu_bcast( K[ 3] ) );
    SHA_CORE( x4, wwu_bcast( K[ 4] ) );
    SHA_CORE( x5, wwu_bcast( K[ 5] ) );
    SHA_CORE( x6, wwu_bcast( K[ 6] ) );
    SHA_CORE( x7, wwu_bcast( K[ 7] ) );
    SHA_CORE( x8, wwu_bcast( K[ 8] ) );
    SHA_CORE( x9, wwu_bcast( K[ 9] ) );
    SHA_CORE( xa, wwu_bcast( K[10] ) );
    SHA_CORE( xb, wwu_bcast( K[11] ) );
    SHA_CORE( xc, wwu_bcast( K[12] ) );
    SHA_CORE( xd, wwu_bcast( K[13] ) );
    SHA_CORE( xe, wwu_bcast( K[14] ) );
    SHA_CORE( xf, wwu_bcast( K[15] ) );
    for( ulong i=16UL; i<64UL; i+=16UL ) {
      x0 = wwu_add( wwu_add( x0, sigma0(x1) ), wwu_add( sigma1(xe), x9 ) ); SHA_CORE( x0, wwu_bcast( K[i     ] ) );
      x1 = wwu_add( wwu_add( x1, sigma0(x2) ), wwu_add( sigma1(xf), xa ) ); SHA_CORE( x1, wwu_bcast( K[i+ 1UL] ) );
      x2 = wwu_add( wwu_add( x2, sigma0(x3) ), wwu_add( sigma1(x0), xb ) ); SHA_CORE( x2, wwu_bcast( K[i+ 2UL] ) );
      x3 = wwu_add( wwu_add( x3, sigma0(x4) ), wwu_add( sigma1(x1), xc ) ); SHA_CORE( x3, wwu_bcast( K[i+ 3UL] ) );
      x4 = wwu_add( wwu_add( x4, sigma0(x5) ), wwu_add( sigma1(x2), xd ) ); SHA_CORE( x4, wwu_bcast( K[i+ 4UL] ) );
      x5 = wwu_add( wwu_add( x5, sigma0(x6) ), wwu_add( sigma1(x3), xe ) ); SHA_CORE( x5, wwu_bcast( K[i+ 5UL] ) );
      x6 = wwu_add( wwu_add( x6, sigma0(x7) ), wwu_add( sigma1(x4), xf ) ); SHA_CORE( x6, wwu_bcast( K[i+ 6UL] ) );
      x7 = wwu_add( wwu_add( x7, sigma0(x8) ), wwu_add( sigma1(x5), x0 ) ); SHA_CORE( x7, wwu_bcast( K[i+ 7UL] ) );
      x8 = wwu_add( wwu_add( x8, sigma0(x9) ), wwu_add( sigma1(x6), x1 ) ); SHA_CORE( x8, wwu_bcast( K[i+ 8UL] ) );
      x9 = wwu_add( wwu_add( x9, sigma0(xa) ), wwu_add( sigma1(x7), x2 ) ); SHA_CORE( x9, wwu_bcast( K[i+ 9UL] ) );
      xa = wwu_add( wwu_add( xa, sigma0(xb) ), wwu_add( sigma1(x8), x3 ) ); SHA_CORE( xa, wwu_bcast( K[i+10UL] ) );
      xb = wwu_add( wwu_add( xb, sigma0(xc) ), wwu_add( sigma1(x9), x4 ) ); SHA_CORE( xb, wwu_bcast( K[i+11UL] ) );
      xc = wwu_add( wwu_add( xc, sigma0(xd) ), wwu_add( sigma1(xa), x5 ) ); SHA_CORE( xc, wwu_bcast( K[i+12UL] ) );
      xd = wwu_add( wwu_add( xd, sigma0(xe) ), wwu_add( sigma1(xb), x6 ) ); SHA_CORE( xd, wwu_bcast( K[i+13UL] ) );
      xe = wwu_add( wwu_add( xe, sigma0(xf) ), wwu_add( sigma1(xc), x7 ) ); SHA_CORE( xe, wwu_bcast( K[i+14UL] ) );
      xf = wwu_add( wwu_add( xf, sigma0(x0) ), wwu_add( sigma1(xd), x8 ) ); SHA_CORE( xf, wwu_bcast( K[i+15UL] ) );
    }

# undef SHA_CORE
# undef Sigma0
# undef Sigma1
# undef sigma0
# undef sigma1
# undef Ch
# undef Maj

    /* Apply the state updates to the active lanes */

    int active_lane = active_lane_lo | (active_lane_hi<<8);

    s0 = wwu_add_if( active_lane, s0, a, s0 );
    s1 = wwu_add_if( active_lane, s1, b, s1 );
    s2 = wwu_add_if( active_lane, s2, c, s2 );
    s3 = wwu_add_if( active_lane, s3, d, s3 );
    s4 = wwu_add_if( active_lane, s4, e, s4 );
    s5 = wwu_add_if( active_lane, s5, f, s5 );
    s6 = wwu_add_if( active_lane, s6, g, s6 );
    s7 = wwu_add_if( active_lane, s7, h, s7 );

    /* Advance to the next message segment blocks.  In pseudo code,
       the below is:

         W += 64; if( block_rem ) block_rem--;

       Since we do not load anything at W(lane) above unless
       block_rem(lane) is non-zero, we can omit vector conditional
       operations for W(lane) below. */

    W_lo = wwv_add( W_lo, wwv_64 );
    W_hi = wwv_add( W_hi, wwv_64 );

    block_rem_lo = wwv_sub_if( active_lane_lo, block_rem_lo, one, block_rem_lo );
    block_rem_hi = wwv_sub_if( active_lane_hi, block_rem_hi, one, block_rem_hi );
  }

  /* Store the results.  FIXME: Probably could optimize the transpose
     further by taking into account needed stores (and then maybe go
     direct into memory ... would need a family of such transposed
     stores). */

  wwu_transpose_2x8x8( wwu_bswap(s0), wwu_bswap(s1), wwu_bswap(s2), wwu_bswap(s3),
                       wwu_bswap(s4), wwu_bswap(s5), wwu_bswap(s6), wwu_bswap(s7), s0,s1,s2,s3,s4,s5,s6,s7 );

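  /* After the 2x8x8 transpose, hashes for batch lanes 0:7 sit in the
     low 256 bits of s0:s7 and lanes 8:15 in the high 256 bits, which is
     why the stores below extract half 0 for outputs 0:7 and half 1 for
     outputs 8:15. */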
  uint * const * batch_hash = (uint * const *)_batch_hash;
  switch( batch_cnt ) { /* application dependent probabilities */
  case 16UL: wu_stu( batch_hash[15], _mm512_extracti32x8_epi32( s7, 1 ) ); __attribute__((fallthrough));
  case 15UL: wu_stu( batch_hash[14], _mm512_extracti32x8_epi32( s6, 1 ) ); __attribute__((fallthrough));
  case 14UL: wu_stu( batch_hash[13], _mm512_extracti32x8_epi32( s5, 1 ) ); __attribute__((fallthrough));
  case 13UL: wu_stu( batch_hash[12], _mm512_extracti32x8_epi32( s4, 1 ) ); __attribute__((fallthrough));
  case 12UL: wu_stu( batch_hash[11], _mm512_extracti32x8_epi32( s3, 1 ) ); __attribute__((fallthrough));
  case 11UL: wu_stu( batch_hash[10], _mm512_extracti32x8_epi32( s2, 1 ) ); __attribute__((fallthrough));
  case 10UL: wu_stu( batch_hash[ 9], _mm512_extracti32x8_epi32( s1, 1 ) ); __attribute__((fallthrough));
  case  9UL: wu_stu( batch_hash[ 8], _mm512_extracti32x8_epi32( s0, 1 ) ); __attribute__((fallthrough));
  case  8UL: wu_stu( batch_hash[ 7], _mm512_extracti32x8_epi32( s7, 0 ) ); __attribute__((fallthrough));
  case  7UL: wu_stu( batch_hash[ 6], _mm512_extracti32x8_epi32( s6, 0 ) ); __attribute__((fallthrough));
  case  6UL: wu_stu( batch_hash[ 5], _mm512_extracti32x8_epi32( s5, 0 ) ); __attribute__((fallthrough));
  case  5UL: wu_stu( batch_hash[ 4], _mm512_extracti32x8_epi32( s4, 0 ) ); __attribute__((fallthrough));
  case  4UL: wu_stu( batch_hash[ 3], _mm512_extracti32x8_epi32( s3, 0 ) ); __attribute__((fallthrough));
  case  3UL: wu_stu( batch_hash[ 2], _mm512_extracti32x8_epi32( s2, 0 ) ); __attribute__((fallthrough));
  case  2UL: wu_stu( batch_hash[ 1], _mm512_extracti32x8_epi32( s1, 0 ) ); __attribute__((fallthrough));
  case  1UL: wu_stu( batch_hash[ 0], _mm512_extracti32x8_epi32( s0, 0 ) ); __attribute__((fallthrough));
  default: break;
  }
}