#define FD_SHA256_BATCH_IMPL 2

#include "fd_sha256.h"
#include "fd_sha256_constants.h"
#include "../../util/simd/fd_avx512.h"
#include "../../util/simd/fd_avx.h"

FD_STATIC_ASSERT( FD_SHA256_BATCH_MAX==16UL, compat );

void
fd_sha256_private_batch_avx( ulong          batch_cnt,
                             void const *   batch_data,
                             ulong const *  batch_sz,
                             void * const * batch_hash );

void
fd_sha256_private_batch_avx512( ulong          batch_cnt,
                                void const *   _batch_data,
                                ulong const *  batch_sz,
                                void * const * _batch_hash ) {

  /* If the batch is small enough, it is more efficient to use the
     narrower batched implementation.  The fallback threshold depends
     on whether that narrower implementation itself uses SHA-NI
     acceleration for really small batches. */

# if FD_HAS_SHANI
# define MIN_BATCH_CNT (5UL)
# else
# define MIN_BATCH_CNT (2UL)
# endif
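
  /* For example (reading MIN_BATCH_CNT off the above): on a SHA-NI
     capable target, a batch of 4 or fewer messages is handed to the
     AVX / SHA-NI path while a batch of 5 or more is hashed by the
     AVX-512 code below; without SHA-NI, only single message batches
     fall back. */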

  if( FD_UNLIKELY( batch_cnt<MIN_BATCH_CNT ) ) {
    fd_sha256_private_batch_avx( batch_cnt, _batch_data, batch_sz, _batch_hash );
    return;
  }

# undef MIN_BATCH_CNT

  /* SHA appends 9 bytes of additional data to the end of each message
     (a message terminator byte and the big endian ulong giving the
     message size in bits) plus enough zero padding to make the padded
     message an integer number of blocks long.  We compute the 1 or 2
     tail blocks of each message here.  We then process complete blocks
     of the original messages in place, switching to processing these
     tail blocks in the same pass toward the end.  TODO: This code could
     probably be SIMD optimized slightly more (this is where all the
     performance-suboptimal parts of the SHA design live so it is just
     inherently gross).  The main optimization would probably be to
     allow tail reading to use a faster memcpy and then maybe some
     vectorization of the bswap. */

  ulong const * batch_data = (ulong const *)_batch_data;

  ulong batch_tail_data[ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));
  ulong batch_tail_rem [ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));

  uchar scratch[ FD_SHA256_BATCH_MAX*2UL*FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
  do {
    ulong scratch_free = (ulong)scratch;

    wwv_t zero = wwv_zero();

    for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {

      /* Allocate the tail blocks for this message */

      ulong data = batch_data[ batch_idx ];
      ulong sz   = batch_sz  [ batch_idx ];

      ulong tail_data     = scratch_free;
      ulong tail_data_sz  = sz & (FD_SHA256_PRIVATE_BUF_MAX-1UL);
      ulong tail_data_off = fd_ulong_align_dn( sz, FD_SHA256_PRIVATE_BUF_MAX );
      ulong tail_sz       = fd_ulong_align_up( tail_data_sz+9UL, FD_SHA256_PRIVATE_BUF_MAX );

      batch_tail_data[ batch_idx ] = tail_data;
      batch_tail_rem [ batch_idx ] = tail_sz >> FD_SHA256_PRIVATE_LG_BUF_MAX;
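
      /* Worked example (with FD_SHA256_PRIVATE_BUF_MAX the 64 byte
         SHA-256 block size): a 200 byte message has tail_data_sz =
         200 & 63 = 8 straggler bytes starting at tail_data_off = 192
         and tail_sz = align_up( 8+9, 64 ) = 64, i.e. 1 tail block.  A
         120 byte message has tail_data_sz = 56; the 9 bytes of
         terminator plus length do not fit in the remaining 8 bytes, so
         tail_sz = align_up( 56+9, 64 ) = 128, i.e. 2 tail blocks. */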

      scratch_free += tail_sz;

      /* Populate the tail blocks.  We first clear the blocks (note that
         it is okay to clobber bytes 64:127 if tail_sz is only 64,
         saving a nasty branch).  Then we copy any straggler data bytes
         into the tail, terminate the message, and finally record the
         size of the message in bits at the end as a big endian ulong. */

      wwv_st( (ulong *) tail_data,     zero );
      wwv_st( (ulong *)(tail_data+64), zero );

# if 1
      /* Quick experiments found that, once again, straight memcpy is
         much slower than fd_memcpy, which in turn is slightly slower
         than a site-optimized handrolled memcpy (fd_memcpy would have
         less L1I cache footprint though).  They also found that doing
         the below in a branchless way is slightly worse and an ILP
         optimized version of the conditional calculation is about the
         same.  They also found that vectorizing the overall loop and/or
         Duffing the vectorized loop did not provide noticeable
         performance improvements under various styles of memcpy. */
      ulong src = data + tail_data_off;
      ulong dst = tail_data;
      ulong rem = tail_data_sz;
      while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
      while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src ); dst += 8UL; src += 8UL; rem -= 8UL; }
      if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src ); dst += 4UL; src += 4UL; rem -= 4UL; }
      if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src ); dst += 2UL; src += 2UL; rem -= 2UL; }
      if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src ); dst++; }
      *(uchar *)dst = (uchar)0x80;
# else
      fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
      *((uchar *)(tail_data+tail_data_sz)) = (uchar)0x80;
# endif

      *((ulong *)(tail_data+tail_sz-8UL)) = fd_ulong_bswap( sz<<3 );
    }
  } while(0);

  wwu_t s0 = wwu_bcast( FD_SHA256_INITIAL_A );
  wwu_t s1 = wwu_bcast( FD_SHA256_INITIAL_B );
  wwu_t s2 = wwu_bcast( FD_SHA256_INITIAL_C );
  wwu_t s3 = wwu_bcast( FD_SHA256_INITIAL_D );
  wwu_t s4 = wwu_bcast( FD_SHA256_INITIAL_E );
  wwu_t s5 = wwu_bcast( FD_SHA256_INITIAL_F );
  wwu_t s6 = wwu_bcast( FD_SHA256_INITIAL_G );
  wwu_t s7 = wwu_bcast( FD_SHA256_INITIAL_H );

  wwv_t zero       = wwv_zero();
  wwv_t one        = wwv_one();
  wwv_t wwv_64     = wwv_bcast( FD_SHA256_PRIVATE_BUF_MAX );
  wwv_t W_sentinel = wwv_bcast( (ulong)scratch );

  wwv_t tail_lo     = wwv_ld( batch_tail_data ); wwv_t tail_hi     = wwv_ld( batch_tail_data+8 );
  wwv_t tail_rem_lo = wwv_ld( batch_tail_rem  ); wwv_t tail_rem_hi = wwv_ld( batch_tail_rem +8 );
  wwv_t W_lo        = wwv_ld( batch_data      ); wwv_t W_hi        = wwv_ld( batch_data      +8 );

  wwv_t block_rem_lo = wwv_if( ((1<<batch_cnt)-1) & 0xff,
                               wwv_add( wwv_shr( wwv_ld( batch_sz   ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_lo ), zero );
  wwv_t block_rem_hi = wwv_if( ((1<<batch_cnt)-1) >> 8,
                               wwv_add( wwv_shr( wwv_ld( batch_sz+8 ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_hi ), zero );
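
  /* Note: block_rem(lane) is the total number of 64 byte blocks that
     lane still has to process: floor( sz/64 ) full blocks of the
     original message plus the 1 or 2 tail blocks built above.  The
     ((1<<batch_cnt)-1) masks zero block_rem for lanes at or beyond
     batch_cnt (the low mask covers lanes 0-7, the high mask lanes
     8-15) so those lanes are immediately inactive in the loop below. */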

  for(;;) {
    int active_lane_lo = wwv_ne( block_rem_lo, zero );
    int active_lane_hi = wwv_ne( block_rem_hi, zero );
    if( FD_UNLIKELY( !(active_lane_lo | active_lane_hi) ) ) break;

    /* Switch lanes that have hit the end of their in-place bulk
       processing to their out-of-place scratch tail regions as
       necessary. */

    W_lo = wwv_if( wwv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
    W_hi = wwv_if( wwv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );

    /* At this point, we have at least 1 block in this message segment
       pass that has not been processed.  Load the next 64 bytes of
       each unprocessed block.  Inactive lanes (e.g. message segments
       in this pass for which we've already processed all the blocks)
       will load garbage from a sentinel location (and the result of
       the state computations for the inactive lane will be ignored). */
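
    /* (The sentinel is the scratch region itself: W_sentinel was set to
       (ulong)scratch above, so the unconditional 64 byte loads below
       always read in-bounds memory for inactive lanes, whose W pointers
       keep advancing past the end of their tail blocks.) */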

    ulong _W0; ulong _W1; ulong _W2; ulong _W3; ulong _W4; ulong _W5; ulong _W6; ulong _W7;
    ulong _W8; ulong _W9; ulong _Wa; ulong _Wb; ulong _Wc; ulong _Wd; ulong _We; ulong _Wf;
    wwv_unpack( wwv_if( active_lane_lo, W_lo, W_sentinel ), _W0, _W1, _W2, _W3, _W4, _W5, _W6, _W7 );
    wwv_unpack( wwv_if( active_lane_hi, W_hi, W_sentinel ), _W8, _W9, _Wa, _Wb, _Wc, _Wd, _We, _Wf );
    uchar const * W0 = (uchar const *)_W0; uchar const * W1 = (uchar const *)_W1;
    uchar const * W2 = (uchar const *)_W2; uchar const * W3 = (uchar const *)_W3;
    uchar const * W4 = (uchar const *)_W4; uchar const * W5 = (uchar const *)_W5;
    uchar const * W6 = (uchar const *)_W6; uchar const * W7 = (uchar const *)_W7;
    uchar const * W8 = (uchar const *)_W8; uchar const * W9 = (uchar const *)_W9;
    uchar const * Wa = (uchar const *)_Wa; uchar const * Wb = (uchar const *)_Wb;
    uchar const * Wc = (uchar const *)_Wc; uchar const * Wd = (uchar const *)_Wd;
    uchar const * We = (uchar const *)_We; uchar const * Wf = (uchar const *)_Wf;

    wwu_t x0; wwu_t x1; wwu_t x2; wwu_t x3; wwu_t x4; wwu_t x5; wwu_t x6; wwu_t x7;
    wwu_t x8; wwu_t x9; wwu_t xa; wwu_t xb; wwu_t xc; wwu_t xd; wwu_t xe; wwu_t xf;
    wwu_transpose_16x16( wwu_bswap( wwu_ldu( W0 ) ), wwu_bswap( wwu_ldu( W1 ) ),
                         wwu_bswap( wwu_ldu( W2 ) ), wwu_bswap( wwu_ldu( W3 ) ),
                         wwu_bswap( wwu_ldu( W4 ) ), wwu_bswap( wwu_ldu( W5 ) ),
                         wwu_bswap( wwu_ldu( W6 ) ), wwu_bswap( wwu_ldu( W7 ) ),
                         wwu_bswap( wwu_ldu( W8 ) ), wwu_bswap( wwu_ldu( W9 ) ),
                         wwu_bswap( wwu_ldu( Wa ) ), wwu_bswap( wwu_ldu( Wb ) ),
                         wwu_bswap( wwu_ldu( Wc ) ), wwu_bswap( wwu_ldu( Wd ) ),
                         wwu_bswap( wwu_ldu( We ) ), wwu_bswap( wwu_ldu( Wf ) ),
                         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc, xd, xe, xf );
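
    /* After the transpose, row i of the load (the 16 big endian words
       of lane i's current block) has become column i.  That is, xj now
       holds message schedule word j for all 16 lanes at once, which is
       the layout the vectorized rounds below consume. */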

    /* Compute the SHA-256 state updates */

    wwu_t a = s0; wwu_t b = s1; wwu_t c = s2; wwu_t d = s3; wwu_t e = s4; wwu_t f = s5; wwu_t g = s6; wwu_t h = s7;

# define Sigma0(x)  wwu_xor( wwu_rol(x,30), wwu_xor( wwu_rol(x,19), wwu_rol(x,10) ) )
# define Sigma1(x)  wwu_xor( wwu_rol(x,26), wwu_xor( wwu_rol(x,21), wwu_rol(x, 7) ) )
# define sigma0(x)  wwu_xor( wwu_rol(x,25), wwu_xor( wwu_rol(x,14), wwu_shr(x, 3) ) )
# define sigma1(x)  wwu_xor( wwu_rol(x,15), wwu_xor( wwu_rol(x,13), wwu_shr(x,10) ) )
# define Ch(x,y,z)  wwu_xor( wwu_and(x,y), wwu_andnot(x,z) )
# define Maj(x,y,z) wwu_xor( wwu_and(x,y), wwu_xor( wwu_and(x,z), wwu_and(y,z) ) )
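
    /* Note: wwu_rol is a rotate left, so these are the usual FIPS 180-4
       functions written with left rotates (rol by n == ror by 32-n on
       32-bit words): Sigma0 = ROTR2^ROTR13^ROTR22, Sigma1 =
       ROTR6^ROTR11^ROTR25, sigma0 = ROTR7^ROTR18^SHR3 and sigma1 =
       ROTR17^ROTR19^SHR10. */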
# define SHA_CORE(xi,ki)                                                              \
    T1 = wwu_add( wwu_add(xi,ki), wwu_add( wwu_add( h, Sigma1(e) ), Ch(e, f, g) ) );  \
    T2 = wwu_add( Sigma0(a), Maj(a, b, c) );                                          \
    h = g;                                                                            \
    g = f;                                                                            \
    f = e;                                                                            \
    e = wwu_add( d, T1 );                                                             \
    d = c;                                                                            \
    c = b;                                                                            \
    b = a;                                                                            \
    a = wwu_add( T1, T2 )

    wwu_t T1;
    wwu_t T2;

    SHA_CORE( x0, wwu_bcast( fd_sha256_K[ 0] ) );
    SHA_CORE( x1, wwu_bcast( fd_sha256_K[ 1] ) );
    SHA_CORE( x2, wwu_bcast( fd_sha256_K[ 2] ) );
    SHA_CORE( x3, wwu_bcast( fd_sha256_K[ 3] ) );
    SHA_CORE( x4, wwu_bcast( fd_sha256_K[ 4] ) );
    SHA_CORE( x5, wwu_bcast( fd_sha256_K[ 5] ) );
    SHA_CORE( x6, wwu_bcast( fd_sha256_K[ 6] ) );
    SHA_CORE( x7, wwu_bcast( fd_sha256_K[ 7] ) );
    SHA_CORE( x8, wwu_bcast( fd_sha256_K[ 8] ) );
    SHA_CORE( x9, wwu_bcast( fd_sha256_K[ 9] ) );
    SHA_CORE( xa, wwu_bcast( fd_sha256_K[10] ) );
    SHA_CORE( xb, wwu_bcast( fd_sha256_K[11] ) );
    SHA_CORE( xc, wwu_bcast( fd_sha256_K[12] ) );
    SHA_CORE( xd, wwu_bcast( fd_sha256_K[13] ) );
    SHA_CORE( xe, wwu_bcast( fd_sha256_K[14] ) );
    SHA_CORE( xf, wwu_bcast( fd_sha256_K[15] ) );
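
    /* Rounds 16-63 expand the message schedule on the fly.  At the top
       of each pass, xj holds schedule word W[t-16+j]; the update
       x0 += sigma0(x1) + sigma1(xe) + x9 is thus the FIPS 180-4
       recurrence W[t] = W[t-16] + sigma0(W[t-15]) + sigma1(W[t-2]) +
       W[t-7], and doing 16 rounds per pass keeps the whole 16 entry
       schedule resident in registers. */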
    for( ulong i=16UL; i<64UL; i+=16UL ) {
      x0 = wwu_add( wwu_add( x0, sigma0(x1) ), wwu_add( sigma1(xe), x9 ) ); SHA_CORE( x0, wwu_bcast( fd_sha256_K[i     ] ) );
      x1 = wwu_add( wwu_add( x1, sigma0(x2) ), wwu_add( sigma1(xf), xa ) ); SHA_CORE( x1, wwu_bcast( fd_sha256_K[i+ 1UL] ) );
      x2 = wwu_add( wwu_add( x2, sigma0(x3) ), wwu_add( sigma1(x0), xb ) ); SHA_CORE( x2, wwu_bcast( fd_sha256_K[i+ 2UL] ) );
      x3 = wwu_add( wwu_add( x3, sigma0(x4) ), wwu_add( sigma1(x1), xc ) ); SHA_CORE( x3, wwu_bcast( fd_sha256_K[i+ 3UL] ) );
      x4 = wwu_add( wwu_add( x4, sigma0(x5) ), wwu_add( sigma1(x2), xd ) ); SHA_CORE( x4, wwu_bcast( fd_sha256_K[i+ 4UL] ) );
      x5 = wwu_add( wwu_add( x5, sigma0(x6) ), wwu_add( sigma1(x3), xe ) ); SHA_CORE( x5, wwu_bcast( fd_sha256_K[i+ 5UL] ) );
      x6 = wwu_add( wwu_add( x6, sigma0(x7) ), wwu_add( sigma1(x4), xf ) ); SHA_CORE( x6, wwu_bcast( fd_sha256_K[i+ 6UL] ) );
      x7 = wwu_add( wwu_add( x7, sigma0(x8) ), wwu_add( sigma1(x5), x0 ) ); SHA_CORE( x7, wwu_bcast( fd_sha256_K[i+ 7UL] ) );
      x8 = wwu_add( wwu_add( x8, sigma0(x9) ), wwu_add( sigma1(x6), x1 ) ); SHA_CORE( x8, wwu_bcast( fd_sha256_K[i+ 8UL] ) );
      x9 = wwu_add( wwu_add( x9, sigma0(xa) ), wwu_add( sigma1(x7), x2 ) ); SHA_CORE( x9, wwu_bcast( fd_sha256_K[i+ 9UL] ) );
      xa = wwu_add( wwu_add( xa, sigma0(xb) ), wwu_add( sigma1(x8), x3 ) ); SHA_CORE( xa, wwu_bcast( fd_sha256_K[i+10UL] ) );
      xb = wwu_add( wwu_add( xb, sigma0(xc) ), wwu_add( sigma1(x9), x4 ) ); SHA_CORE( xb, wwu_bcast( fd_sha256_K[i+11UL] ) );
      xc = wwu_add( wwu_add( xc, sigma0(xd) ), wwu_add( sigma1(xa), x5 ) ); SHA_CORE( xc, wwu_bcast( fd_sha256_K[i+12UL] ) );
      xd = wwu_add( wwu_add( xd, sigma0(xe) ), wwu_add( sigma1(xb), x6 ) ); SHA_CORE( xd, wwu_bcast( fd_sha256_K[i+13UL] ) );
      xe = wwu_add( wwu_add( xe, sigma0(xf) ), wwu_add( sigma1(xc), x7 ) ); SHA_CORE( xe, wwu_bcast( fd_sha256_K[i+14UL] ) );
      xf = wwu_add( wwu_add( xf, sigma0(x0) ), wwu_add( sigma1(xd), x8 ) ); SHA_CORE( xf, wwu_bcast( fd_sha256_K[i+15UL] ) );
    }

# undef SHA_CORE
# undef Sigma0
# undef Sigma1
# undef sigma0
# undef sigma1
# undef Ch
# undef Maj

    /* Apply the state updates to the active lanes */

    int active_lane = active_lane_lo | (active_lane_hi<<8);

    s0 = wwu_add_if( active_lane, s0, a, s0 );
    s1 = wwu_add_if( active_lane, s1, b, s1 );
    s2 = wwu_add_if( active_lane, s2, c, s2 );
    s3 = wwu_add_if( active_lane, s3, d, s3 );
    s4 = wwu_add_if( active_lane, s4, e, s4 );
    s5 = wwu_add_if( active_lane, s5, f, s5 );
    s6 = wwu_add_if( active_lane, s6, g, s6 );
    s7 = wwu_add_if( active_lane, s7, h, s7 );

    /* Advance to the next message segment blocks.  In pseudo code,
       the below is:

         W += 64; if( block_rem ) block_rem--;

       Since we do not load anything at W(lane) above unless
       block_rem(lane) is non-zero, we can omit vector conditional
       operations for W(lane) below. */

    W_lo = wwv_add( W_lo, wwv_64 );
    W_hi = wwv_add( W_hi, wwv_64 );

    block_rem_lo = wwv_sub_if( active_lane_lo, block_rem_lo, one, block_rem_lo );
    block_rem_hi = wwv_sub_if( active_lane_hi, block_rem_hi, one, block_rem_hi );
  }

  /* Store the results.  FIXME: Probably could optimize the transpose
     further by taking into account needed stores (and then maybe go
     direct into memory ... would need a family of such transposed
     stores). */

  wwu_transpose_2x8x8( wwu_bswap(s0), wwu_bswap(s1), wwu_bswap(s2), wwu_bswap(s3),
                       wwu_bswap(s4), wwu_bswap(s5), wwu_bswap(s6), wwu_bswap(s7), s0,s1,s2,s3,s4,s5,s6,s7 );
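
  /* After the bswap (back to big endian byte order) and the 2x8x8
     transpose, the low 256 bits of si hold the 32 byte hash of message
     i and the high 256 bits hold the hash of message i+8, which is the
     layout the fallthrough switch below stores out. */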

  uint * const * batch_hash = (uint * const *)_batch_hash;
  switch( batch_cnt ) { /* application dependent prob */
  case 16UL: wu_stu( batch_hash[15], _mm512_extracti32x8_epi32( s7, 1 ) ); __attribute__((fallthrough));
  case 15UL: wu_stu( batch_hash[14], _mm512_extracti32x8_epi32( s6, 1 ) ); __attribute__((fallthrough));
  case 14UL: wu_stu( batch_hash[13], _mm512_extracti32x8_epi32( s5, 1 ) ); __attribute__((fallthrough));
  case 13UL: wu_stu( batch_hash[12], _mm512_extracti32x8_epi32( s4, 1 ) ); __attribute__((fallthrough));
  case 12UL: wu_stu( batch_hash[11], _mm512_extracti32x8_epi32( s3, 1 ) ); __attribute__((fallthrough));
  case 11UL: wu_stu( batch_hash[10], _mm512_extracti32x8_epi32( s2, 1 ) ); __attribute__((fallthrough));
  case 10UL: wu_stu( batch_hash[ 9], _mm512_extracti32x8_epi32( s1, 1 ) ); __attribute__((fallthrough));
  case  9UL: wu_stu( batch_hash[ 8], _mm512_extracti32x8_epi32( s0, 1 ) ); __attribute__((fallthrough));
  case  8UL: wu_stu( batch_hash[ 7], _mm512_extracti32x8_epi32( s7, 0 ) ); __attribute__((fallthrough));
  case  7UL: wu_stu( batch_hash[ 6], _mm512_extracti32x8_epi32( s6, 0 ) ); __attribute__((fallthrough));
  case  6UL: wu_stu( batch_hash[ 5], _mm512_extracti32x8_epi32( s5, 0 ) ); __attribute__((fallthrough));
  case  5UL: wu_stu( batch_hash[ 4], _mm512_extracti32x8_epi32( s4, 0 ) ); __attribute__((fallthrough));
  case  4UL: wu_stu( batch_hash[ 3], _mm512_extracti32x8_epi32( s3, 0 ) ); __attribute__((fallthrough));
  case  3UL: wu_stu( batch_hash[ 2], _mm512_extracti32x8_epi32( s2, 0 ) ); __attribute__((fallthrough));
  case  2UL: wu_stu( batch_hash[ 1], _mm512_extracti32x8_epi32( s1, 0 ) ); __attribute__((fallthrough));
  case  1UL: wu_stu( batch_hash[ 0], _mm512_extracti32x8_epi32( s0, 0 ) ); __attribute__((fallthrough));
  default: break;
  }
}