LCOV - code coverage report
Current view: top level - ballet/sha256 - fd_sha256_batch_avx512.c (source / functions)
                                                  Hit    Total   Coverage
Test: cov.lcov               Lines:               186      186    100.0 %
Date: 2025-08-05 05:04:49    Functions:             1        1    100.0 %

          Line data    Source code
       1             : #define FD_SHA256_BATCH_IMPL 2
       2             : 
       3             : #include "fd_sha256.h"
       4             : #include "fd_sha256_constants.h"
       5             : #include "../../util/simd/fd_avx512.h"
       6             : #include "../../util/simd/fd_avx.h"
       7             : 
       8             : FD_STATIC_ASSERT( FD_SHA256_BATCH_MAX==16UL, compat );
       9             : 
      10             : void
      11             : fd_sha256_private_batch_avx( ulong          batch_cnt,
      12             :                              void const *   batch_data,
      13             :                              ulong const *  batch_sz,
      14             :                              void * const * batch_hash );
      15             : 
      16             : void
      17             : fd_sha256_private_batch_avx512( ulong          batch_cnt,
      18             :                                 void const *   _batch_data,
      19             :                                 ulong const *  batch_sz,
      20     5883787 :                                 void * const * _batch_hash ) {
      21             : 
      22             :   /* If the batch is small enough, it is more efficient to use the
      23             :      narrower batched implementations.  The threshold for fallback
      24             :      depends on whether that narrower batched implementation is itself
      25             :      using SHA-NI acceleration for really small batches. */
      26             : 
      27     5883787 : # if FD_HAS_SHANI
      28     5883787 : # define MIN_BATCH_CNT (5UL)
      29             : # else
      30             : # define MIN_BATCH_CNT (2UL)
      31             : # endif
      32             : 
      33     5883787 :   if( FD_UNLIKELY( batch_cnt<MIN_BATCH_CNT ) ) {
      34     1594061 :     fd_sha256_private_batch_avx( batch_cnt, _batch_data, batch_sz, _batch_hash );
      35     1594061 :     return;
      36     1594061 :   }
      37             : 
      38     4289726 : # undef MIN_BATCH_CNT
      39             : 
      40             :   /* SHA appends to the end of each message 9 bytes of additional data
      41             :      (a message terminator byte and the big endian ulong with the
      42             :      message size in bits) and enough zero padding to make the message
      43             :      an integer number of blocks long.  We compute the 1 or 2 tail
      44             :      blocks of each message here.  We then process complete blocks of
      45             :      the original messages in place, switching to processing these tail
      46             :      blocks in the same pass toward the end.  TODO: This code could
      47             :      probably be SIMD optimized slightly more (this is where all the
      48             :      parts of SHA with really suboptimal performance design live, so it
      49             :      is just inherently gross).  The main optimization would probably
      50             :      be to allow tail reading to use a faster memcpy and then maybe
      51             :      some vectorization of the bswap. */
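
As a concrete check on the tail geometry computed below, here is a small standalone sketch (an illustration only, assuming FD_SHA256_PRIVATE_BUF_MAX is the 64 byte SHA-256 block size; align_dn/align_up are local stand-ins for fd_ulong_align_dn/fd_ulong_align_up): a 100 byte message leaves 36 straggler bytes and needs 1 tail block, while a 120 byte message leaves 56 stragglers and, since 56+9 > 64, needs 2.

#include <stdio.h>

static unsigned long align_dn( unsigned long x, unsigned long a ) { return x & ~(a-1UL); }        /* a must be a power of 2 */
static unsigned long align_up( unsigned long x, unsigned long a ) { return (x+a-1UL) & ~(a-1UL); }

int
main( void ) {
  unsigned long sz[2] = { 100UL, 120UL };
  for( int i=0; i<2; i++ ) {
    unsigned long tail_data_sz  = sz[i] & 63UL;                         /* straggler bytes past the last full block      */
    unsigned long tail_data_off = align_dn( sz[i],              64UL ); /* offset where the stragglers start             */
    unsigned long tail_sz       = align_up( tail_data_sz + 9UL, 64UL ); /* stragglers + 0x80 terminator + 8 byte bit cnt */
    printf( "sz=%3lu: tail_data_off=%3lu tail_data_sz=%2lu tail_blocks=%lu\n",
            sz[i], tail_data_off, tail_data_sz, tail_sz>>6 );
  }
  return 0;
}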
      52             : 
      53     4289726 :   ulong const * batch_data = (ulong const *)_batch_data;
      54             : 
      55     4289726 :   ulong batch_tail_data[ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));
      56     4289726 :   ulong batch_tail_rem [ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));
      57             : 
      58     4289726 :   uchar scratch[ FD_SHA256_BATCH_MAX*2UL*FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
      59     4289726 :   do {
      60     4289726 :     ulong scratch_free = (ulong)scratch;
      61             : 
      62     4289726 :     wwv_t zero = wwv_zero();
      63             : 
      64    66458941 :     for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {
      65             : 
      66             :       /* Allocate the tail blocks for this message */
      67             : 
      68    62169215 :       ulong data = batch_data[ batch_idx ];
      69    62169215 :       ulong sz   = batch_sz  [ batch_idx ];
      70             : 
      71    62169215 :       ulong tail_data     = scratch_free;
      72    62169215 :       ulong tail_data_sz  = sz & (FD_SHA256_PRIVATE_BUF_MAX-1UL);
      73    62169215 :       ulong tail_data_off = fd_ulong_align_dn( sz,               FD_SHA256_PRIVATE_BUF_MAX );
      74    62169215 :       ulong tail_sz       = fd_ulong_align_up( tail_data_sz+9UL, FD_SHA256_PRIVATE_BUF_MAX );
      75             : 
      76    62169215 :       batch_tail_data[ batch_idx ] = tail_data;
      77    62169215 :       batch_tail_rem [ batch_idx ] = tail_sz >> FD_SHA256_PRIVATE_LG_BUF_MAX;
      78             : 
      79    62169215 :       scratch_free += tail_sz;
      80             : 
      81             :       /* Populate the tail blocks.  We first clear the blocks (note that
      82             :          it is okay to clobber bytes 64:127 if tail_sz is only 64, saving
      83             :          a nasty branch).  Then we copy any straggler data bytes into the
      84             :          tail, terminate the message, and finally record the size of the
      85             :          message in bits at the end as a big endian ulong. */
      86             : 
      87    62169215 :       wwv_st( (ulong *) tail_data,     zero );
      88    62169215 :       wwv_st( (ulong *)(tail_data+64), zero );
      89             : 
      90    62169215 : #     if 1
      91             :       /* Quick experiments found that, once again, straight memcpy is
      92             :          much slower than fd_memcpy, which in turn is slightly slower
      93             :          than a site-optimized handrolled memcpy (fd_memcpy would have
      94             :          less L1I cache footprint though).  They also found that doing
      95             :          the below in a branchless way is slightly worse and an ILP
      96             :          optimized version of the conditional calculation is about the
      97             :          same.  They also found that vectorizing the overall loop and/or
      98             :          Duffing the vectorized loop did not provide noticeable
      99             :          performance improvements under various styles of memcpy. */
     100    62169215 :       ulong src = data + tail_data_off;
     101    62169215 :       ulong dst = tail_data;
     102    62169215 :       ulong rem = tail_data_sz;
     103    72225915 :       while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
     104   129798018 :       while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src );             dst +=  8UL; src +=  8UL; rem -=  8UL; }
     105    62169215 :       if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src );             dst +=  4UL; src +=  4UL; rem -=  4UL; }
     106    62169215 :       if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src );             dst +=  2UL; src +=  2UL; rem -=  2UL; }
     107    62169215 :       if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src );             dst++;                                 }
     108    62169215 :       *(uchar *)dst = (uchar)0x80;
     109             : #     else
     110             :       fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
     111             :       *((uchar *)(tail_data+tail_data_sz)) = (uchar)0x80;
     112             : #     endif
     113             : 
     114    62169215 :       *((ulong *)(tail_data+tail_sz-8UL )) = fd_ulong_bswap( sz<<3 );
     115    62169215 :     }
     116     4289726 :   } while(0);
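
For reference, applying the same padding scheme by hand to the 3 byte message "abc" yields a single 64 byte tail block: the message bytes, the 0x80 terminator, zero fill, and the bit length (24) as a big endian 64 bit value in the last 8 bytes. A minimal standalone sketch, illustrative only and not part of this file:

#include <stdio.h>
#include <string.h>

int
main( void ) {
  unsigned char msg[3] = { 'a', 'b', 'c' };
  unsigned long sz     = sizeof( msg );
  unsigned char tail[ 64 ];
  memset( tail, 0, sizeof( tail ) );                                         /* clear the block      */
  memcpy( tail, msg, sz );                                                   /* straggler data bytes */
  tail[ sz ] = (unsigned char)0x80;                                          /* message terminator   */
  unsigned long bits = sz<<3;                                                /* message size in bits */
  for( int i=0; i<8;  i++ ) tail[ 63-i ] = (unsigned char)( bits >> (8*i) ); /* big endian length    */
  for( int i=0; i<64; i++ ) printf( "%02x%c", tail[ i ], (i%16==15) ? '\n' : ' ' );
  return 0;
}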
     117             : 
     118     4289726 :   wwu_t s0 = wwu_bcast( FD_SHA256_INITIAL_A );
     119     4289726 :   wwu_t s1 = wwu_bcast( FD_SHA256_INITIAL_B );
     120     4289726 :   wwu_t s2 = wwu_bcast( FD_SHA256_INITIAL_C );
     121     4289726 :   wwu_t s3 = wwu_bcast( FD_SHA256_INITIAL_D );
     122     4289726 :   wwu_t s4 = wwu_bcast( FD_SHA256_INITIAL_E );
     123     4289726 :   wwu_t s5 = wwu_bcast( FD_SHA256_INITIAL_F );
     124     4289726 :   wwu_t s6 = wwu_bcast( FD_SHA256_INITIAL_G );
     125     4289726 :   wwu_t s7 = wwu_bcast( FD_SHA256_INITIAL_H );
     126             : 
     127     4289726 :   wwv_t zero       = wwv_zero();
     128     4289726 :   wwv_t one        = wwv_one();
     129     4289726 :   wwv_t wwv_64     = wwv_bcast( FD_SHA256_PRIVATE_BUF_MAX );
     130     4289726 :   wwv_t W_sentinel = wwv_bcast( (ulong)scratch );
     131             : 
     132     4289726 :   wwv_t tail_lo      = wwv_ld( batch_tail_data   ); wwv_t tail_hi      = wwv_ld( batch_tail_data+8 );
     133     4289726 :   wwv_t tail_rem_lo  = wwv_ld( batch_tail_rem    ); wwv_t tail_rem_hi  = wwv_ld( batch_tail_rem +8 );
     134     4289726 :   wwv_t W_lo         = wwv_ld( batch_data        ); wwv_t W_hi         = wwv_ld( batch_data     +8 );
     135             : 
     136     4289726 :   wwv_t block_rem_lo = wwv_if( ((1<<batch_cnt)-1) & 0xff,
     137     4289726 :                                wwv_add( wwv_shr( wwv_ld( batch_sz   ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_lo ), zero );
     138     4289726 :   wwv_t block_rem_hi = wwv_if( ((1<<batch_cnt)-1) >> 8,
     139     4289726 :                                wwv_add( wwv_shr( wwv_ld( batch_sz+8 ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_hi ), zero );
     140             : 
     141    64263155 :   for(;;) {
     142    64263155 :     int active_lane_lo = wwv_ne( block_rem_lo, zero );
     143    64263155 :     int active_lane_hi = wwv_ne( block_rem_hi, zero );
     144    64263155 :     if( FD_UNLIKELY( !(active_lane_lo | active_lane_hi) ) ) break;
     145             : 
     146             :     /* Switch lanes that have hit the end of their in-place bulk
     147             :        processing to their out-of-place scratch tail regions as
     148             :        necessary. */
     149             : 
     150    59973429 :     W_lo = wwv_if( wwv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
     151    59973429 :     W_hi = wwv_if( wwv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );
     152             : 
     153             :     /* At this point, we have at least 1 block in this message segment
     154             :        pass that has not been processed.  Load the next 64 bytes of
     155             :        each unprocessed block.  Inactive lanes (e.g. message segments
     156             :        in this pass for which we've already processed all the blocks)
     157             :        will load garbage from a sentinel location (and the result of
     158             :        the state computations for the inactive lane will be ignored). */
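
Per lane, the control flow of this loop is equivalent to the scalar model below (a sketch only: lane, state and process_block are hypothetical stand-ins, not part of this file; the other names are the locals and parameters above).

/* Scalar model of what one lane of the vectorized loop does */
ulong W         = batch_data     [ lane ];                       /* next block to read (in place)            */
ulong tail      = batch_tail_data[ lane ];                       /* start of the 1 or 2 scratch tail blocks  */
ulong tail_rem  = batch_tail_rem [ lane ];
ulong block_rem = (batch_sz[ lane ]>>FD_SHA256_PRIVATE_LG_BUF_MAX) + tail_rem;  /* full blocks + tail blocks */
while( block_rem ) {
  if( block_rem==tail_rem ) W = tail;                            /* switch to the out-of-place tail blocks   */
  process_block( state, (uchar const *)W );                      /* hypothetical scalar SHA-256 block update */
  W += FD_SHA256_PRIVATE_BUF_MAX;
  block_rem--;
}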
     159             : 
     160    59973429 :     ulong _W0; ulong _W1; ulong _W2; ulong _W3; ulong _W4; ulong _W5; ulong _W6; ulong _W7;
     161    59973429 :     ulong _W8; ulong _W9; ulong _Wa; ulong _Wb; ulong _Wc; ulong _Wd; ulong _We; ulong _Wf;
     162    59973429 :     wwv_unpack( wwv_if( active_lane_lo, W_lo, W_sentinel ), _W0, _W1, _W2, _W3, _W4, _W5, _W6, _W7 );
     163    59973429 :     wwv_unpack( wwv_if( active_lane_hi, W_hi, W_sentinel ), _W8, _W9, _Wa, _Wb, _Wc, _Wd, _We, _Wf );
     164    59973429 :     uchar const * W0 = (uchar const *)_W0; uchar const * W1 = (uchar const *)_W1;
     165    59973429 :     uchar const * W2 = (uchar const *)_W2; uchar const * W3 = (uchar const *)_W3;
     166    59973429 :     uchar const * W4 = (uchar const *)_W4; uchar const * W5 = (uchar const *)_W5;
     167    59973429 :     uchar const * W6 = (uchar const *)_W6; uchar const * W7 = (uchar const *)_W7;
     168    59973429 :     uchar const * W8 = (uchar const *)_W8; uchar const * W9 = (uchar const *)_W9;
     169    59973429 :     uchar const * Wa = (uchar const *)_Wa; uchar const * Wb = (uchar const *)_Wb;
     170    59973429 :     uchar const * Wc = (uchar const *)_Wc; uchar const * Wd = (uchar const *)_Wd;
     171    59973429 :     uchar const * We = (uchar const *)_We; uchar const * Wf = (uchar const *)_Wf;
     172             : 
     173    59973429 :     wwu_t x0; wwu_t x1; wwu_t x2; wwu_t x3; wwu_t x4; wwu_t x5; wwu_t x6; wwu_t x7;
     174    59973429 :     wwu_t x8; wwu_t x9; wwu_t xa; wwu_t xb; wwu_t xc; wwu_t xd; wwu_t xe; wwu_t xf;
     175    59973429 :     wwu_transpose_16x16( wwu_bswap( wwu_ldu( W0 ) ), wwu_bswap( wwu_ldu( W1 ) ),
     176    59973429 :                          wwu_bswap( wwu_ldu( W2 ) ), wwu_bswap( wwu_ldu( W3 ) ),
     177    59973429 :                          wwu_bswap( wwu_ldu( W4 ) ), wwu_bswap( wwu_ldu( W5 ) ),
     178    59973429 :                          wwu_bswap( wwu_ldu( W6 ) ), wwu_bswap( wwu_ldu( W7 ) ),
     179    59973429 :                          wwu_bswap( wwu_ldu( W8 ) ), wwu_bswap( wwu_ldu( W9 ) ),
     180    59973429 :                          wwu_bswap( wwu_ldu( Wa ) ), wwu_bswap( wwu_ldu( Wb ) ),
     181    59973429 :                          wwu_bswap( wwu_ldu( Wc ) ), wwu_bswap( wwu_ldu( Wd ) ),
     182    59973429 :                          wwu_bswap( wwu_ldu( We ) ), wwu_bswap( wwu_ldu( Wf ) ),
     183    59973429 :                          x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc, xd, xe, xf );
     184             : 
     185             :     /* Compute the SHA-256 state updates */
     186             : 
     187    59973429 :     wwu_t a = s0; wwu_t b = s1; wwu_t c = s2; wwu_t d = s3; wwu_t e = s4; wwu_t f = s5; wwu_t g = s6; wwu_t h = s7;
     188             : 
     189    59973429 : #   define Sigma0(x)  wwu_xor( wwu_rol(x,30), wwu_xor( wwu_rol(x,19), wwu_rol(x,10) ) )
     190    59973429 : #   define Sigma1(x)  wwu_xor( wwu_rol(x,26), wwu_xor( wwu_rol(x,21), wwu_rol(x, 7) ) )
     191    59973429 : #   define sigma0(x)  wwu_xor( wwu_rol(x,25), wwu_xor( wwu_rol(x,14), wwu_shr(x, 3) ) )
     192    59973429 : #   define sigma1(x)  wwu_xor( wwu_rol(x,15), wwu_xor( wwu_rol(x,13), wwu_shr(x,10) ) )
     193    59973429 : #   define Ch(x,y,z)  wwu_xor( wwu_and(x,y), wwu_andnot(x,z) )
     194    59973429 : #   define Maj(x,y,z) wwu_xor( wwu_and(x,y), wwu_xor( wwu_and(x,z), wwu_and(y,z) ) )
     195    59973429 : #   define SHA_CORE(xi,ki)                                                           \
     196  3838299456 :     T1 = wwu_add( wwu_add(xi,ki), wwu_add( wwu_add( h, Sigma1(e) ), Ch(e, f, g) ) ); \
     197  3838299456 :     T2 = wwu_add( Sigma0(a), Maj(a, b, c) );                                         \
     198  3838299456 :     h = g;                                                                           \
     199  3838299456 :     g = f;                                                                           \
     200  3838299456 :     f = e;                                                                           \
     201  3838299456 :     e = wwu_add( d, T1 );                                                            \
     202  3838299456 :     d = c;                                                                           \
     203  3838299456 :     c = b;                                                                           \
     204  3838299456 :     b = a;                                                                           \
     205  3838299456 :     a = wwu_add( T1, T2 )
     206             : 
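
The macros above are the standard FIPS 180-4 SHA-256 round functions written with left rotates (on a 32 bit lane, rol(x,n) equals rotr(x,32-n), so e.g. wwu_rol(x,30) is rotr(x,2)). For comparison, a scalar reference sketch (the rotr32 and *_ref names are ours, not part of this file):

static inline unsigned int rotr32( unsigned int x, int n ) { return (x>>n) | (x<<(32-n)); }

static inline unsigned int Sigma0_ref( unsigned int x ) { return rotr32(x, 2) ^ rotr32(x,13) ^ rotr32(x,22); }
static inline unsigned int Sigma1_ref( unsigned int x ) { return rotr32(x, 6) ^ rotr32(x,11) ^ rotr32(x,25); }
static inline unsigned int sigma0_ref( unsigned int x ) { return rotr32(x, 7) ^ rotr32(x,18) ^ (x>> 3);      }
static inline unsigned int sigma1_ref( unsigned int x ) { return rotr32(x,17) ^ rotr32(x,19) ^ (x>>10);      }
static inline unsigned int Ch_ref ( unsigned int x, unsigned int y, unsigned int z ) { return (x & y) ^ ((~x) & z);        }
static inline unsigned int Maj_ref( unsigned int x, unsigned int y, unsigned int z ) { return (x & y) ^ (x & z) ^ (y & z); }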
     207    59973429 :     wwu_t T1;
     208    59973429 :     wwu_t T2;
     209             : 
     210    59973429 :     SHA_CORE( x0, wwu_bcast( fd_sha256_K[ 0] ) );
     211    59973429 :     SHA_CORE( x1, wwu_bcast( fd_sha256_K[ 1] ) );
     212    59973429 :     SHA_CORE( x2, wwu_bcast( fd_sha256_K[ 2] ) );
     213    59973429 :     SHA_CORE( x3, wwu_bcast( fd_sha256_K[ 3] ) );
     214    59973429 :     SHA_CORE( x4, wwu_bcast( fd_sha256_K[ 4] ) );
     215    59973429 :     SHA_CORE( x5, wwu_bcast( fd_sha256_K[ 5] ) );
     216    59973429 :     SHA_CORE( x6, wwu_bcast( fd_sha256_K[ 6] ) );
     217    59973429 :     SHA_CORE( x7, wwu_bcast( fd_sha256_K[ 7] ) );
     218    59973429 :     SHA_CORE( x8, wwu_bcast( fd_sha256_K[ 8] ) );
     219    59973429 :     SHA_CORE( x9, wwu_bcast( fd_sha256_K[ 9] ) );
     220    59973429 :     SHA_CORE( xa, wwu_bcast( fd_sha256_K[10] ) );
     221    59973429 :     SHA_CORE( xb, wwu_bcast( fd_sha256_K[11] ) );
     222    59973429 :     SHA_CORE( xc, wwu_bcast( fd_sha256_K[12] ) );
     223    59973429 :     SHA_CORE( xd, wwu_bcast( fd_sha256_K[13] ) );
     224    59973429 :     SHA_CORE( xe, wwu_bcast( fd_sha256_K[14] ) );
     225    59973429 :     SHA_CORE( xf, wwu_bcast( fd_sha256_K[15] ) );
     226   239893716 :     for( ulong i=16UL; i<64UL; i+=16UL ) {
     227   179920287 :       x0 = wwu_add( wwu_add( x0, sigma0(x1) ), wwu_add( sigma1(xe), x9 ) ); SHA_CORE( x0, wwu_bcast( fd_sha256_K[i     ] ) );
     228   179920287 :       x1 = wwu_add( wwu_add( x1, sigma0(x2) ), wwu_add( sigma1(xf), xa ) ); SHA_CORE( x1, wwu_bcast( fd_sha256_K[i+ 1UL] ) );
     229   179920287 :       x2 = wwu_add( wwu_add( x2, sigma0(x3) ), wwu_add( sigma1(x0), xb ) ); SHA_CORE( x2, wwu_bcast( fd_sha256_K[i+ 2UL] ) );
     230   179920287 :       x3 = wwu_add( wwu_add( x3, sigma0(x4) ), wwu_add( sigma1(x1), xc ) ); SHA_CORE( x3, wwu_bcast( fd_sha256_K[i+ 3UL] ) );
     231   179920287 :       x4 = wwu_add( wwu_add( x4, sigma0(x5) ), wwu_add( sigma1(x2), xd ) ); SHA_CORE( x4, wwu_bcast( fd_sha256_K[i+ 4UL] ) );
     232   179920287 :       x5 = wwu_add( wwu_add( x5, sigma0(x6) ), wwu_add( sigma1(x3), xe ) ); SHA_CORE( x5, wwu_bcast( fd_sha256_K[i+ 5UL] ) );
     233   179920287 :       x6 = wwu_add( wwu_add( x6, sigma0(x7) ), wwu_add( sigma1(x4), xf ) ); SHA_CORE( x6, wwu_bcast( fd_sha256_K[i+ 6UL] ) );
     234   179920287 :       x7 = wwu_add( wwu_add( x7, sigma0(x8) ), wwu_add( sigma1(x5), x0 ) ); SHA_CORE( x7, wwu_bcast( fd_sha256_K[i+ 7UL] ) );
     235   179920287 :       x8 = wwu_add( wwu_add( x8, sigma0(x9) ), wwu_add( sigma1(x6), x1 ) ); SHA_CORE( x8, wwu_bcast( fd_sha256_K[i+ 8UL] ) );
     236   179920287 :       x9 = wwu_add( wwu_add( x9, sigma0(xa) ), wwu_add( sigma1(x7), x2 ) ); SHA_CORE( x9, wwu_bcast( fd_sha256_K[i+ 9UL] ) );
     237   179920287 :       xa = wwu_add( wwu_add( xa, sigma0(xb) ), wwu_add( sigma1(x8), x3 ) ); SHA_CORE( xa, wwu_bcast( fd_sha256_K[i+10UL] ) );
     238   179920287 :       xb = wwu_add( wwu_add( xb, sigma0(xc) ), wwu_add( sigma1(x9), x4 ) ); SHA_CORE( xb, wwu_bcast( fd_sha256_K[i+11UL] ) );
     239   179920287 :       xc = wwu_add( wwu_add( xc, sigma0(xd) ), wwu_add( sigma1(xa), x5 ) ); SHA_CORE( xc, wwu_bcast( fd_sha256_K[i+12UL] ) );
     240   179920287 :       xd = wwu_add( wwu_add( xd, sigma0(xe) ), wwu_add( sigma1(xb), x6 ) ); SHA_CORE( xd, wwu_bcast( fd_sha256_K[i+13UL] ) );
     241   179920287 :       xe = wwu_add( wwu_add( xe, sigma0(xf) ), wwu_add( sigma1(xc), x7 ) ); SHA_CORE( xe, wwu_bcast( fd_sha256_K[i+14UL] ) );
     242   179920287 :       xf = wwu_add( wwu_add( xf, sigma0(x0) ), wwu_add( sigma1(xd), x8 ) ); SHA_CORE( xf, wwu_bcast( fd_sha256_K[i+15UL] ) );
     243   179920287 :     }
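
In the loop above, x0..xf act as a rotating 16 entry window over the message schedule, so each line implements the usual recurrence W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16] and then immediately consumes the freshly computed word in a round. A scalar sketch of the schedule alone, reusing the sigma0_ref/sigma1_ref helpers from the earlier sketch:

/* W[0..15] hold the 16 big endian words of the block; fills W[16..63]. */
static void
sha256_schedule_ref( unsigned int W[ 64 ] ) {
  for( int t=16; t<64; t++ )
    W[ t ] = sigma1_ref( W[ t-2 ] ) + W[ t-7 ] + sigma0_ref( W[ t-15 ] ) + W[ t-16 ];
}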
     244             : 
     245    59973429 : #   undef SHA_CORE
     246    59973429 : #   undef Sigma0
     247    59973429 : #   undef Sigma1
     248    59973429 : #   undef sigma0
     249    59973429 : #   undef sigma1
     250    59973429 : #   undef Ch
     251    59973429 : #   undef Maj
     252             : 
     253             :     /* Apply the state updates to the active lanes */
     254             : 
     255    59973429 :     int active_lane = active_lane_lo | (active_lane_hi<<8);
     256             : 
     257    59973429 :     s0 = wwu_add_if( active_lane, s0, a, s0 );
     258    59973429 :     s1 = wwu_add_if( active_lane, s1, b, s1 );
     259    59973429 :     s2 = wwu_add_if( active_lane, s2, c, s2 );
     260    59973429 :     s3 = wwu_add_if( active_lane, s3, d, s3 );
     261    59973429 :     s4 = wwu_add_if( active_lane, s4, e, s4 );
     262    59973429 :     s5 = wwu_add_if( active_lane, s5, f, s5 );
     263    59973429 :     s6 = wwu_add_if( active_lane, s6, g, s6 );
     264    59973429 :     s7 = wwu_add_if( active_lane, s7, h, s7 );
     265             : 
     266             :     /* Advance to the next message segment blocks.  In pseudo code,
     267             :        the below is:
     268             : 
     269             :          W += 64; if( block_rem ) block_rem--;
     270             : 
     271             :        Since we do not load anything at W(lane) above unless
     272             :        block_rem(lane) is non-zero, we can omit vector conditional
     273             :        operations for W(lane) below. */
     274             : 
     275    59973429 :     W_lo = wwv_add( W_lo, wwv_64 );
     276    59973429 :     W_hi = wwv_add( W_hi, wwv_64 );
     277             : 
     278    59973429 :     block_rem_lo = wwv_sub_if( active_lane_lo, block_rem_lo, one, block_rem_lo );
     279    59973429 :     block_rem_hi = wwv_sub_if( active_lane_hi, block_rem_hi, one, block_rem_hi );
     280    59973429 :   }
     281             : 
     282             :   /* Store the results.  FIXME: Probably could optimize the transpose
     283             :      further by taking into account needed stores (and then maybe go
     284             :      direct into memory ... would need a family of such transposed
     285             :      stores). */
     286             : 
     287     4289726 :   wwu_transpose_2x8x8( wwu_bswap(s0), wwu_bswap(s1), wwu_bswap(s2), wwu_bswap(s3),
     288     4289726 :                        wwu_bswap(s4), wwu_bswap(s5), wwu_bswap(s6), wwu_bswap(s7), s0,s1,s2,s3,s4,s5,s6,s7 );
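
Given the store pattern below, each s_j now holds the 8 big endian output words of message j in its low 256 bits and of message j+8 in its high 256 bits, which is why batch_hash[j] takes the low half of s_j and batch_hash[j+8] the high half.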
     289             : 
     290     4289726 :   uint * const * batch_hash = (uint * const *)_batch_hash;
     291     4289726 :   switch( batch_cnt ) { /* application dependent prob */
     292     3235949 :   case 16UL: wu_stu( batch_hash[15], _mm512_extracti32x8_epi32( s7, 1 ) ); __attribute__((fallthrough));
     293     3314809 :   case 15UL: wu_stu( batch_hash[14], _mm512_extracti32x8_epi32( s6, 1 ) ); __attribute__((fallthrough));
     294     3424509 :   case 14UL: wu_stu( batch_hash[13], _mm512_extracti32x8_epi32( s5, 1 ) ); __attribute__((fallthrough));
     295     3503378 :   case 13UL: wu_stu( batch_hash[12], _mm512_extracti32x8_epi32( s4, 1 ) ); __attribute__((fallthrough));
     296     3610535 :   case 12UL: wu_stu( batch_hash[11], _mm512_extracti32x8_epi32( s3, 1 ) ); __attribute__((fallthrough));
     297     3695097 :   case 11UL: wu_stu( batch_hash[10], _mm512_extracti32x8_epi32( s2, 1 ) ); __attribute__((fallthrough));
     298     3802810 :   case 10UL: wu_stu( batch_hash[ 9], _mm512_extracti32x8_epi32( s1, 1 ) ); __attribute__((fallthrough));
     299     3884386 :   case  9UL: wu_stu( batch_hash[ 8], _mm512_extracti32x8_epi32( s0, 1 ) ); __attribute__((fallthrough));
     300     3989428 :   case  8UL: wu_stu( batch_hash[ 7], _mm512_extracti32x8_epi32( s7, 0 ) ); __attribute__((fallthrough));
     301     4073881 :   case  7UL: wu_stu( batch_hash[ 6], _mm512_extracti32x8_epi32( s6, 0 ) ); __attribute__((fallthrough));
     302     4185803 :   case  6UL: wu_stu( batch_hash[ 5], _mm512_extracti32x8_epi32( s5, 0 ) ); __attribute__((fallthrough));
     303     4289726 :   case  5UL: wu_stu( batch_hash[ 4], _mm512_extracti32x8_epi32( s4, 0 ) ); __attribute__((fallthrough));
     304     4289726 :   case  4UL: wu_stu( batch_hash[ 3], _mm512_extracti32x8_epi32( s3, 0 ) ); __attribute__((fallthrough));
     305     4289726 :   case  3UL: wu_stu( batch_hash[ 2], _mm512_extracti32x8_epi32( s2, 0 ) ); __attribute__((fallthrough));
     306     4289726 :   case  2UL: wu_stu( batch_hash[ 1], _mm512_extracti32x8_epi32( s1, 0 ) ); __attribute__((fallthrough));
     307     4289726 :   case  1UL: wu_stu( batch_hash[ 0], _mm512_extracti32x8_epi32( s0, 0 ) ); __attribute__((fallthrough));
     308     4289726 :   default: break;
     309     4289726 :   }
     310     4289726 : }
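
For completeness, a hypothetical direct invocation illustrating the argument convention visible above: _batch_data is an array of message start addresses stored as ulongs, batch_sz holds the per message byte sizes (both are read with 64 byte aligned vector loads, so they are declared as aligned FD_SHA256_BATCH_MAX sized arrays here), and _batch_hash holds the per message 32 byte output pointers. Real callers presumably reach this code through the higher level batch API declared in fd_sha256.h rather than calling the private function directly; hash_16_messages and its parameters are made-up names for this sketch.

static void
hash_16_messages( uchar const * msg[ 16 ], ulong const msg_sz[ 16 ], uchar out[ 16 ][ 32 ] ) {
  ulong  batch_data[ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));
  ulong  batch_sz  [ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));
  void * batch_hash[ FD_SHA256_BATCH_MAX ];
  for( ulong i=0UL; i<16UL; i++ ) {
    batch_data[ i ] = (ulong)msg[ i ];   /* message start addresses, as ulongs */
    batch_sz  [ i ] = msg_sz[ i ];       /* message sizes in bytes             */
    batch_hash[ i ] = out[ i ];          /* where to write each 32 byte digest */
  }
  fd_sha256_private_batch_avx512( 16UL, batch_data, batch_sz, batch_hash );
}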

Generated by: LCOV version 1.14