LCOV - code coverage report
Current view: top level - ballet/sha256 - fd_sha256_batch_avx.c (source / functions)
                                         Hit    Total    Coverage
Test: cov.lcov               Lines:      190      190     100.0 %
Date: 2024-11-13 11:58:15    Functions:    1        1     100.0 %

          Line data    Source code
       1             : #define FD_SHA256_BATCH_IMPL 1
       2             : 
       3             : #include "fd_sha256.h"
       4             : #include "../../util/simd/fd_avx.h"
       5             : 
       6             : FD_STATIC_ASSERT( FD_SHA256_BATCH_MAX==8UL, compat );
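                     : /* This AVX path hashes up to 8 messages in parallel, one message per
                     :    32-bit lane of the 8 wide vectors used below, which is why the batch
                     :    API maximum must be exactly 8 for this implementation. */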
       7             : 
       8             : void
       9             : fd_sha256_private_batch_avx( ulong          batch_cnt,
      10             :                              void const *   _batch_data,
      11             :                              ulong const *  batch_sz,
      12    20559531 :                              void * const * _batch_hash ) {
      13             : 
      14             :   /* If the batch is too small, it's faster to run each part of the
      15             :      batch sequentially.  When SHA-NI instructions are available, the
      16             :      sequential implementation is faster still, so a larger batch is
      17             :      needed to justify using the batched implementation. */
      18             : 
      19     1545805 : # if FD_HAS_SHANI
      20     1545805 : # define MIN_BATCH_CNT (6UL)
      21             : # else
      22    19013726 : # define MIN_BATCH_CNT (2UL)
      23    19013726 : # endif
      24             : 
      25    20559531 :   if( FD_UNLIKELY( batch_cnt<MIN_BATCH_CNT ) ) {
      26     4291599 :     void const * const * batch_data = (void const * const *)_batch_data;
      27     9089645 :     for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ )
      28     4798046 :       fd_sha256_hash( batch_data[ batch_idx ], batch_sz[ batch_idx ], _batch_hash[ batch_idx ] );
      29     4291599 :     return;
      30     4291599 :   }
      31             : 
      32    16267932 : # undef MIN_BATCH_CNT
      33             : 
      34             :   /* SHA appends to the end of each message 9 bytes of additional data
      35             :      (a message terminator byte and the big endian ulong with the
      36             :      message size in bits) and enough zero padding to make the message
      37             :      an integer number of blocks long.  We compute the 1 or 2 tail
      38             :      blocks of each message here.  We then process complete blocks of
      39             :      the original messages in place, switching to processing these tail
      40             :      blocks in the same pass toward the end.  TODO: This code could
      41             :      probably be SIMD optimized slightly more (this is where all of
      42             :      SHA's least performance-friendly design choices live, so it is
      43             :      just inherently gross).  The main optimization would probably be to
      44             :      allow tail reading to use a faster memcpy and then maybe some
      45             :      vectorization of the bswap. */
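                     :   /* Illustrative example: a 100 byte message has one full 64 byte
                     :      block plus a 36 byte straggler.  Its single tail block is then the
                     :      36 straggler bytes, the 0x80 terminator byte, 19 zero padding bytes
                     :      and the 8 byte big endian bit length (800).  A straggler of 60
                     :      bytes would not leave room for the 9 bytes of overhead and would
                     :      get 2 tail blocks instead. */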
      46             : 
      47    16267932 :   ulong const * batch_data = (ulong const *)_batch_data;
      48             : 
      49    16267932 :   ulong batch_tail_data[ FD_SHA256_BATCH_MAX ] __attribute__((aligned(32)));
      50    16267932 :   ulong batch_tail_rem [ FD_SHA256_BATCH_MAX ] __attribute__((aligned(32)));
      51             : 
      52    16267932 :   uchar scratch[ FD_SHA256_BATCH_MAX*2UL*FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
      53    16267932 :   do {
      54    16267932 :     ulong scratch_free = (ulong)scratch;
      55             : 
      56    16267932 :     wv_t zero = wv_zero();
      57             : 
      58   139311056 :     for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {
      59             : 
      60             :       /* Allocate the tail blocks for this message */
      61             : 
      62   123043124 :       ulong data = batch_data[ batch_idx ];
      63   123043124 :       ulong sz   = batch_sz  [ batch_idx ];
      64             : 
      65   123043124 :       ulong tail_data     = scratch_free;
      66   123043124 :       ulong tail_data_sz  = sz & (FD_SHA256_PRIVATE_BUF_MAX-1UL);
      67   123043124 :       ulong tail_data_off = fd_ulong_align_dn( sz,               FD_SHA256_PRIVATE_BUF_MAX );
      68   123043124 :       ulong tail_sz       = fd_ulong_align_up( tail_data_sz+9UL, FD_SHA256_PRIVATE_BUF_MAX );
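                     :       /* tail_sz is either 64 or 128: one tail block when the straggler
                     :          bytes plus the 9 bytes of terminator / length overhead fit in a
                     :          single 64 byte block (tail_data_sz<=55), two tail blocks
                     :          otherwise. */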
      69             : 
      70   123043124 :       batch_tail_data[ batch_idx ] = tail_data;
      71   123043124 :       batch_tail_rem [ batch_idx ] = tail_sz >> FD_SHA256_PRIVATE_LG_BUF_MAX;
      72             : 
      73   123043124 :       scratch_free += tail_sz;
      74             : 
      75             :       /* Populate the tail blocks.  We first clear the blocks (note that
      76             :          it is okay to clobber bytes 64:127 if tail_sz is only 64, saving a
      77             :          nasty branch).  Then we copy any straggler data bytes into the
      78             :          tail, terminate the message, and finally record the size of the
      79             :          message in bits at the end as a big endian ulong.  */
      80             : 
      81   123043124 :       wv_st( (ulong *) tail_data,     zero );
      82   123043124 :       wv_st( (ulong *)(tail_data+32), zero );
      83   123043124 :       wv_st( (ulong *)(tail_data+64), zero );
      84   123043124 :       wv_st( (ulong *)(tail_data+96), zero );
      85             : 
      86   123043124 : #     if 1
      87             :       /* Quick experiments found that, once again, a straight memcpy is
      88             :          much slower, and fd_memcpy only slightly slower, than the
      89             :          site-optimized handrolled memcpy below (though fd_memcpy would
      90             :          have a smaller L1I cache footprint).  They also found that doing
      91             :          the below in a branchless way is slightly worse and that an ILP
      92             :          optimized version of the conditional calculation is about the
      93             :          same, and that vectorizing the overall loop and/or Duffing the
      94             :          vectorized loop did not provide noticeable performance
      95             :          improvements under various styles of memcpy. */
      96   123043124 :       ulong src = data + tail_data_off;
      97   123043124 :       ulong dst = tail_data;
      98   123043124 :       ulong rem = tail_data_sz;
      99   153385816 :       while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
     100   258455602 :       while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src );             dst +=  8UL; src +=  8UL; rem -=  8UL; }
     101   123043124 :       if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src );             dst +=  4UL; src +=  4UL; rem -=  4UL; }
     102   123043124 :       if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src );             dst +=  2UL; src +=  2UL; rem -=  2UL; }
     103   123043124 :       if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src );             dst++;                                 }
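                     :       /* dst is now tail_data+tail_data_sz, so the terminator below lands
                     :          immediately after the copied straggler bytes. */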
     104   123043124 :       *(uchar *)dst = (uchar)0x80;
     105             : #     else
     106             :       fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
     107             :       *((uchar *)(tail_data+tail_data_sz)) = (uchar)0x80;
     108             : #     endif
     109             : 
     110   123043124 :       *((ulong *)(tail_data+tail_sz-8UL )) = fd_ulong_bswap( sz<<3 );
     111   123043124 :     }
     112    16267932 :   } while(0);
     113             : 
     114    16267932 :   wu_t s0 = wu_bcast( 0x6a09e667U );
     115    16267932 :   wu_t s1 = wu_bcast( 0xbb67ae85U );
     116    16267932 :   wu_t s2 = wu_bcast( 0x3c6ef372U );
     117    16267932 :   wu_t s3 = wu_bcast( 0xa54ff53aU );
     118    16267932 :   wu_t s4 = wu_bcast( 0x510e527fU );
     119    16267932 :   wu_t s5 = wu_bcast( 0x9b05688cU );
     120    16267932 :   wu_t s6 = wu_bcast( 0x1f83d9abU );
     121    16267932 :   wu_t s7 = wu_bcast( 0x5be0cd19U );
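                     :   /* s0..s7 are the standard SHA-256 initial hash values H0..H7 from
                     :      FIPS 180-4, broadcast so all 8 lanes start from the same state. */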
     122             : 
     123    16267932 :   wv_t wv_64        = wv_bcast( FD_SHA256_PRIVATE_BUF_MAX );
     124    16267932 :   wv_t W_sentinel   = wv_bcast( (ulong)scratch );
     125    16267932 :   wc_t batch_lane   = wc_unpack( (1<<batch_cnt)-1 );
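                     :   /* batch_lane is true for lanes [0,batch_cnt) and false for the rest;
                     :      it is used below to zero the block counts of the unused lanes so
                     :      they are inactive from the first pass. */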
     126             : 
     127    16267932 :   wv_t tail_lo      = wv_ld( batch_tail_data   );
     128    16267932 :   wv_t tail_hi      = wv_ld( batch_tail_data+4 );
     129             : 
     130    16267932 :   wv_t tail_rem_lo  = wv_ld( batch_tail_rem    );
     131    16267932 :   wv_t tail_rem_hi  = wv_ld( batch_tail_rem+4  );
     132             : 
     133    16267932 :   wv_t W_lo         = wv_ld( batch_data        );
     134    16267932 :   wv_t W_hi         = wv_ld( batch_data+4      );
     135             : 
     136    16267932 :   wv_t block_rem_lo = wv_notczero( wc_expand( batch_lane, 0 ),
     137    16267932 :                         wv_add( wv_shr( wv_ld( batch_sz   ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_lo ) );
     138    16267932 :   wv_t block_rem_hi = wv_notczero( wc_expand( batch_lane, 1 ),
     139    16267932 :                         wv_add( wv_shr( wv_ld( batch_sz+4 ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_hi ) );
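                     :   /* block_rem counts, per lane, the 64 byte blocks still to process:
                     :      the full blocks of the original message (sz>>6) plus its 1 or 2
                     :      tail blocks (e.g. sz==100 gives 1+1==2), or 0 for unused lanes. */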
     140   246206218 :   for(;;) {
     141   246206218 :     wc_t active_lane_lo = wv_to_wc( block_rem_lo );
     142   246206218 :     wc_t active_lane_hi = wv_to_wc( block_rem_hi );
     143   246206218 :     if( FD_UNLIKELY( !wc_any( wc_or( active_lane_lo, active_lane_hi ) ) ) ) break;
     144             : 
     145             :     /* Switch lanes that have hit the end of their in-place bulk
     146             :        processing to their out-of-place scratch tail regions as
     147             :        necessary. */
     148             : 
     149   229938286 :     W_lo = wv_if( wv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
     150   229938286 :     W_hi = wv_if( wv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );
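                     :     /* A lane reaches block_rem==tail_rem exactly when all the full
                     :        blocks of its original message have been consumed, at which point
                     :        its read pointer flips from the caller's data to the scratch
                     :        tail. */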
     151             : 
     152             :     /* At this point, we have at least 1 block in this message segment
     153             :        pass that has not been processed.  Load the next 64 bytes of
     154             :        each unprocessed block.  Inactive lanes (e.g. message segments
     155             :        in this pass for which we've already processed all the blocks)
     156             :        will load garbage from a sentinel location (and the result of
     157             :        the state computations for the inactive lane will be ignored). */
     158             : 
     159   229938286 :     wv_t W03 = wv_if( active_lane_lo, W_lo, W_sentinel );
     160   229938286 :     uchar const * W0 = (uchar const *)wv_extract( W03, 0 );
     161   229938286 :     uchar const * W1 = (uchar const *)wv_extract( W03, 1 );
     162   229938286 :     uchar const * W2 = (uchar const *)wv_extract( W03, 2 );
     163   229938286 :     uchar const * W3 = (uchar const *)wv_extract( W03, 3 );
     164             : 
     165   229938286 :     wv_t W47 = wv_if( active_lane_hi, W_hi, W_sentinel );
     166   229938286 :     uchar const * W4 = (uchar const *)wv_extract( W47, 0 );
     167   229938286 :     uchar const * W5 = (uchar const *)wv_extract( W47, 1 );
     168   229938286 :     uchar const * W6 = (uchar const *)wv_extract( W47, 2 );
     169   229938286 :     uchar const * W7 = (uchar const *)wv_extract( W47, 3 );
     170             : 
     171   229938286 :     wu_t x0; wu_t x1; wu_t x2; wu_t x3; wu_t x4; wu_t x5; wu_t x6; wu_t x7;
     172   229938286 :     wu_transpose_8x8( wu_bswap( wu_ldu(W0   ) ), wu_bswap( wu_ldu(W1   ) ), wu_bswap( wu_ldu(W2   ) ), wu_bswap( wu_ldu(W3   ) ),
     173   229938286 :                       wu_bswap( wu_ldu(W4   ) ), wu_bswap( wu_ldu(W5   ) ), wu_bswap( wu_ldu(W6   ) ), wu_bswap( wu_ldu(W7   ) ),
     174   229938286 :                       x0, x1, x2, x3, x4, x5, x6, x7 );
     175             : 
     176   229938286 :     wu_t x8; wu_t x9; wu_t xa; wu_t xb; wu_t xc; wu_t xd; wu_t xe; wu_t xf;
     177   229938286 :     wu_transpose_8x8( wu_bswap( wu_ldu(W0+32) ), wu_bswap( wu_ldu(W1+32) ), wu_bswap( wu_ldu(W2+32) ), wu_bswap( wu_ldu(W3+32) ),
     178   229938286 :                       wu_bswap( wu_ldu(W4+32) ), wu_bswap( wu_ldu(W5+32) ), wu_bswap( wu_ldu(W6+32) ), wu_bswap( wu_ldu(W7+32) ),
     179   229938286 :                       x8, x9, xa, xb, xc, xd, xe, xf );
     180             : 
     181             :     /* Compute the SHA-256 state updates */
     182             : 
     183   229938286 :     wu_t a = s0; wu_t b = s1; wu_t c = s2; wu_t d = s3; wu_t e = s4; wu_t f = s5; wu_t g = s6; wu_t h = s7;
     184             : 
     185   229938286 :     static uint const K[64] = { /* FIXME: Reuse with other functions */
     186   229938286 :       0x428a2f98U, 0x71374491U, 0xb5c0fbcfU, 0xe9b5dba5U, 0x3956c25bU, 0x59f111f1U, 0x923f82a4U, 0xab1c5ed5U,
     187   229938286 :       0xd807aa98U, 0x12835b01U, 0x243185beU, 0x550c7dc3U, 0x72be5d74U, 0x80deb1feU, 0x9bdc06a7U, 0xc19bf174U,
     188   229938286 :       0xe49b69c1U, 0xefbe4786U, 0x0fc19dc6U, 0x240ca1ccU, 0x2de92c6fU, 0x4a7484aaU, 0x5cb0a9dcU, 0x76f988daU,
     189   229938286 :       0x983e5152U, 0xa831c66dU, 0xb00327c8U, 0xbf597fc7U, 0xc6e00bf3U, 0xd5a79147U, 0x06ca6351U, 0x14292967U,
     190   229938286 :       0x27b70a85U, 0x2e1b2138U, 0x4d2c6dfcU, 0x53380d13U, 0x650a7354U, 0x766a0abbU, 0x81c2c92eU, 0x92722c85U,
     191   229938286 :       0xa2bfe8a1U, 0xa81a664bU, 0xc24b8b70U, 0xc76c51a3U, 0xd192e819U, 0xd6990624U, 0xf40e3585U, 0x106aa070U,
     192   229938286 :       0x19a4c116U, 0x1e376c08U, 0x2748774cU, 0x34b0bcb5U, 0x391c0cb3U, 0x4ed8aa4aU, 0x5b9cca4fU, 0x682e6ff3U,
     193   229938286 :       0x748f82eeU, 0x78a5636fU, 0x84c87814U, 0x8cc70208U, 0x90befffaU, 0xa4506cebU, 0xbef9a3f7U, 0xc67178f2U,
     194   229938286 :     };
     195             : 
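                     :     /* Sigma0/Sigma1/sigma0/sigma1/Ch/Maj are the FIPS 180-4 SHA-256
                     :        round functions; the spec's right rotates ROTR n are expressed as
                     :        left rotates by 32-n (e.g. Sigma0's ROTR 2,13,22 become
                     :        rol 30,19,10). */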
     196   229938286 : #   define Sigma0(x)  wu_xor( wu_rol(x,30), wu_xor( wu_rol(x,19), wu_rol(x,10) ) )
     197   229938286 : #   define Sigma1(x)  wu_xor( wu_rol(x,26), wu_xor( wu_rol(x,21), wu_rol(x, 7) ) )
     198   229938286 : #   define sigma0(x)  wu_xor( wu_rol(x,25), wu_xor( wu_rol(x,14), wu_shr(x, 3) ) )
     199   229938286 : #   define sigma1(x)  wu_xor( wu_rol(x,15), wu_xor( wu_rol(x,13), wu_shr(x,10) ) )
     200   229938286 : #   define Ch(x,y,z)  wu_xor( wu_and(x,y), wu_andnot(x,z) )
     201   229938286 : #   define Maj(x,y,z) wu_xor( wu_and(x,y), wu_xor( wu_and(x,z), wu_and(y,z) ) )
     202   229938286 : #   define SHA_CORE(xi,ki)                                                       \
     203 14716050304 :     T1 = wu_add( wu_add(xi,ki), wu_add( wu_add( h, Sigma1(e) ), Ch(e, f, g) ) ); \
     204 14716050304 :     T2 = wu_add( Sigma0(a), Maj(a, b, c) );                                      \
     205 14716050304 :     h = g;                                                                       \
     206 14716050304 :     g = f;                                                                       \
     207 14716050304 :     f = e;                                                                       \
     208 14716050304 :     e = wu_add( d, T1 );                                                         \
     209 14716050304 :     d = c;                                                                       \
     210 14716050304 :     c = b;                                                                       \
     211 14716050304 :     b = a;                                                                       \
     212 14716050304 :     a = wu_add( T1, T2 )
     213             : 
     214   229938286 :     wu_t T1;
     215   229938286 :     wu_t T2;
     216             : 
     217   229938286 :     SHA_CORE( x0, wu_bcast( K[ 0] ) );
     218   229938286 :     SHA_CORE( x1, wu_bcast( K[ 1] ) );
     219   229938286 :     SHA_CORE( x2, wu_bcast( K[ 2] ) );
     220   229938286 :     SHA_CORE( x3, wu_bcast( K[ 3] ) );
     221   229938286 :     SHA_CORE( x4, wu_bcast( K[ 4] ) );
     222   229938286 :     SHA_CORE( x5, wu_bcast( K[ 5] ) );
     223   229938286 :     SHA_CORE( x6, wu_bcast( K[ 6] ) );
     224   229938286 :     SHA_CORE( x7, wu_bcast( K[ 7] ) );
     225   229938286 :     SHA_CORE( x8, wu_bcast( K[ 8] ) );
     226   229938286 :     SHA_CORE( x9, wu_bcast( K[ 9] ) );
     227   229938286 :     SHA_CORE( xa, wu_bcast( K[10] ) );
     228   229938286 :     SHA_CORE( xb, wu_bcast( K[11] ) );
     229   229938286 :     SHA_CORE( xc, wu_bcast( K[12] ) );
     230   229938286 :     SHA_CORE( xd, wu_bcast( K[13] ) );
     231   229938286 :     SHA_CORE( xe, wu_bcast( K[14] ) );
     232   229938286 :     SHA_CORE( xf, wu_bcast( K[15] ) );
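                     :     /* Rounds 16..63: extend the message schedule on the fly per
                     :        FIPS 180-4, W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) +
                     :        W[t-16], recycling x0..xf as a 16 entry ring buffer. */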
     233   919753144 :     for( ulong i=16UL; i<64UL; i+=16UL ) {
     234   689814858 :       x0 = wu_add( wu_add( x0, sigma0(x1) ), wu_add( sigma1(xe), x9 ) ); SHA_CORE( x0, wu_bcast( K[i     ] ) );
     235   689814858 :       x1 = wu_add( wu_add( x1, sigma0(x2) ), wu_add( sigma1(xf), xa ) ); SHA_CORE( x1, wu_bcast( K[i+ 1UL] ) );
     236   689814858 :       x2 = wu_add( wu_add( x2, sigma0(x3) ), wu_add( sigma1(x0), xb ) ); SHA_CORE( x2, wu_bcast( K[i+ 2UL] ) );
     237   689814858 :       x3 = wu_add( wu_add( x3, sigma0(x4) ), wu_add( sigma1(x1), xc ) ); SHA_CORE( x3, wu_bcast( K[i+ 3UL] ) );
     238   689814858 :       x4 = wu_add( wu_add( x4, sigma0(x5) ), wu_add( sigma1(x2), xd ) ); SHA_CORE( x4, wu_bcast( K[i+ 4UL] ) );
     239   689814858 :       x5 = wu_add( wu_add( x5, sigma0(x6) ), wu_add( sigma1(x3), xe ) ); SHA_CORE( x5, wu_bcast( K[i+ 5UL] ) );
     240   689814858 :       x6 = wu_add( wu_add( x6, sigma0(x7) ), wu_add( sigma1(x4), xf ) ); SHA_CORE( x6, wu_bcast( K[i+ 6UL] ) );
     241   689814858 :       x7 = wu_add( wu_add( x7, sigma0(x8) ), wu_add( sigma1(x5), x0 ) ); SHA_CORE( x7, wu_bcast( K[i+ 7UL] ) );
     242   689814858 :       x8 = wu_add( wu_add( x8, sigma0(x9) ), wu_add( sigma1(x6), x1 ) ); SHA_CORE( x8, wu_bcast( K[i+ 8UL] ) );
     243   689814858 :       x9 = wu_add( wu_add( x9, sigma0(xa) ), wu_add( sigma1(x7), x2 ) ); SHA_CORE( x9, wu_bcast( K[i+ 9UL] ) );
     244   689814858 :       xa = wu_add( wu_add( xa, sigma0(xb) ), wu_add( sigma1(x8), x3 ) ); SHA_CORE( xa, wu_bcast( K[i+10UL] ) );
     245   689814858 :       xb = wu_add( wu_add( xb, sigma0(xc) ), wu_add( sigma1(x9), x4 ) ); SHA_CORE( xb, wu_bcast( K[i+11UL] ) );
     246   689814858 :       xc = wu_add( wu_add( xc, sigma0(xd) ), wu_add( sigma1(xa), x5 ) ); SHA_CORE( xc, wu_bcast( K[i+12UL] ) );
     247   689814858 :       xd = wu_add( wu_add( xd, sigma0(xe) ), wu_add( sigma1(xb), x6 ) ); SHA_CORE( xd, wu_bcast( K[i+13UL] ) );
     248   689814858 :       xe = wu_add( wu_add( xe, sigma0(xf) ), wu_add( sigma1(xc), x7 ) ); SHA_CORE( xe, wu_bcast( K[i+14UL] ) );
     249   689814858 :       xf = wu_add( wu_add( xf, sigma0(x0) ), wu_add( sigma1(xd), x8 ) ); SHA_CORE( xf, wu_bcast( K[i+15UL] ) );
     250   689814858 :     }
     251             : 
     252   229938286 : #   undef SHA_CORE
     253   229938286 : #   undef Sigma0
     254   229938286 : #   undef Sigma1
     255   229938286 : #   undef sigma0
     256   229938286 : #   undef sigma1
     257   229938286 : #   undef Ch
     258   229938286 : #   undef Maj
     259             : 
     260             :     /* Apply the state updates to the active lanes */
     261             : 
     262   229938286 :     wc_t active_lane = wc_narrow( active_lane_lo, active_lane_hi );
     263   229938286 :     s0 = wu_add( s0, wu_notczero( active_lane, a ) );
     264   229938286 :     s1 = wu_add( s1, wu_notczero( active_lane, b ) );
     265   229938286 :     s2 = wu_add( s2, wu_notczero( active_lane, c ) );
     266   229938286 :     s3 = wu_add( s3, wu_notczero( active_lane, d ) );
     267   229938286 :     s4 = wu_add( s4, wu_notczero( active_lane, e ) );
     268   229938286 :     s5 = wu_add( s5, wu_notczero( active_lane, f ) );
     269   229938286 :     s6 = wu_add( s6, wu_notczero( active_lane, g ) );
     270   229938286 :     s7 = wu_add( s7, wu_notczero( active_lane, h ) );
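                     :     /* wu_notczero zeroes the per-block deltas of inactive lanes, so a
                     :        lane's running state is carried forward unchanged once its message
                     :        has been fully processed. */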
     271             : 
     272             :     /* Advance to the next message segment blocks.  In pseudo code,
     273             :        the below is:
     274             : 
     275             :          W += 64; if( block_rem ) block_rem--;
     276             : 
     277             :        Since wc_to_wv_raw(false/true) is 0UL/~0UL, we can use wv_add /
     278             :        wc_to_wv_raw instead of wv_sub / wc_to_wv to save some ops.
     279             :        (Consider conditional increment / decrement operations?)
     280             : 
     281             :        Also since we do not load anything at W(lane) above unless
     282             :        block_rem(lane) is non-zero, we can omit vector conditional
     283             :        operations for W(lane) below to save some additional ops. */
     284             : 
     285   229938286 :     W_lo = wv_add( W_lo, wv_64 );
     286   229938286 :     W_hi = wv_add( W_hi, wv_64 );
     287             : 
     288   229938286 :     block_rem_lo = wv_add( block_rem_lo, wc_to_wv_raw( active_lane_lo ) );
     289   229938286 :     block_rem_hi = wv_add( block_rem_hi, wc_to_wv_raw( active_lane_hi ) );
     290   229938286 :   }
     291             : 
     292             :   /* Store the results.  FIXME: Probably could optimize the transpose
     293             :      further by taking into account needed stores (and then maybe go
     294             :      direct into memory ... would need a family of such transposed
     295             :      stores). */
     296             : 
     297    16267932 :   wu_transpose_8x8( s0,s1,s2,s3,s4,s5,s6,s7, s0,s1,s2,s3,s4,s5,s6,s7 );
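                     :   /* After the transpose, s0..s7 hold the full 8 word states of
                     :      messages 0..7 respectively.  The fallthrough switch below stores
                     :      only the batch_cnt hashes that were requested, byte swapping back
                     :      to big endian on the way out. */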
     298             : 
     299    16267932 :   uint * const * batch_hash = (uint * const *)_batch_hash;
      300    16267932 :   switch( batch_cnt ) { /* application dependent case probabilities */
     301    14244050 :   case 8UL: wu_stu( batch_hash[7], wu_bswap( s7 ) ); __attribute__((fallthrough));
     302    14555700 :   case 7UL: wu_stu( batch_hash[6], wu_bswap( s6 ) ); __attribute__((fallthrough));
     303    14904828 :   case 6UL: wu_stu( batch_hash[5], wu_bswap( s5 ) ); __attribute__((fallthrough));
     304    15262964 :   case 5UL: wu_stu( batch_hash[4], wu_bswap( s4 ) ); __attribute__((fallthrough));
     305    15612846 :   case 4UL: wu_stu( batch_hash[3], wu_bswap( s3 ) ); __attribute__((fallthrough));
     306    15926872 :   case 3UL: wu_stu( batch_hash[2], wu_bswap( s2 ) ); __attribute__((fallthrough));
     307    16267932 :   case 2UL: wu_stu( batch_hash[1], wu_bswap( s1 ) ); __attribute__((fallthrough));
     308    16267932 :   case 1UL: wu_stu( batch_hash[0], wu_bswap( s0 ) ); __attribute__((fallthrough));
     309    16267932 :   default: break;
     310    16267932 :   }
     311    16267932 : }

Generated by: LCOV version 1.14