/* ballet/sha256/fd_sha256_batch_avx512.c
   (coverage snapshot 2024-11-13, cov.lcov: 196/196 lines, 1/1 functions hit) */

#define FD_SHA256_BATCH_IMPL 2

#include "fd_sha256.h"
#include "../../util/simd/fd_avx512.h"
#include "../../util/simd/fd_avx.h"

FD_STATIC_ASSERT( FD_SHA256_BATCH_MAX==16UL, compat );

void
fd_sha256_private_batch_avx( ulong          batch_cnt,
                             void const *   batch_data,
                             ulong const *  batch_sz,
                             void * const * batch_hash );

void
fd_sha256_private_batch_avx512( ulong          batch_cnt,
                                void const *   _batch_data,
                                ulong const *  batch_sz,
                                void * const * _batch_hash ) {

  /* If the batch is small enough, it is more efficient to use the
     narrower batched implementation.  The threshold for falling back
     depends on whether that narrower implementation is itself using
     SHA-NI acceleration for really small batches. */

# if FD_HAS_SHANI
# define MIN_BATCH_CNT (5UL)
# else
# define MIN_BATCH_CNT (2UL)
# endif

  if( FD_UNLIKELY( batch_cnt<MIN_BATCH_CNT ) ) {
    fd_sha256_private_batch_avx( batch_cnt, _batch_data, batch_sz, _batch_hash );
    return;
  }

# undef MIN_BATCH_CNT

  /* SHA appends to the end of each message 9 bytes of additional data
     (a message terminator byte and the big endian ulong with the
     message size in bits) plus enough zero padding to make the message
     an integer number of blocks long.  We compute the 1 or 2 tail
     blocks of each message here.  We then process complete blocks of
     the original messages in place, switching to processing these tail
     blocks in the same pass toward the end.  TODO: This code could
     probably be SIMD optimized slightly more (this is where all the
     parts of SHA that were designed with little regard for performance
     live, so it is just inherently gross).  The main optimization would
     probably be to allow tail reading to use a faster memcpy and then
     maybe some vectorization of the bswap. */
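
  /* For example (a sketch of the sizing arithmetic below, assuming
     FD_SHA256_PRIVATE_BUF_MAX is the usual 64 byte SHA-256 block size):

       sz = 100: tail_data_off =  64, tail_data_sz = 36, 36+9 =  45 -> tail_sz =  64 (1 tail block)
       sz = 120: tail_data_off =  64, tail_data_sz = 56, 56+9 =  65 -> tail_sz = 128 (2 tail blocks)
       sz = 128: tail_data_off = 128, tail_data_sz =  0,  0+9 =   9 -> tail_sz =  64 (1 tail block)

     So every message contributes at most 2 blocks of tail scratch,
     which is why the scratch region below is sized
     FD_SHA256_BATCH_MAX*2UL*FD_SHA256_PRIVATE_BUF_MAX. */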

  ulong const * batch_data = (ulong const *)_batch_data;

  ulong batch_tail_data[ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));
  ulong batch_tail_rem [ FD_SHA256_BATCH_MAX ] __attribute__((aligned(64)));

  uchar scratch[ FD_SHA256_BATCH_MAX*2UL*FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
  do {
    ulong scratch_free = (ulong)scratch;

    wwv_t zero = wwv_zero();

    for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {

      /* Allocate the tail blocks for this message */

      ulong data = batch_data[ batch_idx ];
      ulong sz   = batch_sz  [ batch_idx ];

      ulong tail_data     = scratch_free;
      ulong tail_data_sz  = sz & (FD_SHA256_PRIVATE_BUF_MAX-1UL);
      ulong tail_data_off = fd_ulong_align_dn( sz,               FD_SHA256_PRIVATE_BUF_MAX );
      ulong tail_sz       = fd_ulong_align_up( tail_data_sz+9UL, FD_SHA256_PRIVATE_BUF_MAX );

      batch_tail_data[ batch_idx ] = tail_data;
      batch_tail_rem [ batch_idx ] = tail_sz >> FD_SHA256_PRIVATE_LG_BUF_MAX;

      scratch_free += tail_sz;

      /* Populate the tail blocks.  We first clear the blocks (note that
         it is okay to clobber bytes 64:127 if tail_sz is only 64, saving
         a nasty branch).  Then we copy any straggler data bytes into the
         tail, terminate the message, and finally record the size of the
         message in bits at the end as a big endian ulong. */
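
      /* E.g. for the 3 byte message "abc" (a sketch, assuming 64 byte
         blocks), the single tail block ends up as

           61 62 63 80 00 00 ... 00 00 00 00 00 00 00 00 18

         i.e. the message bytes, the 0x80 terminator, zero padding, and
         the big endian bit length 24 = 0x18 in the last 8 bytes. */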

      wwv_st( (ulong *) tail_data,     zero );
      wwv_st( (ulong *)(tail_data+64), zero );

#     if 1
      /* Quick experiments found that, once again, straight memcpy is
         much slower than fd_memcpy, which in turn is slightly slower
         than a site-optimized handrolled memcpy (fd_memcpy would have
         a smaller L1I cache footprint though).  They also found that
         doing the below in a branchless way is slightly worse and that
         an ILP optimized version of the conditional calculation is
         about the same.  They further found that vectorizing the
         overall loop and/or Duffing the vectorized loop did not provide
         noticeable performance improvements under various styles of
         memcpy. */
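
      /* The handrolled copy below moves the straggler bytes in
         descending chunk sizes (32 byte AVX, then 8/4/2/1 byte scalar)
         and then appends the 0x80 message terminator. */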
      ulong src = data + tail_data_off;
      ulong dst = tail_data;
      ulong rem = tail_data_sz;
      while( rem>=32UL ) { wv_st( (ulong *)dst, wv_ldu( (ulong const *)src ) ); dst += 32UL; src += 32UL; rem -= 32UL; }
      while( rem>= 8UL ) { *(ulong  *)dst = FD_LOAD( ulong,  src );             dst +=  8UL; src +=  8UL; rem -=  8UL; }
      if   ( rem>= 4UL ) { *(uint   *)dst = FD_LOAD( uint,   src );             dst +=  4UL; src +=  4UL; rem -=  4UL; }
      if   ( rem>= 2UL ) { *(ushort *)dst = FD_LOAD( ushort, src );             dst +=  2UL; src +=  2UL; rem -=  2UL; }
      if   ( rem       ) { *(uchar  *)dst = FD_LOAD( uchar,  src );             dst++;                                 }
      *(uchar *)dst = (uchar)0x80;
#     else
      fd_memcpy( (void *)tail_data, (void const *)(data + tail_data_off), tail_data_sz );
      *((uchar *)(tail_data+tail_data_sz)) = (uchar)0x80;
#     endif

      *((ulong *)(tail_data+tail_sz-8UL )) = fd_ulong_bswap( sz<<3 );
    }
  } while(0);

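  /* Seed all 16 lanes with the standard SHA-256 initial hash values. */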
  wwu_t s0 = wwu_bcast( 0x6a09e667U );
  wwu_t s1 = wwu_bcast( 0xbb67ae85U );
  wwu_t s2 = wwu_bcast( 0x3c6ef372U );
  wwu_t s3 = wwu_bcast( 0xa54ff53aU );
  wwu_t s4 = wwu_bcast( 0x510e527fU );
  wwu_t s5 = wwu_bcast( 0x9b05688cU );
  wwu_t s6 = wwu_bcast( 0x1f83d9abU );
  wwu_t s7 = wwu_bcast( 0x5be0cd19U );

  wwv_t zero       = wwv_zero();
  wwv_t one        = wwv_one();
  wwv_t wwv_64     = wwv_bcast( FD_SHA256_PRIVATE_BUF_MAX );
  wwv_t W_sentinel = wwv_bcast( (ulong)scratch );

  wwv_t tail_lo      = wwv_ld( batch_tail_data   ); wwv_t tail_hi      = wwv_ld( batch_tail_data+8 );
  wwv_t tail_rem_lo  = wwv_ld( batch_tail_rem    ); wwv_t tail_rem_hi  = wwv_ld( batch_tail_rem +8 );
  wwv_t W_lo         = wwv_ld( batch_data        ); wwv_t W_hi         = wwv_ld( batch_data     +8 );

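  /* Per lane, block_rem counts the 64 byte blocks still to process:
     the whole data blocks of the message plus its 1 or 2 tail blocks.
     Lanes beyond batch_cnt are masked to zero so they start out
     inactive. */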
  wwv_t block_rem_lo = wwv_if( ((1<<batch_cnt)-1) & 0xff,
                               wwv_add( wwv_shr( wwv_ld( batch_sz   ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_lo ), zero );
  wwv_t block_rem_hi = wwv_if( ((1<<batch_cnt)-1) >> 8,
                               wwv_add( wwv_shr( wwv_ld( batch_sz+8 ), FD_SHA256_PRIVATE_LG_BUF_MAX ), tail_rem_hi ), zero );

  for(;;) {
    int active_lane_lo = wwv_ne( block_rem_lo, zero );
    int active_lane_hi = wwv_ne( block_rem_hi, zero );
    if( FD_UNLIKELY( !(active_lane_lo | active_lane_hi) ) ) break;

    /* Switch lanes that have hit the end of their in-place bulk
       processing to their out-of-place scratch tail regions as
       necessary. */

    W_lo = wwv_if( wwv_eq( block_rem_lo, tail_rem_lo ), tail_lo, W_lo );
    W_hi = wwv_if( wwv_eq( block_rem_hi, tail_rem_hi ), tail_hi, W_hi );

    /* At this point, we have at least 1 block in this message segment
       pass that has not been processed.  Load the next 64 bytes of
       each unprocessed block.  Inactive lanes (e.g. message segments
       in this pass for which we've already processed all the blocks)
       will load garbage from a sentinel location (and the result of
       the state computations for the inactive lane will be ignored). */

    ulong _W0; ulong _W1; ulong _W2; ulong _W3; ulong _W4; ulong _W5; ulong _W6; ulong _W7;
    ulong _W8; ulong _W9; ulong _Wa; ulong _Wb; ulong _Wc; ulong _Wd; ulong _We; ulong _Wf;
    wwv_unpack( wwv_if( active_lane_lo, W_lo, W_sentinel ), _W0, _W1, _W2, _W3, _W4, _W5, _W6, _W7 );
    wwv_unpack( wwv_if( active_lane_hi, W_hi, W_sentinel ), _W8, _W9, _Wa, _Wb, _Wc, _Wd, _We, _Wf );
    uchar const * W0 = (uchar const *)_W0; uchar const * W1 = (uchar const *)_W1;
    uchar const * W2 = (uchar const *)_W2; uchar const * W3 = (uchar const *)_W3;
    uchar const * W4 = (uchar const *)_W4; uchar const * W5 = (uchar const *)_W5;
    uchar const * W6 = (uchar const *)_W6; uchar const * W7 = (uchar const *)_W7;
    uchar const * W8 = (uchar const *)_W8; uchar const * W9 = (uchar const *)_W9;
    uchar const * Wa = (uchar const *)_Wa; uchar const * Wb = (uchar const *)_Wb;
    uchar const * Wc = (uchar const *)_Wc; uchar const * Wd = (uchar const *)_Wd;
    uchar const * We = (uchar const *)_We; uchar const * Wf = (uchar const *)_Wf;

    wwu_t x0; wwu_t x1; wwu_t x2; wwu_t x3; wwu_t x4; wwu_t x5; wwu_t x6; wwu_t x7;
    wwu_t x8; wwu_t x9; wwu_t xa; wwu_t xb; wwu_t xc; wwu_t xd; wwu_t xe; wwu_t xf;
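
    /* The loads below fetch the next 64 byte block of each lane; the
       bswap converts the big endian message words to host order and
       the 16x16 transpose regroups them so that x0..xf hold words
       0..15 of the current block, one lane per 32-bit element. */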
    wwu_transpose_16x16( wwu_bswap( wwu_ldu( W0 ) ), wwu_bswap( wwu_ldu( W1 ) ),
                         wwu_bswap( wwu_ldu( W2 ) ), wwu_bswap( wwu_ldu( W3 ) ),
                         wwu_bswap( wwu_ldu( W4 ) ), wwu_bswap( wwu_ldu( W5 ) ),
                         wwu_bswap( wwu_ldu( W6 ) ), wwu_bswap( wwu_ldu( W7 ) ),
                         wwu_bswap( wwu_ldu( W8 ) ), wwu_bswap( wwu_ldu( W9 ) ),
                         wwu_bswap( wwu_ldu( Wa ) ), wwu_bswap( wwu_ldu( Wb ) ),
                         wwu_bswap( wwu_ldu( Wc ) ), wwu_bswap( wwu_ldu( Wd ) ),
                         wwu_bswap( wwu_ldu( We ) ), wwu_bswap( wwu_ldu( Wf ) ),
                         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xa, xb, xc, xd, xe, xf );

    /* Compute the SHA-256 state updates */

    wwu_t a = s0; wwu_t b = s1; wwu_t c = s2; wwu_t d = s3; wwu_t e = s4; wwu_t f = s5; wwu_t g = s6; wwu_t h = s7;

    static uint const K[64] = { /* FIXME: Reuse with other functions */
      0x428a2f98U, 0x71374491U, 0xb5c0fbcfU, 0xe9b5dba5U, 0x3956c25bU, 0x59f111f1U, 0x923f82a4U, 0xab1c5ed5U,
      0xd807aa98U, 0x12835b01U, 0x243185beU, 0x550c7dc3U, 0x72be5d74U, 0x80deb1feU, 0x9bdc06a7U, 0xc19bf174U,
      0xe49b69c1U, 0xefbe4786U, 0x0fc19dc6U, 0x240ca1ccU, 0x2de92c6fU, 0x4a7484aaU, 0x5cb0a9dcU, 0x76f988daU,
      0x983e5152U, 0xa831c66dU, 0xb00327c8U, 0xbf597fc7U, 0xc6e00bf3U, 0xd5a79147U, 0x06ca6351U, 0x14292967U,
      0x27b70a85U, 0x2e1b2138U, 0x4d2c6dfcU, 0x53380d13U, 0x650a7354U, 0x766a0abbU, 0x81c2c92eU, 0x92722c85U,
      0xa2bfe8a1U, 0xa81a664bU, 0xc24b8b70U, 0xc76c51a3U, 0xd192e819U, 0xd6990624U, 0xf40e3585U, 0x106aa070U,
      0x19a4c116U, 0x1e376c08U, 0x2748774cU, 0x34b0bcb5U, 0x391c0cb3U, 0x4ed8aa4aU, 0x5b9cca4fU, 0x682e6ff3U,
      0x748f82eeU, 0x78a5636fU, 0x84c87814U, 0x8cc70208U, 0x90befffaU, 0xa4506cebU, 0xbef9a3f7U, 0xc67178f2U,
    };

#   define Sigma0(x)  wwu_xor( wwu_rol(x,30), wwu_xor( wwu_rol(x,19), wwu_rol(x,10) ) )
#   define Sigma1(x)  wwu_xor( wwu_rol(x,26), wwu_xor( wwu_rol(x,21), wwu_rol(x, 7) ) )
#   define sigma0(x)  wwu_xor( wwu_rol(x,25), wwu_xor( wwu_rol(x,14), wwu_shr(x, 3) ) )
#   define sigma1(x)  wwu_xor( wwu_rol(x,15), wwu_xor( wwu_rol(x,13), wwu_shr(x,10) ) )
#   define Ch(x,y,z)  wwu_xor( wwu_and(x,y), wwu_andnot(x,z) )
#   define Maj(x,y,z) wwu_xor( wwu_and(x,y), wwu_xor( wwu_and(x,z), wwu_and(y,z) ) )
#   define SHA_CORE(xi,ki)                                                           \
    T1 = wwu_add( wwu_add(xi,ki), wwu_add( wwu_add( h, Sigma1(e) ), Ch(e, f, g) ) ); \
    T2 = wwu_add( Sigma0(a), Maj(a, b, c) );                                         \
    h = g;                                                                           \
    g = f;                                                                           \
    f = e;                                                                           \
    e = wwu_add( d, T1 );                                                            \
    d = c;                                                                           \
    c = b;                                                                           \
    b = a;                                                                           \
    a = wwu_add( T1, T2 )
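
    /* Each SHA_CORE invocation is one round of the standard FIPS 180-4
       SHA-256 compression function, applied to all 16 lanes at once.
       The scalar equivalent of one round (a sketch, with plain uint
       state words instead of wwu_t vectors) is:

         T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i];
         T2 = Sigma0(a) + Maj(a,b,c);
         h = g; g = f; f = e; e = d + T1;
         d = c; c = b; b = a; a = T1 + T2; */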

    wwu_t T1;
    wwu_t T2;

    SHA_CORE( x0, wwu_bcast( K[ 0] ) );
    SHA_CORE( x1, wwu_bcast( K[ 1] ) );
    SHA_CORE( x2, wwu_bcast( K[ 2] ) );
    SHA_CORE( x3, wwu_bcast( K[ 3] ) );
    SHA_CORE( x4, wwu_bcast( K[ 4] ) );
    SHA_CORE( x5, wwu_bcast( K[ 5] ) );
    SHA_CORE( x6, wwu_bcast( K[ 6] ) );
    SHA_CORE( x7, wwu_bcast( K[ 7] ) );
    SHA_CORE( x8, wwu_bcast( K[ 8] ) );
    SHA_CORE( x9, wwu_bcast( K[ 9] ) );
    SHA_CORE( xa, wwu_bcast( K[10] ) );
    SHA_CORE( xb, wwu_bcast( K[11] ) );
    SHA_CORE( xc, wwu_bcast( K[12] ) );
    SHA_CORE( xd, wwu_bcast( K[13] ) );
    SHA_CORE( xe, wwu_bcast( K[14] ) );
    SHA_CORE( xf, wwu_bcast( K[15] ) );
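
    /* Rounds 16..63 expand the message schedule in place: the 16 xi
       vectors act as a rotating window over W, so each update below is
       the standard W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) +
       W[t-16] recurrence, computed for all 16 lanes per statement. */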
    for( ulong i=16UL; i<64UL; i+=16UL ) {
      x0 = wwu_add( wwu_add( x0, sigma0(x1) ), wwu_add( sigma1(xe), x9 ) ); SHA_CORE( x0, wwu_bcast( K[i     ] ) );
      x1 = wwu_add( wwu_add( x1, sigma0(x2) ), wwu_add( sigma1(xf), xa ) ); SHA_CORE( x1, wwu_bcast( K[i+ 1UL] ) );
      x2 = wwu_add( wwu_add( x2, sigma0(x3) ), wwu_add( sigma1(x0), xb ) ); SHA_CORE( x2, wwu_bcast( K[i+ 2UL] ) );
      x3 = wwu_add( wwu_add( x3, sigma0(x4) ), wwu_add( sigma1(x1), xc ) ); SHA_CORE( x3, wwu_bcast( K[i+ 3UL] ) );
      x4 = wwu_add( wwu_add( x4, sigma0(x5) ), wwu_add( sigma1(x2), xd ) ); SHA_CORE( x4, wwu_bcast( K[i+ 4UL] ) );
      x5 = wwu_add( wwu_add( x5, sigma0(x6) ), wwu_add( sigma1(x3), xe ) ); SHA_CORE( x5, wwu_bcast( K[i+ 5UL] ) );
      x6 = wwu_add( wwu_add( x6, sigma0(x7) ), wwu_add( sigma1(x4), xf ) ); SHA_CORE( x6, wwu_bcast( K[i+ 6UL] ) );
      x7 = wwu_add( wwu_add( x7, sigma0(x8) ), wwu_add( sigma1(x5), x0 ) ); SHA_CORE( x7, wwu_bcast( K[i+ 7UL] ) );
      x8 = wwu_add( wwu_add( x8, sigma0(x9) ), wwu_add( sigma1(x6), x1 ) ); SHA_CORE( x8, wwu_bcast( K[i+ 8UL] ) );
      x9 = wwu_add( wwu_add( x9, sigma0(xa) ), wwu_add( sigma1(x7), x2 ) ); SHA_CORE( x9, wwu_bcast( K[i+ 9UL] ) );
      xa = wwu_add( wwu_add( xa, sigma0(xb) ), wwu_add( sigma1(x8), x3 ) ); SHA_CORE( xa, wwu_bcast( K[i+10UL] ) );
      xb = wwu_add( wwu_add( xb, sigma0(xc) ), wwu_add( sigma1(x9), x4 ) ); SHA_CORE( xb, wwu_bcast( K[i+11UL] ) );
      xc = wwu_add( wwu_add( xc, sigma0(xd) ), wwu_add( sigma1(xa), x5 ) ); SHA_CORE( xc, wwu_bcast( K[i+12UL] ) );
      xd = wwu_add( wwu_add( xd, sigma0(xe) ), wwu_add( sigma1(xb), x6 ) ); SHA_CORE( xd, wwu_bcast( K[i+13UL] ) );
      xe = wwu_add( wwu_add( xe, sigma0(xf) ), wwu_add( sigma1(xc), x7 ) ); SHA_CORE( xe, wwu_bcast( K[i+14UL] ) );
      xf = wwu_add( wwu_add( xf, sigma0(x0) ), wwu_add( sigma1(xd), x8 ) ); SHA_CORE( xf, wwu_bcast( K[i+15UL] ) );
    }

#   undef SHA_CORE
#   undef Sigma0
#   undef Sigma1
#   undef sigma0
#   undef sigma1
#   undef Ch
#   undef Maj

    /* Apply the state updates to the active lanes */

    int active_lane = active_lane_lo | (active_lane_hi<<8);

    s0 = wwu_add_if( active_lane, s0, a, s0 );
    s1 = wwu_add_if( active_lane, s1, b, s1 );
    s2 = wwu_add_if( active_lane, s2, c, s2 );
    s3 = wwu_add_if( active_lane, s3, d, s3 );
    s4 = wwu_add_if( active_lane, s4, e, s4 );
    s5 = wwu_add_if( active_lane, s5, f, s5 );
    s6 = wwu_add_if( active_lane, s6, g, s6 );
    s7 = wwu_add_if( active_lane, s7, h, s7 );

    /* Advance to the next message segment blocks.  In pseudo code,
       the below is:

         W += 64; if( block_rem ) block_rem--;

       Since we do not load anything at W(lane) above unless
       block_rem(lane) is non-zero, we can omit vector conditional
       operations for W(lane) below. */

    W_lo = wwv_add( W_lo, wwv_64 );
    W_hi = wwv_add( W_hi, wwv_64 );

    block_rem_lo = wwv_sub_if( active_lane_lo, block_rem_lo, one, block_rem_lo );
    block_rem_hi = wwv_sub_if( active_lane_hi, block_rem_hi, one, block_rem_hi );
  }

  /* Store the results.  FIXME: Probably could optimize the transpose
     further by taking into account needed stores (and then maybe go
     direct into memory ... would need a family of such transposed
     stores). */

  wwu_transpose_2x8x8( wwu_bswap(s0), wwu_bswap(s1), wwu_bswap(s2), wwu_bswap(s3),
                       wwu_bswap(s4), wwu_bswap(s5), wwu_bswap(s6), wwu_bswap(s7), s0,s1,s2,s3,s4,s5,s6,s7 );

  uint * const * batch_hash = (uint * const *)_batch_hash;
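
  /* After the 2x8x8 transpose, the byte swapped hash of lane i lives in
     the low 256 bits of s(i) for lanes 0-7 and in the high 256 bits of
     s(i-8) for lanes 8-15, hence the _mm512_extracti32x8_epi32 index
     used below. */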
  switch( batch_cnt ) { /* application dependent prob */
  case 16UL: wu_stu( batch_hash[15], _mm512_extracti32x8_epi32( s7, 1 ) ); __attribute__((fallthrough));
  case 15UL: wu_stu( batch_hash[14], _mm512_extracti32x8_epi32( s6, 1 ) ); __attribute__((fallthrough));
  case 14UL: wu_stu( batch_hash[13], _mm512_extracti32x8_epi32( s5, 1 ) ); __attribute__((fallthrough));
  case 13UL: wu_stu( batch_hash[12], _mm512_extracti32x8_epi32( s4, 1 ) ); __attribute__((fallthrough));
  case 12UL: wu_stu( batch_hash[11], _mm512_extracti32x8_epi32( s3, 1 ) ); __attribute__((fallthrough));
  case 11UL: wu_stu( batch_hash[10], _mm512_extracti32x8_epi32( s2, 1 ) ); __attribute__((fallthrough));
  case 10UL: wu_stu( batch_hash[ 9], _mm512_extracti32x8_epi32( s1, 1 ) ); __attribute__((fallthrough));
  case  9UL: wu_stu( batch_hash[ 8], _mm512_extracti32x8_epi32( s0, 1 ) ); __attribute__((fallthrough));
  case  8UL: wu_stu( batch_hash[ 7], _mm512_extracti32x8_epi32( s7, 0 ) ); __attribute__((fallthrough));
  case  7UL: wu_stu( batch_hash[ 6], _mm512_extracti32x8_epi32( s6, 0 ) ); __attribute__((fallthrough));
  case  6UL: wu_stu( batch_hash[ 5], _mm512_extracti32x8_epi32( s5, 0 ) ); __attribute__((fallthrough));
  case  5UL: wu_stu( batch_hash[ 4], _mm512_extracti32x8_epi32( s4, 0 ) ); __attribute__((fallthrough));
  case  4UL: wu_stu( batch_hash[ 3], _mm512_extracti32x8_epi32( s3, 0 ) ); __attribute__((fallthrough));
  case  3UL: wu_stu( batch_hash[ 2], _mm512_extracti32x8_epi32( s2, 0 ) ); __attribute__((fallthrough));
  case  2UL: wu_stu( batch_hash[ 1], _mm512_extracti32x8_epi32( s1, 0 ) ); __attribute__((fallthrough));
  case  1UL: wu_stu( batch_hash[ 0], _mm512_extracti32x8_epi32( s0, 0 ) ); __attribute__((fallthrough));
  default: break;
  }
}
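
/* Minimal usage sketch (hypothetical, not part of the original file):
   hash a full batch of 16 messages in one call.  In practice this
   private entry point is normally reached via the public fd_sha256
   batch API rather than being called directly.

     uchar const * msg    [ 16 ];        // message pointers (filled in by the caller)
     ulong         msg_sz [ 16 ];        // message byte sizes
     uchar         out    [ 16 ][ 32 ];  // 32 byte SHA-256 outputs
     void *        out_ptr[ 16 ];
     for( ulong i=0UL; i<16UL; i++ ) out_ptr[ i ] = out[ i ];
     fd_sha256_private_batch_avx512( 16UL, msg, msg_sz, out_ptr );
*/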
