LCOV - code coverage report
Current view: top level - ballet/aes - fd_aes_gcm_ref_ghash.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 104 108 96.3 %
Date: 2025-01-08 12:08:44 Functions: 3 3 100.0 %

          Line data    Source code
       1             : /* fd_aes_gcm_ref.c was imported from the OpenSSL project circa 2023-Aug.
       2             :    Original source file:  crypto/modes/gcm128.c */
       3             : 
       4             : #include "fd_aes_gcm.h"
       5             : 
       6             : /*
       7             :  * Copyright 2010-2022 The OpenSSL Project Authors. All Rights Reserved.
       8             :  *
       9             :  * Licensed under the Apache License 2.0 (the "License").  You may not use
      10             :  * this file except in compliance with the License.  You can obtain a copy
      11             :  * in the file LICENSE in the source distribution or at
      12             :  * https://www.openssl.org/source/license.html
      13             :  */
      14             : 
      15             : /*
      16             :  * NOTE: TABLE_BITS and all non-4bit implementations have been removed in 3.1.
      17             :  *
      18             :  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
      19             :  * never be set to 8. 8 is effectively reserved for testing purposes.
      20             :  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
      21             :  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
      22             :  * whole spectrum of possible table driven implementations. Why? In
      23             :  * non-"Shoup's" case memory access pattern is segmented in such manner,
      24             :  * that it's trivial to see that cache timing information can reveal
      25             :  * fair portion of intermediate hash value. Given that ciphertext is
      26             :  * always available to attacker, it's possible for him to attempt to
      27             :  * deduce secret parameter H and if successful, tamper with messages
      28             :  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
      29             :  * not as trivial, but there is no reason to believe that it's resistant
      30             :  * to cache-timing attack. And the thing about "8-bit" implementation is
      31             :  * that it consumes 16 (sixteen) times more memory, 4KB per individual
      32             :  * key + 1KB shared. Well, on pros side it should be twice as fast as
      33             :  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
      34             :  * was observed to run ~75% faster, closer to 100% for commercial
      35             :  * compilers... Yet "4-bit" procedure is preferred, because it's
      36             :  * believed to provide better security-performance balance and adequate
      37             :  * all-round performance. "All-round" refers to things like:
      38             :  *
      39             :  * - shorter setup time effectively improves overall timing for
      40             :  *   handling short messages;
      41             :  * - larger table allocation can become unbearable because of VM
      42             :  *   subsystem penalties (for example on Windows large enough free
      43             :  *   results in VM working set trimming, meaning that consequent
      44             :  *   malloc would immediately incur working set expansion);
      45             :  * - larger table has larger cache footprint, which can affect
      46             :  *   performance of other code paths (not necessarily even from same
      47             :  *   thread in Hyper-Threading world);
      48             :  *
      49             :  * Value of 1 is not appropriate for performance reasons.
      50             :  */
      51             : 
      52             : #define REDUCE1BIT(V)                                \
      53    24741558 :   do {                                               \
      54    24741558 :     if (sizeof(ulong)==8) {                         \
      55    24741558 :       ulong T = 0xe100000000000000UL & (0-(V.lo&1)); \
      56    24741558 :       V.lo  = (V.hi<<63)|(V.lo>>1);                  \
      57    24741558 :       V.hi  = (V.hi>>1 )^T;                          \
      58    24741558 :     }                                                \
      59    24741558 :     else {                                           \
      60           0 :       ulong T = 0xe1000000U & (0-(uint)(V.lo&1));    \
      61           0 :       V.lo  = (V.hi<<63)|(V.lo>>1);                  \
      62           0 :       V.hi  = (V.hi>>1 )^((ulong)T<<32);             \
      63           0 :     }                                                \
      64    24741558 :   } while(0)
      65             : 
      66             : void
      67             : fd_gcm_init_4bit( fd_gcm128_t Htable[16],
      68             :                   ulong const H[2] )
      69     8247186 : {
      70     8247186 :   fd_gcm128_t V;
      71             : 
      72     8247186 :   Htable[0].hi = 0;
      73     8247186 :   Htable[0].lo = 0;
      74     8247186 :   V.hi = H[0];
      75     8247186 :   V.lo = H[1];
      76             : 
      77     8247186 :   Htable[8] = V;
      78     8247186 :   REDUCE1BIT(V);
      79     8247186 :   Htable[4] = V;
      80     8247186 :   REDUCE1BIT(V);
      81     8247186 :   Htable[2] = V;
      82     8247186 :   REDUCE1BIT(V);
      83     8247186 :   Htable[1] = V;
      84     8247186 :   Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
      85     8247186 :   V = Htable[4];
      86     8247186 :   Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
      87     8247186 :   Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
      88     8247186 :   Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
      89     8247186 :   V = Htable[8];
      90     8247186 :   Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
      91     8247186 :   Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
      92     8247186 :   Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
      93     8247186 :   Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
      94     8247186 :   Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
      95     8247186 :   Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
      96     8247186 :   Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
      97     8247186 : }
      98             : 
      99             : #define PACK(s) ((ulong)(s)<<(sizeof(ulong)*8-16))
     100             : 
     101             : static const ulong rem_4bit[16] = {
     102             :   PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
     103             :   PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
     104             :   PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
     105             :   PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
     106             : };
     107             : 
     108             : void
     109             : fd_gcm_gmult_4bit( ulong             Xi[2],
     110     4001670 :                    fd_gcm128_t const Htable[16]) {
     111             : 
     112     4001670 :   fd_gcm128_t Z;
     113     4001670 :   int cnt = 15;
     114     4001670 :   ulong rem, nlo, nhi;
     115             : 
     116     4001670 :   nlo = ((uchar const *)Xi)[15];
     117     4001670 :   nhi = nlo >> 4;
     118     4001670 :   nlo &= 0xf;
     119             : 
     120     4001670 :   Z.hi = Htable[nlo].hi;
     121     4001670 :   Z.lo = Htable[nlo].lo;
     122             : 
     123    64026720 :   while (1) {
     124    64026720 :     rem = (ulong)Z.lo & 0xf;
     125    64026720 :     Z.lo = (Z.hi << 60) | (Z.lo >> 4);
     126    64026720 :     Z.hi = (Z.hi >> 4);
     127    64026720 :     Z.hi ^= rem_4bit[rem];
     128             : 
     129    64026720 :     Z.hi ^= Htable[nhi].hi;
     130    64026720 :     Z.lo ^= Htable[nhi].lo;
     131             : 
     132    64026720 :     if (--cnt < 0)
     133     4001670 :       break;
     134             : 
     135    60025050 :     nlo = ((uchar const *)Xi)[cnt];
     136    60025050 :     nhi = nlo >> 4;
     137    60025050 :     nlo &= 0xf;
     138             : 
     139    60025050 :     rem = (ulong)Z.lo & 0xf;
     140    60025050 :     Z.lo = (Z.hi << 60) | (Z.lo >> 4);
     141    60025050 :     Z.hi = (Z.hi >> 4);
     142    60025050 :     Z.hi ^= rem_4bit[rem];
     143             : 
     144    60025050 :     Z.hi ^= Htable[nlo].hi;
     145    60025050 :     Z.lo ^= Htable[nlo].lo;
     146    60025050 :   }
     147             : 
     148     4001670 :   Xi[0] = fd_ulong_bswap( Z.hi );
     149     4001670 :   Xi[1] = fd_ulong_bswap( Z.lo );
     150     4001670 : }
     151             : 
     152             : /*
     153             :  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
     154             :  * details... Compiler-generated code doesn't seem to give any
     155             :  * performance improvement, at least not on x86[_64]. It's here
     156             :  * mostly as reference and a placeholder for possible future
     157             :  * non-trivial optimization[s]...
     158             :  */
     159             : void
     160             : fd_gcm_ghash_4bit( ulong             Xi[2],
     161             :                    fd_gcm128_t const Htable[16],
     162             :                    uchar const *     inp,
     163    80391172 :                    ulong             len ) {
     164             : 
     165    80391172 :   fd_gcm128_t Z;
     166    80391172 :   int cnt;
     167    80391172 :   ulong rem, nlo, nhi;
     168             : 
     169   233760896 :   do {
     170   233760896 :     cnt = 15;
     171   233760896 :     nlo = ((uchar const *)Xi)[15];
     172   233760896 :     nlo ^= inp[15];
     173   233760896 :     nhi = nlo >> 4;
     174   233760896 :     nlo &= 0xf;
     175             : 
     176   233760896 :     Z.hi = Htable[nlo].hi;
     177   233760896 :     Z.lo = Htable[nlo].lo;
     178             : 
     179  3740174336 :     while (1) {
     180  3740174336 :       rem = (ulong)Z.lo & 0xf;
     181  3740174336 :       Z.lo = (Z.hi << 60) | (Z.lo >> 4);
     182  3740174336 :       Z.hi = (Z.hi >> 4);
     183  3740174336 :       Z.hi ^= rem_4bit[rem];
     184             : 
     185  3740174336 :       Z.hi ^= Htable[nhi].hi;
     186  3740174336 :       Z.lo ^= Htable[nhi].lo;
     187             : 
     188  3740174336 :       if (--cnt < 0)
     189   233760896 :         break;
     190             : 
     191  3506413440 :       nlo = ((uchar const *)Xi)[cnt];
     192  3506413440 :       nlo ^= inp[cnt];
     193  3506413440 :       nhi = nlo >> 4;
     194  3506413440 :       nlo &= 0xf;
     195             : 
     196  3506413440 :       rem = (ulong)Z.lo & 0xf;
     197  3506413440 :       Z.lo = (Z.hi << 60) | (Z.lo >> 4);
     198  3506413440 :       Z.hi = (Z.hi >> 4);
     199  3506413440 :       Z.hi ^= rem_4bit[rem];
     200             : 
     201  3506413440 :       Z.hi ^= Htable[nlo].hi;
     202  3506413440 :       Z.lo ^= Htable[nlo].lo;
     203  3506413440 :     }
     204             : 
     205   233760896 :     Xi[0] = fd_ulong_bswap( Z.hi );
     206   233760896 :     Xi[1] = fd_ulong_bswap( Z.lo );
     207             : 
     208   233760896 :     inp += 16;
     209             :     /* Block size is 128 bits so len is a multiple of 16 */
     210   233760896 :     len -= 16;
     211   233760896 :   } while (len > 0);
     212    80391172 : }

Generated by: LCOV version 1.14