Line data Source code
1 : /* fd_aes_gcm_ref.c was imported from the OpenSSL project circa 2023-Aug.
2 : Original source file: crypto/modes/gcm128.c */
3 :
4 : #include "fd_aes_gcm.h"
5 :
6 : /*
7 : * Copyright 2010-2022 The OpenSSL Project Authors. All Rights Reserved.
8 : *
9 : * Licensed under the Apache License 2.0 (the "License"). You may not use
10 : * this file except in compliance with the License. You can obtain a copy
11 : * in the file LICENSE in the source distribution or at
12 : * https://www.openssl.org/source/license.html
13 : */
14 :
15 : /*
16 : * NOTE: TABLE_BITS and all non-4bit implementations have been removed in 3.1.
17 : *
18 : * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
19 : * never be set to 8. 8 is effectively reserved for testing purposes.
20 : * TABLE_BITS>1 are lookup-table-driven implementations referred to as
21 : * "Shoup's" in GCM specification. In other words OpenSSL does not cover
22 : * whole spectrum of possible table driven implementations. Why? In
23 : * non-"Shoup's" case memory access pattern is segmented in such manner,
24 : * that it's trivial to see that cache timing information can reveal
25 : * fair portion of intermediate hash value. Given that ciphertext is
26 : * always available to attacker, it's possible for him to attempt to
27 : * deduce secret parameter H and if successful, tamper with messages
28 : * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
29 : * not as trivial, but there is no reason to believe that it's resistant
30 : * to cache-timing attack. And the thing about "8-bit" implementation is
31 : * that it consumes 16 (sixteen) times more memory, 4KB per individual
32 : * key + 1KB shared. Well, on pros side it should be twice as fast as
33 : * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
34 : * was observed to run ~75% faster, closer to 100% for commercial
35 : * compilers... Yet "4-bit" procedure is preferred, because it's
36 : * believed to provide better security-performance balance and adequate
37 : * all-round performance. "All-round" refers to things like:
38 : *
39 : * - shorter setup time effectively improves overall timing for
40 : * handling short messages;
41 : * - larger table allocation can become unbearable because of VM
42 : * subsystem penalties (for example on Windows large enough free
43 : * results in VM working set trimming, meaning that subsequent
44 : * malloc would immediately incur working set expansion);
45 : * - larger table has larger cache footprint, which can affect
46 : * performance of other code paths (not necessarily even from same
47 : * thread in Hyper-Threading world);
48 : *
49 : * Value of 1 is not appropriate for performance reasons.
50 : */
51 :
/* REDUCE1BIT(V) shifts the 128-bit value held in V.hi:V.lo right by one
   bit in place.  If the bit shifted out of V.lo was set, the constant
   0xe1 (placed in the top byte of V.hi) is XORed into V.hi afterwards.
   The conditional XOR is done with a mask (0 or all ones) rather than a
   branch.

   V must be a modifiable lvalue with ulong members hi and lo.  V is
   evaluated several times, so it must not have side effects.  The
   argument is parenthesized so expressions such as *ptr or arr[i] can
   be passed safely.

   This assumes ulong is 64 bits wide: the former sizeof(ulong)!=8
   fallback was dead code on such targets and would itself have shifted
   by 63, which is undefined for a narrower type. */

#define REDUCE1BIT(V)                                                    \
  do {                                                                   \
    ulong _r1b_mask = 0xe100000000000000UL & (0UL-((V).lo&1UL));         \
    (V).lo = ((V).hi<<63)|((V).lo>>1);                                   \
    (V).hi = ((V).hi>>1 )^_r1b_mask;                                     \
  } while(0)
65 :
66 : void
67 : fd_gcm_init_4bit( fd_gcm128_t Htable[16],
68 : ulong const H[2] )
69 8247186 : {
70 8247186 : fd_gcm128_t V;
71 :
72 8247186 : Htable[0].hi = 0;
73 8247186 : Htable[0].lo = 0;
74 8247186 : V.hi = H[0];
75 8247186 : V.lo = H[1];
76 :
77 8247186 : Htable[8] = V;
78 8247186 : REDUCE1BIT(V);
79 8247186 : Htable[4] = V;
80 8247186 : REDUCE1BIT(V);
81 8247186 : Htable[2] = V;
82 8247186 : REDUCE1BIT(V);
83 8247186 : Htable[1] = V;
84 8247186 : Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
85 8247186 : V = Htable[4];
86 8247186 : Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
87 8247186 : Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
88 8247186 : Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
89 8247186 : V = Htable[8];
90 8247186 : Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
91 8247186 : Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
92 8247186 : Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
93 8247186 : Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
94 8247186 : Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
95 8247186 : Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
96 8247186 : Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
97 8247186 : }
98 :
99 : #define PACK(s) ((ulong)(s)<<(sizeof(ulong)*8-16))
100 :
101 : static const ulong rem_4bit[16] = {
102 : PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
103 : PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
104 : PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
105 : PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
106 : };
107 :
108 : void
109 : fd_gcm_gmult_4bit( ulong Xi[2],
110 4001670 : fd_gcm128_t const Htable[16]) {
111 :
112 4001670 : fd_gcm128_t Z;
113 4001670 : int cnt = 15;
114 4001670 : ulong rem, nlo, nhi;
115 :
116 4001670 : nlo = ((uchar const *)Xi)[15];
117 4001670 : nhi = nlo >> 4;
118 4001670 : nlo &= 0xf;
119 :
120 4001670 : Z.hi = Htable[nlo].hi;
121 4001670 : Z.lo = Htable[nlo].lo;
122 :
123 64026720 : while (1) {
124 64026720 : rem = (ulong)Z.lo & 0xf;
125 64026720 : Z.lo = (Z.hi << 60) | (Z.lo >> 4);
126 64026720 : Z.hi = (Z.hi >> 4);
127 64026720 : Z.hi ^= rem_4bit[rem];
128 :
129 64026720 : Z.hi ^= Htable[nhi].hi;
130 64026720 : Z.lo ^= Htable[nhi].lo;
131 :
132 64026720 : if (--cnt < 0)
133 4001670 : break;
134 :
135 60025050 : nlo = ((uchar const *)Xi)[cnt];
136 60025050 : nhi = nlo >> 4;
137 60025050 : nlo &= 0xf;
138 :
139 60025050 : rem = (ulong)Z.lo & 0xf;
140 60025050 : Z.lo = (Z.hi << 60) | (Z.lo >> 4);
141 60025050 : Z.hi = (Z.hi >> 4);
142 60025050 : Z.hi ^= rem_4bit[rem];
143 :
144 60025050 : Z.hi ^= Htable[nlo].hi;
145 60025050 : Z.lo ^= Htable[nlo].lo;
146 60025050 : }
147 :
148 4001670 : Xi[0] = fd_ulong_bswap( Z.hi );
149 4001670 : Xi[1] = fd_ulong_bswap( Z.lo );
150 4001670 : }
151 :
152 : /*
153 : * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
154 : * details... Compiler-generated code doesn't seem to give any
155 : * performance improvement, at least not on x86[_64]. It's here
156 : * mostly as reference and a placeholder for possible future
157 : * non-trivial optimization[s]...
158 : */
159 : void
160 : fd_gcm_ghash_4bit( ulong Xi[2],
161 : fd_gcm128_t const Htable[16],
162 : uchar const * inp,
163 80391172 : ulong len ) {
164 :
165 80391172 : fd_gcm128_t Z;
166 80391172 : int cnt;
167 80391172 : ulong rem, nlo, nhi;
168 :
169 233760896 : do {
170 233760896 : cnt = 15;
171 233760896 : nlo = ((uchar const *)Xi)[15];
172 233760896 : nlo ^= inp[15];
173 233760896 : nhi = nlo >> 4;
174 233760896 : nlo &= 0xf;
175 :
176 233760896 : Z.hi = Htable[nlo].hi;
177 233760896 : Z.lo = Htable[nlo].lo;
178 :
179 3740174336 : while (1) {
180 3740174336 : rem = (ulong)Z.lo & 0xf;
181 3740174336 : Z.lo = (Z.hi << 60) | (Z.lo >> 4);
182 3740174336 : Z.hi = (Z.hi >> 4);
183 3740174336 : Z.hi ^= rem_4bit[rem];
184 :
185 3740174336 : Z.hi ^= Htable[nhi].hi;
186 3740174336 : Z.lo ^= Htable[nhi].lo;
187 :
188 3740174336 : if (--cnt < 0)
189 233760896 : break;
190 :
191 3506413440 : nlo = ((uchar const *)Xi)[cnt];
192 3506413440 : nlo ^= inp[cnt];
193 3506413440 : nhi = nlo >> 4;
194 3506413440 : nlo &= 0xf;
195 :
196 3506413440 : rem = (ulong)Z.lo & 0xf;
197 3506413440 : Z.lo = (Z.hi << 60) | (Z.lo >> 4);
198 3506413440 : Z.hi = (Z.hi >> 4);
199 3506413440 : Z.hi ^= rem_4bit[rem];
200 :
201 3506413440 : Z.hi ^= Htable[nlo].hi;
202 3506413440 : Z.lo ^= Htable[nlo].lo;
203 3506413440 : }
204 :
205 233760896 : Xi[0] = fd_ulong_bswap( Z.hi );
206 233760896 : Xi[1] = fd_ulong_bswap( Z.lo );
207 :
208 233760896 : inp += 16;
209 : /* Block size is 128 bits so len is a multiple of 16 */
210 233760896 : len -= 16;
211 233760896 : } while (len > 0);
212 80391172 : }
|