Line data Source code
1 : #ifndef HEADER_fd_src_ballet_reedsol_fd_reedsol_arith_avx2_h
2 : #define HEADER_fd_src_ballet_reedsol_fd_reedsol_arith_avx2_h
3 :
4 : #ifndef HEADER_fd_src_ballet_reedsol_fd_reedsol_private_h
5 : #error "Do not include this file directly; use fd_reedsol_private.h"
6 : #endif
7 :
8 : #include "../../util/simd/fd_avx.h"
9 :
/* gf_t is the vector type the GF_* operations in this header work on.
   In this AVX2 flavor it is an alias for wb_t from fd_avx.h; the
   macros below assign results of _mm256_shuffle_epi8 to it, so it is
   used as a 256-bit vector processed byte-wise. */
10 : typedef wb_t gf_t;
11 :
/* GF_WIDTH is defined as W_FOOTPRINT from fd_avx.h.
   NOTE(review): presumably the size in bytes of one gf_t (32 for
   AVX2) -- confirm against fd_avx.h. */
12 94846728 : #define GF_WIDTH W_FOOTPRINT
13 :
14 : FD_PROTOTYPES_BEGIN
15 :
/* gf_ldu / gf_stu / gf_zero forward directly to the wb_* primitives of
   the same name from fd_avx.h.
   NOTE(review): presumably unaligned load, unaligned store and the
   all-zero vector respectively -- confirm against fd_avx.h. */
16 1072465238 : #define gf_ldu wb_ldu
17 1091620388 : #define gf_stu wb_stu
18 684090164 : #define gf_zero wb_zero
19 :
/* Multiplication lookup table, defined in another translation unit and
   declared with 128-byte alignment.  GF_MUL and GF_MUL_VAR read one
   32-byte row at byte offset 32*c and feed it to _mm256_shuffle_epi8
   as the lookup source.
   NOTE(review): each row presumably holds the 16 products c*i for
   i in [0,16), replicated in both 128-bit lanes -- the contents are
   not visible from this header; confirm against the definition. */
20 : extern uchar const fd_reedsol_arith_consts_avx_mul[] __attribute__((aligned(128)));
21 :
22 : /* TODO: Defining this table as a static object in a header is awkward
23 : linkage (maybe mark it FD_FN_UNUSED) if this include gets used more
24 : generally. The table currently needs to be visible at compile time,
25 : not just at link time, so that the optimizer can use its contents. */
26 :
/* fd_reedsol_arith_scale4[ c ] maps a constant c in [0,256) to the row
   index used for the high-nibble lookup in GF_MUL / GF_MUL_VAR.  The
   entries are consistent with c*16 computed in GF(2^8) with reduction
   polynomial 0x11D: e.g. [1]==16, [16]==29 (x^8 reduces to 0x1D),
   [17]==13 (29^16) and [32]==58.  Multiplying the high nibble of a
   byte by this scaled constant gives the contribution of that nibble
   to the full product. */
27 : static uchar const fd_reedsol_arith_scale4[ 256UL ] = {
28 : 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
29 : 29, 13, 61, 45, 93, 77, 125, 109, 157, 141, 189, 173, 221, 205, 253, 237,
30 : 58, 42, 26, 10, 122, 106, 90, 74, 186, 170, 154, 138, 250, 234, 218, 202,
31 : 39, 55, 7, 23, 103, 119, 71, 87, 167, 183, 135, 151, 231, 247, 199, 215,
32 : 116, 100, 84, 68, 52, 36, 20, 4, 244, 228, 212, 196, 180, 164, 148, 132,
33 : 105, 121, 73, 89, 41, 57, 9, 25, 233, 249, 201, 217, 169, 185, 137, 153,
34 : 78, 94, 110, 126, 14, 30, 46, 62, 206, 222, 238, 254, 142, 158, 174, 190,
35 : 83, 67, 115, 99, 19, 3, 51, 35, 211, 195, 243, 227, 147, 131, 179, 163,
36 : 232, 248, 200, 216, 168, 184, 136, 152, 104, 120, 72, 88, 40, 56, 8, 24,
37 : 245, 229, 213, 197, 181, 165, 149, 133, 117, 101, 85, 69, 53, 37, 21, 5,
38 : 210, 194, 242, 226, 146, 130, 178, 162, 82, 66, 114, 98, 18, 2, 50, 34,
39 : 207, 223, 239, 255, 143, 159, 175, 191, 79, 95, 111, 127, 15, 31, 47, 63,
40 : 156, 140, 188, 172, 220, 204, 252, 236, 28, 12, 60, 44, 92, 76, 124, 108,
41 : 129, 145, 161, 177, 193, 209, 225, 241, 1, 17, 33, 49, 65, 81, 97, 113,
42 : 166, 182, 134, 150, 230, 246, 198, 214, 38, 54, 6, 22, 102, 118, 70, 86,
43 : 187, 171, 155, 139, 251, 235, 219, 203, 59, 43, 27, 11, 123, 107, 91, 75
44 : };
45 :
/* GF_ADD forwards to wb_xor: addition of field elements is a bitwise
   XOR of the two vectors (the same operation GF_MUL uses to combine
   its two partial products). */
46 12378988448 : #define GF_ADD wb_xor
47 :
/* GF_OR forwards to wb_or, the bitwise OR of two vectors. */
48 1014 : #define GF_OR wb_or
49 :
/* GF_MUL( a, c ) multiplies every byte of the vector a by the constant
   c and evaluates to the resulting wb_t.  Each argument is evaluated
   exactly once; c is converted to int.

   The product is built from two 16-entry table lookups done with
   _mm256_shuffle_epi8: the low nibble of each byte selects from the
   row for c, the high nibble selects from the row for
   fd_reedsol_arith_scale4[ c ], and the two partial products are
   XORed together.  c==0 yields wb_zero() and c==1 yields a unchanged
   without using the lookups' result.

   c is intended to be a compile-time constant (see the comment in the
   body); use GF_MUL_VAR when it is not.  No range check is performed:
   c indexes the 256-entry fd_reedsol_arith_scale4 table directly, so
   it must lie in [0,256).

   NOTE(review): this relies on wb_shr( _a, 4 ) being a per-byte
   logical right shift, so each index byte is in [0,16) -- confirm
   against fd_avx.h. */
50 328395396 : #define GF_MUL( a, c ) (__extension__({ \
51 328395396 : wb_t _a = (a); \
52 328395396 : int _c = (c); \
53 328395396 : wb_t _lo = wb_and( _a, wb_bcast( 0x0F ) ); \
54 328395396 : wb_t _hi = wb_shr( _a, 4 ); \
55 328395396 : wb_t _p0 = _mm256_shuffle_epi8( wb_ld( fd_reedsol_arith_consts_avx_mul + 32*_c ), _lo ); \
56 328395396 : wb_t _p1 = _mm256_shuffle_epi8( wb_ld( fd_reedsol_arith_consts_avx_mul + 32*fd_reedsol_arith_scale4[ _c ] ), _hi ); \
57 328395396 : /* c is known at compile time, so this is not a runtime branch */ \
58 328395396 : (_c==0) ? wb_zero() : ( (_c==1) ? _a : wb_xor( _p0, _p1 ) ); \
59 328395396 : }))
60 :
/* GF_MUL_VAR( a, c ) multiplies every byte of the vector a by c and
   evaluates to the resulting wb_t.  It performs the same two
   nibble-indexed _mm256_shuffle_epi8 lookups as GF_MUL and XORs the
   partial products, but has no c==0 / c==1 special cases: the result
   always comes from the table lookups.
   NOTE(review): presumably the variant meant for a c that is not
   known at compile time -- confirm with callers.

   Each argument is evaluated exactly once; c is converted to int.  No
   range check is performed: c indexes the 256-entry
   fd_reedsol_arith_scale4 table directly, so it must lie in [0,256). */
61 192000 : #define GF_MUL_VAR( a, c ) (__extension__({ \
62 192000 : wb_t _a = (a); \
63 192000 : int _c = (c); \
64 192000 : wb_t _lo = wb_and( _a, wb_bcast( 0x0F ) ); \
65 192000 : wb_t _hi = wb_shr( _a, 4 ); \
66 192000 : wb_t _p0 = _mm256_shuffle_epi8( wb_ld( fd_reedsol_arith_consts_avx_mul + 32*_c ), _lo ); \
67 192000 : wb_t _p1 = _mm256_shuffle_epi8( wb_ld( fd_reedsol_arith_consts_avx_mul + 32*fd_reedsol_arith_scale4[ _c ] ), _hi ); \
68 192000 : wb_xor( _p0, _p1 ); \
69 192000 : }))
70 :
/* GF_ANY( x ) evaluates to 1 if the byte mask gathered by
   _mm256_movemask_epi8 from wb_ne( x, wb_zero() ) is nonzero and to 0
   otherwise, i.e. it tests whether any byte of x differs from zero.
   NOTE(review): assumes wb_ne sets the top bit of every byte lane
   that compares unequal -- confirm against fd_avx.h. */
71 : #define GF_ANY( x ) (0 != _mm256_movemask_epi8( wb_ne( (x), wb_zero() ) ))
72 :
73 : FD_PROTOTYPES_END
74 :
75 : #endif /* HEADER_fd_src_ballet_reedsol_fd_reedsol_arith_avx2_h */
|