Line data Source code
1 : #ifndef HEADER_fd_src_util_simd_fd_avx_h 2 : #define HEADER_fd_src_util_simd_fd_avx_h 3 : 4 : #if FD_HAS_AVX 5 : 6 : /* An API for writing vectorized C/C++ code using 8-wide 32-bit ints, 7 : 8-wide 32-bit uints, 8-wide 32-bit floats, 4-wide 64-bit doubles, 8 : 4-wide 64-bit longs, 4-wide 64-bit ulongs and 8- or 4-wide logicals 9 : assuming a platform with AVX support. (The vector uchar, short and ushort APIs are pulled in by this header as well; see the include list below.) 10 : 11 : Essentially, all the usual C/C++ operations you can do on an int, 12 : uint, float, double, long, ulong or logical have a fast O(1) 13 : vectorized equivalent here. In most cases, an operation boils down to a single 14 : assembly instruction, and the macros are robust. 15 : 16 : Further operations commonly used to transition from scalar/vector to 17 : vector/scalar code, to do cross lane data motion, etc are also 18 : provided to make it much easier to convert scalar implementations 19 : into highly optimized vectorized implementations. 20 : 21 : That is, this is a thin wrapper around Intel's AVX intrinsics to give 22 : it a sane type system and robust semantics for writing mixed type and 23 : mixed width vectorized code (including branching). This includes a 24 : lot of non-obvious tricks, fixes for the ultra high density of 25 : irregularities in the intrinsics, implementations of missing 26 : intrinsics and lots of workarounds to get Intel AVX to behave sanely. 27 : 28 : A side effect is that this API also makes it easy to port code 29 : vectorized for AVX to non-Intel architectures. Just make 30 : implementations of these wrappers for the target platform and then, 31 : magically, code written in terms of this API has been ported. (This 32 : is similar to how CUDA works under the hood. Developers don't write 33 : GPU code ... they write CUDA code that is then adapted for the target 34 : architecture by the CUDA tooling at compile- or run-time.) 
35 : 36 : Much like the fd_util_base.h primitive types, APIs in here generally 37 : aren't prefixed with fd_ given how aggressively they get used in 38 : writing compute intensive code. This is unlikely to matter 39 : practically given this API is both optional and limited to particular 40 : build targets (i.e. namespace collisions are highly unlikely to occur 41 : accidentally). */ 42 : 43 : #include "../bits/fd_bits.h" 44 : #include <x86intrin.h> /* Include the intrinsics we are going to patch up */ 45 : 46 : /* Some useful constants */ 47 : 48 : #define W_WIDTH (8) /* Vector width / element count / lanes (32-bit elements) */ 49 105345228 : #define W_FOOTPRINT (32) /* Vector byte size */ 50 : #define W_ALIGN (32) /* Vector byte alignment required for aligned operations */ 51 : #define W_LG_WIDTH (3) /* log_2 W_WIDTH */ 52 : #define W_LG_FOOTPRINT (5) /* log_2 W_FOOTPRINT */ 53 : #define W_LG_ALIGN (5) /* log_2 W_ALIGN */ 54 : #define W_ATTR __attribute__((aligned(W_ALIGN))) /* Attribute that aligns a declaration to W_ALIGN bytes */ 55 : 56 : /* Include all the APIs */ 57 : 58 : #include "fd_avx_wc.h" /* Vector conditional support */ 59 : #include "fd_avx_wf.h" /* Vector float support */ 60 : #include "fd_avx_wi.h" /* Vector int support */ 61 : #include "fd_avx_wu.h" /* Vector uint support */ 62 : #include "fd_avx_wd.h" /* Vector double support */ 63 : #include "fd_avx_wl.h" /* Vector long support */ 64 : #include "fd_avx_wv.h" /* Vector ulong support */ 65 : #include "fd_avx_wb.h" /* Vector uchar (byte) support */ 66 : #include "fd_avx_ws.h" /* Vector short support */ 67 : #include "fd_avx_wh.h" /* Vector ushort support */ 68 : 69 : #else /* !FD_HAS_AVX: fail the build instead of silently providing no API */ 70 : #error "Build target does not support AVX wrappers" 71 : #endif /* FD_HAS_AVX */ 72 : 73 : #endif /* HEADER_fd_src_util_simd_fd_avx_h */ 74 :