Line data Source code
1 : #ifndef HEADER_fd_src_vinyl_fd_vinyl_base_h
2 : #define HEADER_fd_src_vinyl_fd_vinyl_base_h
3 :
4 : /* Vinyl implements a persistent interprocess shared key-val store.
5 : Unlike other fd key-val stores (including groove and funk_rec),
6 : vinyl's key value store is backed by non-volatile storage. Features
7 : include:
8 :
9 : - Massive batch requests processed async with zero-copy cut-through
10 : from the underlying I/O layer to an arbitrary number of clients.
11 :
12 : - Runtime plugin I/O backends to support various styles of non-volatile
13 : storage. This includes kernel-bypassing the VM and/or the file
14 : system (e.g. direct I/O to raw block devices, I/O uring, memory
15 : mapped I/O, posix I/O volumes, etc).
16 :
17 : - Massive amounts of transparent in memory LRU data caching with fine
18 : grained user control of cache eviction priorities.
19 :
20 : - Acquire/release semantics with explicit completions for explicit
21 : control of pointer lifetimes.
22 :
23 : - Optional lockfree O(1) metadata queries. E.g. hot-or-not query in
24 : sig-verify with I/O cost aware downstream packing.
25 :
26 : - Always in memory application specific metadata. E.g. expirations,
27 : balances, etc are always available fast O(1).
28 :
29 : - Application controllable persistence guarantees (including massively
30 : parallel recovery / fast startup, recovery from power failures, and
31 : recovery to earlier known good states).
32 :
33 : - Support for practically unlimited number of keys.
34 :
35 : - Non-blocking speculative reads
36 :
37 : - Fast ignoring existing values (e.g. fast overwriting).
38 :
39 : - Extensive user and internal consistency checking
40 :
41 : - Transparent background meta and data integrity checking
42 :
43 : - Transparent background data compression (with dynamic load adaption
44 : and user configuration for low jitter) .
45 :
46 : - Live runtime statistics.
47 :
48 : - ...
49 :
50 : The most basic configuration is shown below:
51 :
52 : (optional)
53 : (read only) +------------+
54 : ..........| vinyl_meta |<------------+
55 : . +------------+ |
56 : . |
57 : . +------+ |
58 : . +------>| cnc |<------+ |
59 : . | +------+ | |
60 : v v v v vinyl_bstream
61 : ========== +----------+ ============== vinyl_io +--------------------+
62 : = client =-->| vinyl_rq |-->= vinyl_tile = <--------> | non-volatile store |
63 : ========== +----------+ ============== +--------------------+
64 : ^ ^ | ^
65 : | | +----------+ | |
66 : | +-----| vinyl_cq |-----+ |
67 : | +----------+ |
68 : | |
69 : | +------------+ |
70 : +-------->| vinyl_data |<------------+
71 : +------------+
72 :
73 : - To interact with a vinyl instance, a client joins the vinyl
74 : command-and-control (cnc), vinyl data cache (vinyl_data) and (if
75 : useful to the client) vinyl metadata cache (vinyl_meta).
76 :
77 : - The client uses the cnc to connect to the vinyl instance and
78 : monitor the status of a vinyl instance. When connecting, the
79 : client specifies the vinyl request queue (vinyl_rq) the client will
80 : use to send batch requests to the vinyl instance and the vinyl
81 : completion queue (vinyl_cq) the client will use to receive batch
82 : request completion notifications.
83 :
84 : - Most client-vinyl interactions have acquire/release semantics.
85 :
86 : - When a client wants to read, modify, etc a batch of key-val pairs,
87 : the client issues a batch request to acquire the keys. The vinyl
88 : tile will order all incoming requests by clients for performance
89 : and do the necessary I/O operations asynchronously in the
90 : background. When finished, the vinyl instance will notify the
91 : client where the values and metadata are for the keys are located
92 : in the vinyl data cache on completion. This location will be
93 : stable until the client releases the keys. As such, the client can
94 : operate on the values and metadata in-place zero-copy as
95 : appropriate for the type of request.
96 :
97 : - When the client is done operating on some or all of these keys,
98 : the client issues a second request to release them. The vinyl
99 : instance will update the data, meta and non-volatile storage
100 : appropriately and notify the client when done.
101 :
102 : - The atomicity and ordering guarantees for requests from multiple
103 : are described in detail in fd_vinyl_rq.h.
104 :
105 : - Advanced optimizations support out-of-band completions, ignoring
106 : completions, and cut-through batch processing without waiting for a
107 : completion. Likewise, a rq can be shared by multiple vinyl
108 : instances (e.g. sharding the store over multiple vinyl instances).
109 : Further, a cq can be shared by multiple clients (and not
110 : necessarily the same clients issuing requests) to support advanced
111 : thread modelings (e.g. read pipelining).
112 :
113 : - ... */
114 :
115 : #include "../tango/fd_tango.h" /* For util and tango/cnc functionality */
116 :
117 : /* fd_vinyl error code API ********************************************/
118 :
119 : /* These mirror fd_map error codes exactly */
120 :
121 : #include "../util/tmpl/fd_map.h"
122 :
123 24894864 : #define FD_VINYL_SUCCESS FD_MAP_SUCCESS
124 3 : #define FD_VINYL_ERR_INVAL FD_MAP_ERR_INVAL
125 3 : #define FD_VINYL_ERR_AGAIN FD_MAP_ERR_AGAIN
126 28503072 : #define FD_VINYL_ERR_CORRUPT FD_MAP_ERR_CORRUPT
127 3747417 : #define FD_VINYL_ERR_EMPTY FD_MAP_ERR_EMPTY
128 3 : #define FD_VINYL_ERR_FULL FD_MAP_ERR_FULL
129 7502268 : #define FD_VINYL_ERR_KEY FD_MAP_ERR_KEY
130 :
131 : FD_PROTOTYPES_BEGIN
132 :
133 : /* fd_vinyl_strerror converts an FD_VINYL_SUCCESS / FD_VINYL_ERR_* code
134 : into a human readable cstr. The lifetime of the returned pointer is
135 : infinite. The returned pointer is always to a non-NULL cstr. */
136 :
137 : FD_FN_CONST char const *
138 : fd_vinyl_strerror( int err );
139 :
140 : FD_PROTOTYPES_END
141 :
142 : /* fd_vinyl_key_t API *************************************************/
143 :
144 : /* A fd_vinyl_key_t specifies the key type for vinyl key-val pair.
145 : Compact binary keys are encouraged but a cstr can be used so long as
146 : it has strlen(cstr)<FD_VINYL_KEY_FOOTPRINT and the characters c[i]
147 : for i in [strlen(cstr),FD_VINYL_KEY_FOOTPRINT) are zero. (Also, if
148 : encoding a cstr in a key, recommend using first byte to encode the
149 : strlen for accelerating cstr operations further but this is up to the
150 : user.) */
151 :
152 : #define FD_VINYL_KEY_ALIGN (8UL)
153 135020616 : #define FD_VINYL_KEY_FOOTPRINT (32UL)
154 :
155 : union __attribute__((aligned(FD_VINYL_KEY_ALIGN))) fd_vinyl_key {
156 : char c [ FD_VINYL_KEY_FOOTPRINT ];
157 : uchar uc[ FD_VINYL_KEY_FOOTPRINT ];
158 : ulong ul[ FD_VINYL_KEY_FOOTPRINT / sizeof(ulong) ];
159 : };
160 :
161 : typedef union fd_vinyl_key fd_vinyl_key_t;
162 :
163 : FD_PROTOTYPES_BEGIN
164 :
165 : /* fd_vinyl_key_init populates the fd_vinyl_key_t compatible memory
166 : region mem with the src_sz bytes pointed to by src. If src_sz is
167 : {less than,greater than} FD_VINYL_KEY_FOOTPRINT, it will be {zero
168 : padded,truncated} to FD_VINYL_KEY_FOOTPRINT bytes. Assumes mem and
169 : src point to stable valid non-overlapping regions in the caller's
170 : address space for the duration of the call. Retains no interest in
171 : mem or src. Returns mem as a fd_vinyl_key_t. */
172 :
173 : static inline fd_vinyl_key_t *
174 : fd_vinyl_key_init( void * FD_RESTRICT mem,
175 : void const * FD_RESTRICT src,
176 6000000 : ulong src_sz ) {
177 6000000 : fd_vinyl_key_t * k = (fd_vinyl_key_t *)mem;
178 6000000 : void * FD_RESTRICT dst = k->c;
179 6000000 : ulong csz = fd_ulong_min( src_sz, FD_VINYL_KEY_FOOTPRINT ); /* typically compile time */
180 6000000 : ulong zsz = FD_VINYL_KEY_FOOTPRINT - csz; /* " */
181 6000000 : if( zsz ) memset( dst, 0, FD_VINYL_KEY_FOOTPRINT ); /* " */
182 6000000 : if( csz ) memcpy( dst, src, csz ); /* " */
183 6000000 : return k;
184 6000000 : }
185 :
186 : /* fd_vinyl_key_ulong populates the fd_vinyl_key_t compatible memory
187 : region mem with the given ulongs. Assumes mem points to a stable
188 : valid region in the caller's address space for the duration of the
189 : call. Retains no interest in mem. Returns mem as a fd_vinyl_key_t. */
190 :
191 : static inline fd_vinyl_key_t *
192 : fd_vinyl_key_init_ulong( void * mem,
193 : ulong k0,
194 : ulong k1,
195 : ulong k2,
196 21000438 : ulong k3 ) {
197 21000438 : fd_vinyl_key_t * k = (fd_vinyl_key_t *)mem;
198 21000438 : k->ul[0] = k0; k->ul[1] = k1; k->ul[2] = k2; k->ul[3] = k3;
199 21000438 : return k;
200 21000438 : }
201 :
202 : /* fd_vinyl_key_eq tests keys ka and kb for equality. Assumes ka and
203 : kb point in the caller's address space to stable valid keys for the
204 : duration of the call. Retains no interest in ka or kb. Returns 1 if
205 : the keys are equal and 0 otherwise. */
206 :
207 : FD_FN_PURE static inline int
208 : fd_vinyl_key_eq( fd_vinyl_key_t const * ka,
209 79159932 : fd_vinyl_key_t const * kb ) {
210 79159932 : ulong const * a = ka->ul;
211 79159932 : ulong const * b = kb->ul;
212 79159932 : return !((a[0]^b[0]) | (a[1]^b[1]) | (a[2]^b[2]) | (a[3]^b[3])); /* tons of ILP and vectorizable */
213 79159932 : }
214 :
215 : /* fd_vinyl_key_memo hashes the arbitrary 64-bit integer seed and the
216 : key pointed to by k to a uniform quasi-random 64-bit integer. This
217 : hash function is meant to be high quality but not necessarily
218 : cryptographically secure. Assumes k points in the caller's address
219 : space to a valid key for the duration of the call. Retains no
220 : interest in k. Returns the hash (arbitrary). */
221 :
222 : #if FD_HAS_INT128
223 :
224 : /* If the target supports uint128, fd_vinyl_key_memo is seeded
225 : xxHash3 with 64-bit output size. (open source BSD licensed) */
226 :
227 : static inline ulong
228 136560726 : fd_xxh3_mul128_fold64_( ulong lhs, ulong rhs ) {
229 136560726 : uint128 product = (uint128)lhs * (uint128)rhs;
230 136560726 : return (ulong)product ^ (ulong)( product>>64 );
231 136560726 : }
232 :
233 : static inline ulong
234 : fd_xxh3_mix16b_( ulong i0, ulong i1,
235 : ulong s0, ulong s1,
236 136560726 : ulong seed ) {
237 136560726 : return fd_xxh3_mul128_fold64_( i0 ^ (s0 + seed), i1 ^ (s1 - seed) );
238 136560726 : }
239 :
240 : FD_FN_PURE static inline ulong
241 : fd_vinyl_key_memo( ulong seed,
242 68280363 : fd_vinyl_key_t const * k ) {
243 68280363 : ulong k0 = k->ul[0];
244 68280363 : ulong k1 = k->ul[1];
245 68280363 : ulong k2 = k->ul[2];
246 68280363 : ulong k3 = k->ul[3];
247 68280363 : ulong acc = 32 * 0x9E3779B185EBCA87ULL;
248 68280363 : acc += fd_xxh3_mix16b_( k0, k1, 0xbe4ba423396cfeb8UL, 0x1cad21f72c81017cUL, seed );
249 68280363 : acc += fd_xxh3_mix16b_( k2, k3, 0xdb979083e96dd4deUL, 0x1f67b3b7a4a44072UL, seed );
250 68280363 : acc = acc ^ (acc >> 37);
251 68280363 : acc *= 0x165667919E3779F9ULL;
252 68280363 : acc = acc ^ (acc >> 32);
253 68280363 : return acc;
254 68280363 : }
255 :
256 : #else /* Portable variant */
257 :
258 : FD_FN_PURE static inline ulong
259 : fd_vinyl_key_memo( ulong seed,
260 : fd_vinyl_key_t const * k ) {
261 : ulong const * a = k->ul;
262 : return fd_ulong_hash( a[0] ^ seed ) ^ fd_ulong_hash( a[1] ^ ( seed ^ 0x5555555555555555UL) ) ^
263 : fd_ulong_hash( a[2] ^ ( seed ^ 0xaaaaaaaaaaaaaaaaUL) ) ^ fd_ulong_hash( a[3] ^ ( seed ^ 0x5a5a5a5a5a5a5a5aUL) );
264 : /* tons of ILP and vectorizable */
265 : }
266 :
267 : #endif
268 :
269 : FD_PROTOTYPES_END
270 :
271 : /* fd_vinyl_info_t ABI ************************************************/
272 :
273 : /* A fd_vinyl_info_t gives the pair decoded value size and any
274 : application specific info (e.g. ctime, mtime, balance, expire, etc).
275 : This structure is directly used in the bstream, vinyl meta cache, and
276 : vinyl val data cache to reduce data marshalling, memory copies,
277 : random access, etc between clients, tiles, caches and I/O layers and
278 : to allow extra integrity checking. The application can use whatever
279 : structure for the fd_vinyl_info_t so long as it is 8 byte aligned,
280 : FD_VINYL_INFO_SZ in size and the first 4 bytes are the decoded pair
281 : val size as a uint, in [0,FD_VINYL_VAL_MAX]. */
282 :
283 3000000 : #define FD_VINYL_INFO_SZ (16UL)
284 :
285 : union fd_vinyl_info {
286 : uint val_sz;
287 : uchar uc[ FD_VINYL_INFO_SZ ];
288 : uint ui[ FD_VINYL_INFO_SZ / sizeof(uint ) ];
289 : ulong ul[ FD_VINYL_INFO_SZ / sizeof(ulong) ];
290 : };
291 :
292 : typedef union fd_vinyl_info fd_vinyl_info_t;
293 :
294 : /* FD_VINYL_VAL_MAX is the maximum encoded and decoded byte size of a
295 : pair val. The value below is:
296 :
297 : 10MiB + block_sz - sizeof(ctl) - sizeof(key) - sizeof(info) - sizeof(ftr)
298 : 512 8 32 16 16
299 :
300 : where the adjustments make it possible create a vinyl_data sizeclass
301 : whose val_max is exactly FD_VINYL_VAL_MAX. */
302 :
303 6000069 : #define FD_VINYL_VAL_MAX (10486200UL)
304 :
305 : #endif /* HEADER_fd_src_vinyl_fd_vinyl_base_h */
|