Line data Source code
1 : #ifndef HEADER_fd_src_tango_mcache_fd_mcache_h
2 : #define HEADER_fd_src_tango_mcache_fd_mcache_h
3 :
4 : #include "../fd_tango_base.h"
5 :
6 : /* FD_MCACHE_{ALIGN,FOOTPRINT} specify the alignment and footprint
7 : needed for a mcache with depth entries and an application region of
8 : size app_sz. ALIGN is at least FD_FRAG_META_ALIGN and recommended to
9 : be at least double cache line to mitigate various kinds of false
10 : sharing. depth and app_sz are assumed to be valid (i.e. depth is an
11 : integer power of 2 of at least FD_MCACHE_BLOCK and the combination
12 : will not require a footprint larger than ULONG_MAX). These are
13 : provided to facilitate compile time mcache declarations. */
14 :
15 41766 : #define FD_MCACHE_ALIGN (128UL)
16 : #define FD_MCACHE_FOOTPRINT( depth, app_sz ) \
17 : FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \
18 : FD_MCACHE_ALIGN, 128UL ), /* hdr */ \
19 : FD_MCACHE_ALIGN, FD_MCACHE_SEQ_CNT*sizeof(ulong) ), /* seq */ \
20 : FD_MCACHE_ALIGN, (depth)*sizeof(fd_frag_meta_t) ), /* meta */ \
21 : FD_MCACHE_ALIGN, (app_sz) ), /* app */ \
22 : FD_MCACHE_ALIGN )
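
/* For example, a compile time declaration could look like the
   following (a minimal sketch; my_mcache_mem, the depth of 2048 and
   the 4096 byte application region are illustrative assumptions, not
   requirements):

     static uchar my_mcache_mem[ FD_MCACHE_FOOTPRINT( 2048UL, 4096UL ) ]
       __attribute__((aligned(FD_MCACHE_ALIGN)));

   my_mcache_mem can then be passed as shmem to fd_mcache_new /
   fd_mcache_join below at runtime. */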
23 :
24 : /* FD_MCACHE_SEQ_CNT specifies the number of entries in the mcache's seq
25 : storage region. It is aligned to FD_MCACHE_ALIGN. Multiples of 16 have
26 : good Feng Shui. seq[0] has special meaning; see below for details. */
27 :
28 102 : #define FD_MCACHE_SEQ_CNT (16UL)
29 :
30 : /* FD_MCACHE_{LG_BLOCK,LG_INTERLEAVE,BLOCK} specify how recent
31 : fragment meta data should be packed into mcaches. LG_BLOCK should be
32 : in [1,64). LG_INTERLEAVE should be in [0,FD_MCACHE_LG_BLOCK). BLOCK
33 : == 2^LG_BLOCK. See below for more details. */
34 :
35 : #define FD_MCACHE_LG_BLOCK (7)
36 : #define FD_MCACHE_LG_INTERLEAVE (0)
37 15297 : #define FD_MCACHE_BLOCK (128UL) /* == 2^FD_MCACHE_LG_BLOCK, explicit to workaround compiler limitations */
38 :
39 : FD_PROTOTYPES_BEGIN
40 :
41 : /* Construction API */
42 :
43 : /* fd_mcache_{align,footprint} return the required alignment and
44 : footprint of a memory region suitable for use as a mcache with depth
45 : entries and an app_sz byte application region. align returns
46 : FD_MCACHE_ALIGN. If depth or app_sz is invalid (e.g. depth is not an
47 : integer power-of-2 >= FD_MCACHE_BLOCK or the footprint would be
48 : larger than ULONG_MAX), footprint will silently return 0 (and thus
49 : can be used by the caller to validate mcache configuration parameters). */
50 :
51 : FD_FN_CONST ulong
52 : fd_mcache_align( void );
53 :
54 : FD_FN_CONST ulong
55 : fd_mcache_footprint( ulong depth,
56 : ulong app_sz );
57 :
58 : /* fd_mcache_new formats an unused memory region for use as a mcache.
59 : shmem is a non-NULL pointer to this region in the local address space
60 : with the required footprint and alignment. depth is the number of
61 : cache entries (should be an integer power of 2 >= FD_MCACHE_BLOCK).
62 : The mcache will also have an app_sz byte application region for
63 : application specific usage. seq0 is the initial fragment sequence
64 : number a producer should use for this mcache.
65 :
66 : The cache entries will be initialized such that all queries for any
67 : sequence number will fail immediately after creation. They will
68 : further be initialized such that any consumer initialized to start
69 : receiving a sequence number at or after seq0 will think it is ahead
70 : of the producer (such that it will wait for its sequence number
71 : cleanly instead of immediately trying to recover a gap). Conversely,
72 : consumers initialized to start receiving a sequence number before
73 : seq0 will think they are behind the producer (thus realize they have
74 : been incorrectly initialized and can recover appropriately). Anybody
75 : who looks at the mcache entries directly will also see the entries
76 : are initialized to have zero sz (such that they shouldn't try to
77 : dereference any fragment payloads), have the SOM and EOM bits set
78 : (so they shouldn't try to interpret the entry as part of some message
79 : spread over multiple fragments) and have the ERR bit set (so they
80 : don't think there is any validity to the meta data or payload).
81 :
82 : The application region will be initialized to zero.
83 :
84 : Returns shmem (and the memory region it points to will be formatted
85 : as a mcache, caller is not joined) on success and NULL on failure
86 : (logs details). Reasons for failure include obviously bad shmem or
87 : bad depth. */
88 :
89 : void *
90 : fd_mcache_new( void * shmem,
91 : ulong depth,
92 : ulong app_sz,
93 : ulong seq0 );
94 :
95 : /* fd_mcache_join joins the caller to the mcache. shmcache points to
96 : the first byte of the memory region backing the mcache in the
97 : caller's address space.
98 :
99 : Returns a pointer in the local address space to the mcache's entries
100 : on success (IMPORTANT! THIS IS NOT JUST A CAST OF SHMCACHE) and NULL
101 : on failure (logs details). Reasons for failure include shmcache
102 : obviously not pointing to a memory region holding a mcache. Every
103 : successful join should have a matching leave. The lifetime of the
104 : join is until the matching leave or the thread group is terminated.
105 :
106 : Entries are indexed [0,depth) and the mapping from sequence number to
107 : entry index is nontrivial (see below for accessors and mapping
108 : functions). There are no restrictions on the number of joins overall
109 : and a single thread can join multiple times (all joins to the same
110 : shmcache laddr will return the same mcache laddr).
111 :
112 : fd_frag_meta_t *
113 : fd_mcache_join( void * shmcache );
114 :
115 : /* fd_mcache_leave leaves a current local join. Returns a pointer to
116 : the underlying shared memory region on success (IMPORTANT! THIS IS
117 : NOT JUST A CAST OF MCACHE) and NULL on failure (logs details).
118 : Reasons for failure include mcache is NULL. */
119 :
120 : void *
121 : fd_mcache_leave( fd_frag_meta_t const * mcache );
122 :
123 : /* fd_mcache_delete unformats a memory region used as a mcache. Assumes
124 : nobody is joined to the region. Returns a pointer to the underlying
125 : shared memory region or NULL if used obviously in error (e.g.
126 : shmcache is obviously not a mcache ... logs details). The ownership
127 : of the memory region is transferred to the caller. */
128 :
129 : void *
130 : fd_mcache_delete( void * shmcache );
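
/* For example, a minimal construction / join / teardown sketch (error
   handling elided; the depth, app_sz and seq0 values and the
   aligned_alloc (<stdlib.h>) backing store are illustrative
   assumptions ... production deployments more typically back the
   mcache with a shared memory workspace):

     ulong depth  = 4096UL;  // integer power of 2 >= FD_MCACHE_BLOCK
     ulong app_sz = 1024UL;
     ulong seq0   = 0UL;

     void * shmem = aligned_alloc( fd_mcache_align(), fd_mcache_footprint( depth, app_sz ) );

     fd_frag_meta_t * mcache = fd_mcache_join( fd_mcache_new( shmem, depth, app_sz, seq0 ) );

     ... use mcache here ...

     free( fd_mcache_delete( fd_mcache_leave( mcache ) ) ); */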
131 :
132 : /* Accessor API */
133 :
134 : /* fd_mcache_{depth,app_sz,seq0} return the values corresponding to those
135 : used at the mcache's construction. Assumes mcache is a current local join. */
136 :
137 : FD_FN_PURE ulong fd_mcache_depth ( fd_frag_meta_t const * mcache );
138 : FD_FN_PURE ulong fd_mcache_app_sz( fd_frag_meta_t const * mcache );
139 : FD_FN_PURE ulong fd_mcache_seq0 ( fd_frag_meta_t const * mcache );
140 :
141 : /* fd_mcache_seq_laddr returns the location in the caller's local
142 : address space of the mcache's sequence array. This array is indexed
143 : [0,FD_MCACHE_SEQ_CNT) with FD_MCACHE_ALIGN alignment (double cache
144 : line). laddr_const is a const-correct version. Assumes mcache is a
145 : current local join. The lifetime of the returned pointer is the same
146 : as the underlying join.
147 :
148 : seq[0] has special meaning. Specifically, sequence numbers in
149 : [seq0,seq[0]) cyclic are guaranteed to have been published. seq[0]
150 : is not strictly atomically updated by the producer when it publishes
151 : so seq[0] can lag the most recently published sequence number
152 : somewhat. As seq[0] is updated moderately to aggressively often by
153 : the mcache's producer (depending on the application), it is on its
154 : own cache line pair to avoid false sharing. seq[0] is mostly used
155 : for monitoring, initialization and support for some methods of
156 : unreliable consumer overrun handling.
157 :
158 : The meaning of the remaining sequence numbers is application
159 : dependent. Applications should try to restrict any use of these to
160 : patterns that are friendly to the seq[0] cache line pair (e.g.
161 : producer-write oriented cases or rarely accessed cases). */
162 :
163 : FD_FN_CONST ulong const * fd_mcache_seq_laddr_const( fd_frag_meta_t const * mcache );
164 : FD_FN_CONST ulong * fd_mcache_seq_laddr ( fd_frag_meta_t * mcache );
165 :
166 : /* fd_mcache_app_laddr returns the location in the caller's local address
167 : space of memory set aside for application specific usage. Assumes
168 : mcache is a current local join. The lifetime of the returned pointer
169 : is the same as the underlying join. This region has FD_MCACHE_ALIGN
170 : alignment (double cache line) and is fd_mcache_app_sz( mcache ) in
171 : size. laddr_const is a const-correct version. */
172 :
173 : FD_FN_PURE uchar const * fd_mcache_app_laddr_const( fd_frag_meta_t const * mcache );
174 : FD_FN_PURE uchar * fd_mcache_app_laddr ( fd_frag_meta_t * mcache );
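
/* For example, a hedged sketch of stashing a small application defined
   header in the application region (my_app_hdr_t is an illustrative
   type; the mcache itself attaches no meaning to these bytes beyond
   zero initializing them in fd_mcache_new):

     my_app_hdr_t hdr = ...;
     if( sizeof(my_app_hdr_t)<=fd_mcache_app_sz( mcache ) )
       memcpy( fd_mcache_app_laddr( mcache ), &hdr, sizeof(my_app_hdr_t) ); */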
175 :
176 : /* fd_mcache_seq_query atomically reads the mcache's seq[0] (e.g. from
177 : fd_mcache_seq_laddr_const) to get a lower bound of where the producer
178 : is at in sequence space (in the sense that the producer guarantees it
179 : has produced all sequence numbers strictly before the return value
180 : cyclic). This is usually done at consumer startup and, for some
181 : unreliable consumer overrun handling, during consumer overrun
182 : recovery. It is strongly recommended for consumers to avoid using
183 : this as much as possible to limit cache line ping-ponging with the
184 : producer. */
185 :
186 : static inline ulong
187 3000117 : fd_mcache_seq_query( ulong const * _seq ) {
188 3000117 : FD_COMPILER_MFENCE();
189 3000117 : ulong seq = FD_VOLATILE_CONST( *_seq );
190 3000117 : FD_COMPILER_MFENCE();
191 3000117 : return seq;
192 3000117 : }
193 :
194 : /* fd_mcache_seq_update updates the mcache's seq[0] (e.g. from
195 : fd_mcache_seq_laddr) to seq, a lower bound of where the producer is
196 : currently at (in the sense that the producer has produced all
197 : sequence numbers strictly before seq cyclic). Updates should be
198 : monotonically non-decreasing. This should be done moderately
199 : frequently (e.g. in background housekeeping) after the producer has
200 : moved forward in sequence space since the last update. Updating even
201 : more aggressively is usually fine. This should also be done when the
202 : producer is shut down to facilitate cleanly restarting a producer and
203 : what not. This also serves as a compiler memory fence to ensure the
204 : sequence number is updated at a well defined point in the
205 : instruction stream (e.g. so that the compiler doesn't move any stores
206 : from before the update to after it). */
207 :
208 : static inline void
209 : fd_mcache_seq_update( ulong * _seq,
210 82599471 : ulong seq ) {
211 82599471 : FD_COMPILER_MFENCE();
212 82599471 : FD_VOLATILE( *_seq ) = seq;
213 82599471 : FD_COMPILER_MFENCE();
214 82599471 : }
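
/* For example, a hedged sketch of typical seq[0] usage (the
   housekeeping cadence and the pub_seq variable are illustrative):

     // Producer side, in background housekeeping, after having
     // published all fragments strictly before pub_seq:
     fd_mcache_seq_update( fd_mcache_seq_laddr( mcache ), pub_seq );

     // Consumer side, typically once at startup, to pick an initial
     // receive position at or near where the producer currently is:
     ulong rx_seq = fd_mcache_seq_query( fd_mcache_seq_laddr_const( mcache ) ); */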
215 :
216 : /* fd_mcache_line_idx returns the index of the cache line in a depth
217 : entry mcache (depth is assumed to be a power of 2) where the
218 : metadata for the frag with sequence number seq will be stored when it
219 : is in cache. Outside of startup transients, a mcache is guaranteed
220 : to hold exactly the depth most recently published sequence numbers
221 : (the act of publishing a new sequence number atomically unpublishes
222 : the oldest sequence number implicitly).
223 :
224 : FD_MCACHE_LG_INTERLEAVE is in [0,FD_MCACHE_LG_BLOCK) and controls the
225 : details of this mapping. LG_INTERLEAVE 0 indicates no interleaving.
226 : Values from 1 to LG_BLOCK space out sequential frag meta data in
227 : memory to avoid false sharing between producers and fast consumers to
228 : keep fast consumers low latency while keeping frag meta data storage
229 : compact in memory to help throughput of slow consumers.
230 :
231 : Specifically, at a LG_INTERLEAVE of i with s byte frag meta data,
232 : meta data storage for sequential frags is typically s*2^i bytes
233 : apart. To avoid wasting memory and bandwidth, the interleaving is
234 : implemented by doing a rotation of the least LG_BLOCK bits of the lg
235 : depth bits of the sequence number (NOTE: this imposes a requirement
236 : that mcaches have at least a depth of 2^LG_BLOCK fragments). This
237 : yields a frag sequence number to line idx mapping that avoids false
238 : sharing for fast consumers and maintains compactness, avoids TLB
239 : thrashing (even if meta data is backed by normal pages) and exploits
240 : CPU data and TLB prefetching behavior for slow consumers.
241 :
242 : The usefulness of block interleaving is somewhat application dependent.
243 : Different values have different trade offs between optimizing for
244 : fast and slow consumers and for different sizes of meta data and
245 : different page size backing memory.
246 :
247 : Using 0 / B for FD_MCACHE_LG_INTERLEAVE / LG_BLOCK will disable meta
248 : data interleaving while still requiring mcaches be at least 2^B in
249 : size. This implicitly optimizes for slow consumers. Something like
250 : 2 / 7 (with a 32-byte size 32-byte aligned fd_frag_meta_t and a
251 : mcache that is at least normal page aligned) will access cached meta
252 : data in sequential blocks of 128 message fragments that are normal
253 : page size and aligned while meta data within those blocks will
254 : typically be strided at double DRAM cache line granularity. As such,
255 : fast consumers (e.g. those within 32 of the producers) will rarely
256 : have false sharing with the producers as nearby sequence numbers are
257 : on different DRAM cache line pairs. And slow consumers (e.g. ones
258 : that fall more than 128 fragments behind) will access meta data in a
259 : very DRAM cache friendly / data prefetcher / TLB friendly / bandwidth
260 : efficient manner (and without needing to load any prefilterable
261 : payload data while completely avoiding memory being written by the
262 : producer). That is, it typically has good balance of performance for
263 : both fast and slow consumers simultaneously. */
264 :
265 : #if FD_MCACHE_LG_INTERLEAVE==0
266 :
267 : FD_FN_CONST static inline ulong /* Will be in [0,depth) */
268 : fd_mcache_line_idx( ulong seq,
269 24220721626 : ulong depth ) { /* Assumed power of 2 >= BLOCK */
270 24220721626 : return seq & (depth-1UL);
271 24220721626 : }
272 :
273 : #else
274 :
275 : FD_FN_CONST static inline ulong /* Will be in [0,depth) */
276 : fd_mcache_line_idx( ulong seq,
277 : ulong depth ) { /* Assumed power of 2 >= BLOCK */
278 : ulong block_mask = FD_MCACHE_BLOCK - 1UL; /* Compile time */
279 : ulong page_mask = (depth-1UL) & (~block_mask); /* Typically compile time or loop invariant */
280 : ulong page = seq & page_mask;
281 : ulong bank = (seq << FD_MCACHE_LG_INTERLEAVE) & block_mask;
282 : ulong idx = (seq & block_mask) >> (FD_MCACHE_LG_BLOCK-FD_MCACHE_LG_INTERLEAVE);
283 : return page | bank | idx;
284 : }
285 :
286 : #endif
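
/* For example, a worked mapping under the (non-default, purely
   illustrative) configuration FD_MCACHE_LG_INTERLEAVE 2 /
   FD_MCACHE_LG_BLOCK 7 with depth 512:

     seq 5 -> page 0, bank (5<<2)&127 = 20, idx (5&127)>>5 = 0 -> line 20
     seq 6 -> page 0, bank (6<<2)&127 = 24, idx (6&127)>>5 = 0 -> line 24

   That is, sequential frags land 4 lines (4*32 = 128 bytes) apart,
   while with the LG_INTERLEAVE 0 build above fd_mcache_line_idx is
   simply seq & (depth-1UL). */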
287 :
288 : /* fd_mcache_publish inserts the metadata for frag seq into the given
289 : depth entry mcache in a way compatible with FD_MCACHE_WAIT and
290 : FD_MCACHE_WAIT_SSE (but not FD_MCACHE_WAIT_AVX ... see FD_MCACHE_WAIT
291 : for more details). This implicitly evicts the metadata for the
292 : sequence number currently stored at fd_mcache_line_idx( seq, depth ).
293 : In the typical case where sequence numbers are published into the
294 : mcache sequentially, the evicted metadata is typically for frag
295 : seq-depth (cyclic). This does no error checking or the like as it is
296 : frequently used in ultra high performance contexts. This operation
297 : implies a compiler mfence to the caller. */
298 :
299 : static inline void
300 : fd_mcache_publish( fd_frag_meta_t * mcache, /* Assumed a current local join */
301 : ulong depth, /* Assumed an integer power-of-2 >= BLOCK */
302 : ulong seq,
303 : ulong sig,
304 : ulong chunk, /* Assumed in [0,UINT_MAX] */
305 : ulong sz, /* Assumed in [0,USHORT_MAX] */
306 : ulong ctl, /* Assumed in [0,USHORT_MAX] */
307 : ulong tsorig, /* Assumed in [0,UINT_MAX] */
308 876505307 : ulong tspub ) { /* Assumed in [0,UINT_MAX] */
309 876505307 : fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
310 876505307 : FD_COMPILER_MFENCE();
311 876505307 : meta->seq = fd_seq_dec( seq, 1UL );
312 876505307 : FD_COMPILER_MFENCE();
313 876505307 : meta->sig = sig;
314 876505307 : meta->chunk = (uint )chunk;
315 876505307 : meta->sz = (ushort)sz;
316 876505307 : meta->ctl = (ushort)ctl;
317 876505307 : meta->tsorig = (uint )tsorig;
318 876505307 : meta->tspub = (uint )tspub;
319 876505307 : FD_COMPILER_MFENCE();
320 876505307 : meta->seq = seq;
321 876505307 : FD_COMPILER_MFENCE();
322 876505307 : }
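
/* For example, a hedged producer side sketch (payload production and
   the sig / chunk / sz / ctl / tsorig / tspub values are illustrative
   and application defined):

     ulong seq = seq0;
     for(;;) {
       ... write the payload for frag seq into the data region at chunk ...
       fd_mcache_publish( mcache, depth, seq, sig, chunk, sz, ctl, tsorig, tspub );
       seq = fd_seq_inc( seq, 1UL );
       ... advance chunk and recompute sig / sz / ctl / tsorig / tspub for the next frag ...
     } */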
323 :
324 : #if FD_HAS_SSE
325 :
326 : /* fd_mcache_publish_sse is a SSE implementation of fd_mcache_publish.
327 : It is compatible with FD_MCACHE_WAIT and FD_MCACHE_WAIT_SSE. */
328 :
329 : static inline void
330 : fd_mcache_publish_sse( fd_frag_meta_t * mcache, /* Assumed a current local join */
331 : ulong depth, /* Assumed an integer power-of-2 >= BLOCK */
332 : ulong seq,
333 : ulong sig,
334 : ulong chunk, /* Assumed in [0,UINT_MAX] */
335 : ulong sz, /* Assumed in [0,USHORT_MAX] */
336 : ulong ctl, /* Assumed in [0,USHORT_MAX] */
337 : ulong tsorig, /* Assumed in [0,UINT_MAX] */
338 90 : ulong tspub ) { /* Assumed in [0,UINT_MAX] */
339 90 : fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
340 90 : __m128i meta_sse0 = fd_frag_meta_sse0( fd_seq_dec( seq, 1UL ), sig );
341 90 : __m128i meta_sse1 = fd_frag_meta_sse1( chunk, sz, ctl, tsorig, tspub );
342 90 : FD_COMPILER_MFENCE();
343 90 : FD_VOLATILE( meta->sse0 ) = meta_sse0;
344 90 : FD_COMPILER_MFENCE();
345 90 : FD_VOLATILE( meta->sse1 ) = meta_sse1;
346 90 : FD_COMPILER_MFENCE();
347 90 : meta->seq = seq;
348 90 : FD_COMPILER_MFENCE();
349 90 : }
350 :
351 : #endif
352 :
353 : #if FD_HAS_AVX
354 :
355 : /* fd_mcache_publish_avx is an AVX implementation of fd_mcache_publish.
356 : It is compatible with FD_MCACHE_WAIT, FD_MCACHE_WAIT_SSE and
357 : FD_MCACHE_WAIT_AVX. It requires a target for which aligned AVX
358 : stores are guaranteed atomic under the hood (see below for more
359 : details). */
360 :
361 : static inline void
362 : fd_mcache_publish_avx( fd_frag_meta_t * mcache, /* Assumed a current local join */
363 : ulong depth, /* Assumed an integer power-of-2 >= BLOCK */
364 : ulong seq,
365 : ulong sig,
366 : ulong chunk, /* Assumed in [0,UINT_MAX] */
367 : ulong sz, /* Assumed in [0,USHORT_MAX] */
368 : ulong ctl, /* Assumed in [0,USHORT_MAX] */
369 : ulong tsorig, /* Assumed in [0,UINT_MAX] */
370 65841 : ulong tspub ) { /* Assumed in [0,UINT_MAX] */
371 65841 : fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
372 65841 : __m256i meta_avx = fd_frag_meta_avx( seq, sig, chunk, sz, ctl, tsorig, tspub );
373 65841 : FD_COMPILER_MFENCE();
374 : /* _mm256_store_si256( &meta->avx, meta_avx );
375 : Some versions of Clang break up a 256-bit store intrinsic into two
376 : 128-bit stores, which is not atomic. Use a volatile 256-bit store
377 : as a workaround. */
378 65841 : FD_VOLATILE( meta->avx ) = meta_avx;
379 65841 : FD_COMPILER_MFENCE();
380 65841 : }
381 :
382 : #endif
383 :
384 : /* FD_MCACHE_WAIT does a bounded wait for a producer to transmit a
385 : particular frag.
386 :
387 : meta (fd_frag_meta_t * compatible) is the location in the caller's
388 : address space where the wait should save the found metadata. This
389 : typically points to a stack temporary.
390 :
391 : mline (fd_frag_meta_t const * compatible) will be
392 : mcache + fd_mcache_line_idx( seq_expected, depth )
393 : when the wait does not time out. This is the location where the
394 : caller can verify (after any speculative processing of seq_expected)
395 : the producer did not clobber the consumer during the processing.
396 :
397 : seq_found (ulong compatible) will be the sequence number found at
398 : mline when the wait does not time out. This will be seq_expected
399 : on a successful wait.
400 :
401 : seq_diff (long compatible) will be how many sequence numbers
402 : seq_found is ahead of seq_expected when the wait does not time out:
403 : fd_seq_diff( seq_found, seq_expected )
404 : This will be zero on a successful wait. This will be positive
405 : otherwise and is a lower bound of how far behind the consumer is
406 : from the producer (and seq_found will typically be a reasonably
407 : recently produced sequence number).
408 :
409 : poll_max (ulong compatible) is the number of times FD_MCACHE_WAIT
410 : will poll the mcache of the given depth for seq_expected before
411 : timing out. poll_max should be positive on input. (Note: using
412 : ULONG_MAX for poll_max practically turns this into a blocking wait as
413 : this would take hundreds of years to complete on realistic platforms.)
414 : If poll_max is zero on completion of the WAIT, the wait timed out.
415 :
416 : mcache (fd_frag_meta_t const * compatible) is a current local join to
417 : the mcache the producer uses to cache metadata for the frags it is
418 : producing.
419 :
420 : depth (a ulong compatible power of two of at least FD_MCACHE_BLOCK)
421 : is the number of entries in mcache.
422 :
423 : seq_expected (ulong compatible) is the sequence number to wait to be
424 : produced.
425 :
426 : On completion of the WAIT, if poll_max is zero, the WAIT timed out
427 : and none of the other outputs (meta, mline, seq_found, seq_diff)
428 : should be trusted. If poll_max is non-zero, it will be the original
429 : poll_max value decremented by the number of polls it took for the
430 : WAIT to complete and the WAIT did not time out.
431 :
432 : When the WAIT did not timeout, mline, seq_found and seq_diff can be
433 : trusted. If seq_diff is positive, the caller has fallen more than
434 : depth behind the producer such that metadata for frag seq_expected is
435 : no longer available via the mcache. IMPORTANT! *META MIGHT NOT BE
436 : VALID FOR SEQ_FOUND WHEN CONSUMER HAS FALLEN BEHIND (e.g. if the
437 : producer is paused after it starts writing metadata but before it has
438 : completed writing it ... an unreliable overrun consumer that reads
439 : the metadata while the producer is paused will observe metadata that
440 : is a mix of the new metadata and old metadata with a bogus sequence
441 : number on it). seq_diff is a lower bound of how far the caller has
442 : fallen behind the producer and seq_found is a lower bound of where
443 : producer is currently at.
444 :
445 : Otherwise, the caller is within depth of the producer and *meta will
446 : be a local copy of the desired metadata.
447 :
448 : TL;DR Typical usage:
449 :
450 : ... Example HPC receiver run loop setup
451 :
452 : ulong poll_max = ... number of polls until next housekeeping (positive)
453 : fd_frag_meta_t const * mcache = ... local join to producer's mcache
454 : ulong depth = ... producer's mcache depth
455 : ulong rx_seq = ... next sequence number to receive from producer
456 :
457 : ... Example HPC receiver run loop structure
458 :
459 : for(;;) {
460 :
461 : fd_frag_meta_t meta[1];
462 : fd_frag_meta_t const * mline;
463 : ulong tx_seq;
464 : long seq_diff;
465 : FD_MCACHE_WAIT( meta, mline, tx_seq, seq_diff, poll_max, mcache, depth, rx_seq );
466 :
467 : ... At this point, poll_max can be trusted and has been
468 : ... decremented by the number of polls that were done by the wait
469 : ... from its value at the start of the wait. We either timed
470 : ... out waiting, detected we've been overrun or received the
471 : ... desired meta data.
472 :
473 : if( FD_UNLIKELY( !poll_max ) ) {
474 :
475 : ... We timed out. Do background housekeeping.
476 :
477 : poll_max = ... Reload for the next housekeeping (positive and
478 : ... ideally somewhat randomized each time). Value
479 : ... depends on how aggressively the run loop needs
480 : ... to do background tasks such as
481 : ... command-and-control interactions, monitoring
482 : ... diagnostics, maintenance, etc).
483 :
484 : continue;
485 : }
486 :
487 : ... At this point, poll_max, mline, tx_seq and seq_diff can be
488 : ... trusted. We either have been overrun or received the desired
489 : ... metadata. poll_max>0 and seq_diff==fd_seq_diff(tx_seq,rx_seq).
490 :
491 : if( FD_UNLIKELY( seq_diff ) ) {
492 :
493 : ... We got overrun by the producer. tx_seq is an estimate
494 : ... (typically within depth and often much closer) of where the
495 : ... producer currently is at. Technically, this branch should
496 : ... never be exercised on reliable consumers but is a generally
497 : ... good idea regardless to detect / protect against flow
498 : ... control misconfigurations, bugs in the consumer, etc.
499 : ... Overrun handling could be as simple as "rx_seq = tx_seq;"
500 : ... here (but applications will typically do more elaborate
501 : ... application specific handling)
502 :
503 : continue;
504 : }
505 :
506 : ... We received meta data for frag rx_seq. At this point, meta,
507 : ... tx_seq, seq_diff and poll_max can be trusted. poll_max>=0UL,
508 : ... tx_seq==rx_seq and seq_diff==0L.
509 :
510 : ... Process meta->* at the run loop's leisure and speculatively
511 : ... process actual frag data as necessary here.
512 :
513 : tx_seq = fd_frag_meta_seq_query( mline );
514 : if( FD_UNLIKELY( fd_seq_ne( tx_seq, rx_seq ) ) ) {
515 :
516 : ... We got overrun by the producer while speculatively
517 : ... processing data pointed to by meta. Same considerations
518 : ... as above for overrun handling.
519 :
520 : continue;
521 : }
522 :
523 : ... Advance to the producer's next sequence number.
524 :
525 : rx_seq = fd_seq_inc( rx_seq, 1UL );
526 : }
527 :
528 : This assumes the producer either writes the entire metadata cache
529 : line atomically (on targets where aligned AVX writes are in fact
530 : atomic) or writes the metadata cache line in a particular order:
531 :
532 : FD_COMPILER_MFENCE();
533 : mcache_line->seq = fd_seq_dec( seq, 1UL ); // atomically marks cache line as in the process of writing seq
534 : // This implicitly atomically evicts frag metadata for cache line
535 : // seq-depth cycle
536 : FD_COMPILER_MFENCE();
537 : ... update the actual cache line body without changing mcache_line->seq ...
538 : FD_COMPILER_MFENCE();
539 : mcache_line->seq = seq; // atomically marks metadata for frag seq as available for consumers
540 : FD_COMPILER_MFENCE();
541 :
542 : Note that the above writes can be SSE accelerated on AVX platforms (where
543 : aligned SSE writes are guaranteed to be atomic) as:
544 :
545 : FD_COMPILER_MFENCE();
546 : _mm_store_si128( &mcache_line->sse0, fd_frag_meta_sse0( fd_seq_dec( seq, 1UL ), sig ) );
547 : FD_COMPILER_MFENCE();
548 : _mm_store_si128( &mcache_line->sse1, fd_frag_meta_sse1( chunk, sz, ctl, tsorig, tspub ) );
549 : FD_COMPILER_MFENCE();
550 : mcache_line->seq = seq;
551 : FD_COMPILER_MFENCE();
552 :
553 : Note that the above uses no expensive atomic operations or hardware
554 : memory fences under the hood as these are not required for x86-style
555 : cache coherency. Specifically, Intel Architecture Software Developer
556 : Manual 3A-8-9:
557 :
558 : "Reads are not reordered with other reads."
559 :
560 : and 3A-8-10:
561 :
562 : "Writes by a single processor are observed in the same order by all
563 : processors."
564 :
565 : This makes heavy use of compiler memory fences though to ensure that
566 : compiler optimizations do not reorder how the operations are issued
567 : to CPUs (and thus also imply the operation acts as a compiler memory
568 : fence overall).
569 :
570 : Non-x86 platforms that use different cache coherency models may
571 : require modification of the below to use more explicit fencing or
572 : what not.
573 :
574 : The below is implemented as a macro to facilitate use in ultra high
575 : performance run loops and support multiple return values. This macro
576 : is robust (e.g. it evaluates its arguments a minimal number of times). */
577 :
578 0 : #define FD_MCACHE_WAIT( meta, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do { \
579 0 : ulong _fd_mcache_wait_seq_expected = (seq_expected); \
580 0 : fd_frag_meta_t const * _fd_mcache_wait_mline = (mcache) \
581 0 : + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
582 0 : fd_frag_meta_t * _fd_mcache_wait_meta = (meta); \
583 0 : ulong _fd_mcache_wait_seq_found; \
584 0 : long _fd_mcache_wait_seq_diff; \
585 0 : ulong _fd_mcache_wait_poll_max = (poll_max); \
586 0 : for(;;) { \
587 0 : FD_COMPILER_MFENCE(); \
588 0 : _fd_mcache_wait_seq_found = _fd_mcache_wait_mline->seq; /* atomic */ \
589 0 : FD_COMPILER_MFENCE(); \
590 0 : *_fd_mcache_wait_meta = *_fd_mcache_wait_mline; /* probably non-atomic, typically fast L1 cache hit */ \
591 0 : FD_COMPILER_MFENCE(); \
592 0 : ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic, typically fast L1 cache hit */ \
593 0 : FD_COMPILER_MFENCE(); \
594 0 : _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected ); \
595 0 : int _fd_mcache_wait_done = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
596 0 : | (!--_fd_mcache_wait_poll_max); \
597 0 : FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */ \
598 0 : if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */ \
599 0 : FD_SPIN_PAUSE(); \
600 0 : } \
601 0 : (mline) = _fd_mcache_wait_mline; \
602 0 : (seq_found) = _fd_mcache_wait_seq_found; \
603 0 : (seq_diff) = _fd_mcache_wait_seq_diff; \
604 0 : (poll_max) = _fd_mcache_wait_poll_max; \
605 0 : } while(0)
606 :
607 : /* FD_MCACHE_WAIT_REG: similar to FD_MCACHE_WAIT but uses (nominally)
608 : registers to hold the metadata instead of a local buffer. */
609 :
610 : #define FD_MCACHE_WAIT_REG( sig, chunk, sz, ctl, tsorig, tspub, mline, seq_found, seq_diff, poll_max, \
611 14121998516 : mcache, depth, seq_expected ) do { \
612 14121998516 : ulong _fd_mcache_wait_seq_expected = (seq_expected); \
613 14121998516 : fd_frag_meta_t const * _fd_mcache_wait_mline = (mcache) \
614 14121998516 : + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
615 14121998516 : ulong _fd_mcache_wait_poll_max = (poll_max); \
616 14121998516 : ulong _fd_mcache_wait_sig; \
617 14121998516 : ulong _fd_mcache_wait_chunk; \
618 14121998516 : ulong _fd_mcache_wait_sz; \
619 14121998516 : ulong _fd_mcache_wait_ctl; \
620 14121998516 : ulong _fd_mcache_wait_tsorig; \
621 14121998516 : ulong _fd_mcache_wait_tspub; \
622 14121998516 : ulong _fd_mcache_wait_seq_found; \
623 14121998516 : long _fd_mcache_wait_seq_diff; \
624 27944503291 : for(;;) { \
625 27944503291 : FD_COMPILER_MFENCE(); \
626 27944503291 : _fd_mcache_wait_seq_found = _fd_mcache_wait_mline->seq; /* atomic */ \
627 27944503291 : FD_COMPILER_MFENCE(); \
628 27944503291 : _fd_mcache_wait_sig = _fd_mcache_wait_mline->sig; \
629 27944503291 : _fd_mcache_wait_chunk = (ulong)_fd_mcache_wait_mline->chunk; \
630 27944503291 : _fd_mcache_wait_sz = (ulong)_fd_mcache_wait_mline->sz; \
631 27944503291 : _fd_mcache_wait_ctl = (ulong)_fd_mcache_wait_mline->ctl; \
632 27944503291 : _fd_mcache_wait_tsorig = (ulong)_fd_mcache_wait_mline->tsorig; \
633 27944503291 : _fd_mcache_wait_tspub = (ulong)_fd_mcache_wait_mline->tspub; \
634 27944503291 : FD_COMPILER_MFENCE(); \
635 27944503291 : ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic, typically fast L1 cache hit */ \
636 27944503291 : FD_COMPILER_MFENCE(); \
637 27944503291 : _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected ); \
638 27944503291 : int _fd_mcache_wait_done = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
639 27944503291 : | (!--_fd_mcache_wait_poll_max); \
640 27944503291 : FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */ \
641 27944503291 : if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */ \
642 27944503291 : FD_SPIN_PAUSE(); \
643 13822504775 : } \
644 14121998516 : (sig) = _fd_mcache_wait_sig; \
645 14121998516 : (chunk) = _fd_mcache_wait_chunk; \
646 14121998516 : (sz) = _fd_mcache_wait_sz; \
647 14121998516 : (ctl) = _fd_mcache_wait_ctl; \
648 14121998516 : (tsorig) = _fd_mcache_wait_tsorig; \
649 14121998516 : (tspub) = _fd_mcache_wait_tspub; \
650 14121998516 : (mline) = _fd_mcache_wait_mline; \
651 14121998516 : (seq_found) = _fd_mcache_wait_seq_found; \
652 14121998516 : (seq_diff) = _fd_mcache_wait_seq_diff; \
653 14121998516 : (poll_max) = _fd_mcache_wait_poll_max; \
654 14121998516 : } while(0)
655 :
656 : #if FD_HAS_AVX
657 :
658 : /* FD_MCACHE_WAIT_SSE: similar to FD_MCACHE_WAIT but uses a pair of SSE
659 : registers to hold the metadata instead of a local buffer. This is
660 : only valid on targets with the FD_HAS_AVX capability (see
661 : fd_tango_base.h for details on Intel's atomicity guarantees). */
662 :
663 : #define FD_MCACHE_WAIT_SSE( meta_sse0, meta_sse1, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do { \
664 : ulong _fd_mcache_wait_seq_expected = (seq_expected); \
665 : fd_frag_meta_t const * _fd_mcache_wait_mline = (mcache) \
666 : + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
667 : __m128i _fd_mcache_wait_meta_sse0; \
668 : __m128i _fd_mcache_wait_meta_sse1; \
669 : ulong _fd_mcache_wait_seq_found; \
670 : long _fd_mcache_wait_seq_diff; \
671 : ulong _fd_mcache_wait_poll_max = (poll_max); \
672 : for(;;) { \
673 : FD_COMPILER_MFENCE(); \
674 : _fd_mcache_wait_meta_sse0 = _mm_load_si128( &_fd_mcache_wait_mline->sse0 ); /* atomic */ \
675 : FD_COMPILER_MFENCE(); \
676 : _fd_mcache_wait_meta_sse1 = _mm_load_si128( &_fd_mcache_wait_mline->sse1 ); /* atomic, typ fast L1 hit */ \
677 : FD_COMPILER_MFENCE(); \
678 : ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic, typically fast L1 cache hit */ \
679 : FD_COMPILER_MFENCE(); \
680 : _fd_mcache_wait_seq_found = fd_frag_meta_sse0_seq( _fd_mcache_wait_meta_sse0 ); \
681 : _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected ); \
682 : int _fd_mcache_wait_done = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
683 : | (!--_fd_mcache_wait_poll_max); \
684 : FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */ \
685 : if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */ \
686 : FD_SPIN_PAUSE(); \
687 : } \
688 : (meta_sse0) = _fd_mcache_wait_meta_sse0; \
689 : (meta_sse1) = _fd_mcache_wait_meta_sse1; \
690 : (mline) = _fd_mcache_wait_mline; \
691 : (seq_found) = _fd_mcache_wait_seq_found; \
692 : (seq_diff) = _fd_mcache_wait_seq_diff; \
693 : (poll_max) = _fd_mcache_wait_poll_max; \
694 : } while(0)
695 :
696 : /* FD_MCACHE_WAIT_AVX: similar to FD_MCACHE_WAIT_SSE but uses a single
697 : AVX register to hold the found metadata instead of a local buffer.
698 : This is only valid for targets that have atomic AVX load / stores
699 : (not guaranteed across all AVX supporting CPUs and Intel is
700 : deliberately vague about which ones do have it) and a producer that
701 : similarly uses atomic AVX writes for metadata publication. On the
702 : overrun case here, meta_avx will in fact be the metadata for the
703 : overrun sequence number. */
704 :
705 : #define FD_MCACHE_WAIT_AVX( meta_avx, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do { \
706 : ulong _fd_mcache_wait_seq_expected = (seq_expected); \
707 : fd_frag_meta_t const * _fd_mcache_wait_mline = (mcache) \
708 : + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
709 : __m256i _fd_mcache_wait_meta_avx; \
710 : ulong _fd_mcache_wait_seq_found; \
711 : long _fd_mcache_wait_seq_diff; \
712 : ulong _fd_mcache_wait_poll_max = (poll_max); \
713 : for(;;) { \
714 : FD_COMPILER_MFENCE(); \
715 : _fd_mcache_wait_meta_avx = _mm256_load_si256( &_fd_mcache_wait_mline->avx ); /* atomic */ \
716 : FD_COMPILER_MFENCE(); \
717 : _fd_mcache_wait_seq_found = fd_frag_meta_avx_seq( _fd_mcache_wait_meta_avx ); \
718 : _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected ); \
719 : int _fd_mcache_wait_done = (_fd_mcache_wait_seq_diff>=0L) | (!--_fd_mcache_wait_poll_max); \
720 : FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */ \
721 : if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */ \
722 : FD_SPIN_PAUSE(); \
723 : } \
724 : (meta_avx) = _fd_mcache_wait_meta_avx; \
725 : (mline) = _fd_mcache_wait_mline; \
726 : (seq_found) = _fd_mcache_wait_seq_found; \
727 : (seq_diff) = _fd_mcache_wait_seq_diff; \
728 : (poll_max) = _fd_mcache_wait_poll_max; \
729 : } while(0)
730 :
731 : #endif
732 :
733 : /* fd_mcache_query returns seq_query if seq_query is still in the mcache
734 : (assumed to be a current local mcache join) with depth entries (depth
735 : is assumed to be an integer power of two of at least
736 : FD_MCACHE_BLOCK). It will return a sequence number before seq_query
737 : if seq_query has not yet been published. It will return a sequence
738 : number after seq_query if seq_query is no longer available in the
739 : mcache. In this last case, the return value will typically be within
740 : depth of the most recently published sequence number as of some
741 : point in time between when the call was made and when the call
742 : returned (in many common uses, this is typically very very close to
743 : the most recently published sequence number). This acts as a compiler memory fence. */
744 :
745 : static inline ulong
746 : fd_mcache_query( fd_frag_meta_t const * mcache,
747 : ulong depth,
748 9000000 : ulong seq_query ) {
749 9000000 : return fd_frag_meta_seq_query( mcache + fd_mcache_line_idx( seq_query, depth ) );
750 9000000 : }
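
/* For example, a hedged sketch of classifying a sequence number my_seq
   (an illustrative variable) against the current mcache contents:

     ulong seq_found = fd_mcache_query( mcache, depth, my_seq );
     long  diff      = fd_seq_diff( seq_found, my_seq );
     if(      diff==0L ) { ... metadata for my_seq is currently cached ... }
     else if( diff< 0L ) { ... my_seq has not been published yet       ... }
     else                { ... my_seq has already been overwritten     ... } */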
751 :
752 : FD_PROTOTYPES_END
753 :
754 : #endif /* HEADER_fd_src_tango_mcache_fd_mcache_h */
755 :
|