Line data Source code
1 : #ifndef HEADER_fd_src_tango_mcache_fd_mcache_h
2 : #define HEADER_fd_src_tango_mcache_fd_mcache_h
3 :
4 : #include "../fd_tango_base.h"
5 :
6 : /* FD_MCACHE_{ALIGN,FOOTPRINT} specify the alignment and footprint
7 : needed for a mcache with depth entries and an application region of
8 : size app_sz. ALIGN is at least FD_FRAG_META_ALIGN and recommended to
9 : be at least double cache line to mitigate various kinds of false
10 : sharing. depth and app_sz are assumed to be valid (i.e. depth is an
11 : integer power of 2 of at least FD_MCACHE_BLOCK and the combination
12 : will not require a footprint larger than ULONG_MAX). These are
13 : provided to facilitate compile time mcache declarations. */
14 :
15 12273 : #define FD_MCACHE_ALIGN (128UL)
16 : #define FD_MCACHE_FOOTPRINT( depth, app_sz ) \
17 : FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \
18 : FD_MCACHE_ALIGN, 128UL ), /* hdr */ \
19 : FD_MCACHE_ALIGN, FD_MCACHE_SEQ_CNT*sizeof(ulong) ), /* seq */ \
20 : FD_MCACHE_ALIGN, (depth)*sizeof(fd_frag_meta_t) ), /* meta */ \
21 : FD_MCACHE_ALIGN, (app_sz) ), /* app */ \
22 : FD_MCACHE_ALIGN )
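
/* For example, a compile time declaration of the backing memory for a
   mcache might look something like the below (a minimal sketch; the
   depth and app_sz values here are hypothetical and must satisfy the
   constraints above):

     #define MY_MCACHE_DEPTH  (128UL) ... power of 2 >= FD_MCACHE_BLOCK
     #define MY_MCACHE_APP_SZ ( 64UL) ... application region size

     static uchar __attribute__((aligned(FD_MCACHE_ALIGN)))
       my_mcache_mem[ FD_MCACHE_FOOTPRINT( MY_MCACHE_DEPTH, MY_MCACHE_APP_SZ ) ];

   my_mcache_mem could then be formatted via fd_mcache_new below. */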
23 :
24 : /* FD_MCACHE_SEQ_CNT specifies the number of entries in the mcache's seq
25 : storage region. It is FD_MCACHE_ALIGN aligned. Multiples of 16 have
26 : good Feng Shui. seq[0] has special meaning; see below for details. */
27 :
28 102 : #define FD_MCACHE_SEQ_CNT (16UL)
29 :
30 : /* FD_MCACHE_{LG_BLOCK,LG_INTERLEAVE,BLOCK} specifies how recent
31 : fragment meta data should be packed into mcaches. LG_BLOCK should be
32 : in [1,64). LG_INTERLEAVE should be in [0,FD_MCACHE_LG_BLOCK). BLOCK ==
33 : 2^LG_BLOCK. See below for more details. */
34 :
35 : #define FD_MCACHE_LG_BLOCK (7)
36 : #define FD_MCACHE_LG_INTERLEAVE (0)
37 : #define FD_MCACHE_BLOCK (128UL) /* == 2^FD_MCACHE_LG_BLOCK, explicit to workaround compiler limitations */
38 :
39 : FD_PROTOTYPES_BEGIN
40 :
41 : /* Construction API */
42 :
43 : /* fd_mcache_{align,footprint} return the required alignment and
44 : footprint of a memory region suitable for use as a mcache with depth
45 : entries. align returns FD_MCACHE_ALIGN. If depth is invalid (e.g.
46 : not an integer power-of-2 >= FD_MCACHE_BLOCK or the footprint is
47 : larger than ULONG_MAX), footprint will silently return 0 (and thus
48 : can be used by the caller to validate mcache configuration
49 : parameters). */
50 :
51 : FD_FN_CONST ulong
52 : fd_mcache_align( void );
53 :
54 : FD_FN_CONST ulong
55 : fd_mcache_footprint( ulong depth,
56 : ulong app_sz );
57 :
58 : /* fd_mcache_new formats an unused memory region for use as a mcache.
59 : shmem is a non-NULL pointer to this region in the local address space
60 : with the required footprint and alignment. depth is the number of
61 : cache entries (should be an integer power of 2 >= FD_MCACHE_BLOCK).
62 : The mcache will also have an app_sz byte application region for
63 : application specific usage. seq0 is the initial fragment sequence
64 : number a producer should use for this mcache.
65 :
66 : The cache entries will be initialized such that all queries for any
67 : sequence number will fail immediately after creation. They will
68 : further be initialized such that any consumer initialized to start
69 : receiving a sequence number at or after seq0 will think it is
70 : ahead of the producer (such that it will wait for its sequence number
71 : cleanly instead of immediately trying to recover a gap). Conversely,
72 : consumers initialized to start receiving a sequence number before
73 : seq0 will think they are behind the producer (and thus realize they
74 : have been incorrectly initialized and can recover appropriately).
75 : Anybody who looks at the mcache entries directly will also see the
76 : entries are initialized to have zero sz (such that they shouldn't try
77 : to dereference any fragment payloads), have the SOM and EOM bits set
78 : (so they shouldn't try to interpret the entry as part of some message
79 : spread over multiple fragments) and have the ERR bit set (so they
80 : don't think there is any validity to the meta data or payload).
81 :
82 : The application region will be initialized to zero.
83 :
84 : Returns shmem (and the memory region it points to will be formatted
85 : as a mcache, caller is not joined) on success and NULL on failure
86 : (logs details). Reasons for failure include obviously bad shmem or
87 : bad depth. */
88 :
89 : void *
90 : fd_mcache_new( void * shmem,
91 : ulong depth,
92 : ulong app_sz,
93 : ulong seq0 );
94 :
95 : /* fd_mcache_join joins the caller to the mcache. shmcache points to
96 : the first byte of the memory region backing the mcache in the
97 : caller's address space.
98 :
99 : Returns a pointer in the local address space to the mcache's entries
100 : on success (IMPORTANT! THIS IS NOT JUST A CAST OF SHMCACHE) and NULL
101 : on failure (logs details). Reasons for failure include shmcache
102 : obviously not pointing to a memory region holding a mcache. Every
103 : successful join should have a matching leave. The lifetime of the
104 : join is until the matching leave or thread group is terminated.
105 :
106 : Entries are indexed [0,depth) and the mapping from sequence number to
107 : depth is nontrivial (see below for accessors and mapping functions).
108 : There are no restrictions on the number of joins overall and a single
109 : thread can join multiple times (all joins to the same shmcache laddr
110 : will return the same mcache laddr). */
111 :
112 : fd_frag_meta_t *
113 : fd_mcache_join( void * shmcache );
114 :
115 : /* fd_mcache_leave leaves a current local join. Returns a pointer to
116 : the underlying shared memory region on success (IMPORTANT! THIS IS
117 : NOT JUST A CAST OF MCACHE) and NULL on failure (logs details).
118 : Reasons for failure include mcache is NULL. */
119 :
120 : void *
121 : fd_mcache_leave( fd_frag_meta_t const * mcache );
122 :
123 : /* fd_mcache_delete unformats a memory region used as a mcache. Assumes
124 : nobody is joined to the region. Returns a pointer to the underlying
125 : shared memory region or NULL if used obviously in error (e.g.
126 : shmcache is obviously not a mcache ... logs details). The ownership
127 : of the memory region is transferred to the caller. */
128 :
129 : void *
130 : fd_mcache_delete( void * shmcache );
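
/* For example, a typical mcache lifecycle might look something like
   the below (a minimal sketch; error handling is abbreviated and
   shmem is assumed to point to an unused memory region with the
   required alignment and footprint, e.g. a workspace allocation):

     ulong depth  = 128UL; ... hypothetical, power of 2 >= FD_MCACHE_BLOCK
     ulong app_sz = 64UL;  ... hypothetical application region size
     ulong seq0   = 0UL;   ... hypothetical initial sequence number

     if( FD_UNLIKELY( !fd_mcache_footprint( depth, app_sz ) ) ) ... bad configuration ...

     void *           shmcache = fd_mcache_new ( shmem, depth, app_sz, seq0 );
     fd_frag_meta_t * mcache   = fd_mcache_join( shmcache );

     ... use mcache here ...

     fd_mcache_delete( fd_mcache_leave( mcache ) ); ... unwind when done ... */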
131 :
132 : /* Accessor API */
133 :
134 : /* fd_mcache_{depth,app_sz,seq0} return the values corresponding to
135 : those used at the mcache's construction. Assumes mcache is a current local join. */
136 :
137 : FD_FN_PURE ulong fd_mcache_depth ( fd_frag_meta_t const * mcache );
138 : FD_FN_PURE ulong fd_mcache_app_sz( fd_frag_meta_t const * mcache );
139 : FD_FN_PURE ulong fd_mcache_seq0 ( fd_frag_meta_t const * mcache );
140 :
141 : /* fd_mcache_seq_laddr returns the location in the caller's local address
142 : space of mcache's sequence array. This array is indexed
143 : [0,FD_MCACHE_SEQ_CNT) with FD_MCACHE_ALIGN alignment (double cache
144 : line). laddr_const is a const correct version. Assumes mcache is a
145 : current local join. The lifetime of the returned pointer is the same
146 : as the underlying join.
147 :
148 : seq[0] has special meaning. Specifically, sequence numbers in
149 : [seq0,seq[0]) cyclic are guaranteed to have been published. seq[0]
150 : is not strictly atomically updated by the producer when it publishes
151 : so seq[0] can lag the most recently published sequence number
152 : somewhat. As seq[0] is moderately to aggressively frequently updated
153 : by the mcache's producer (depending on the application), it is on
154 : its own cache line pair to avoid false sharing. seq[0] is mostly
155 : used for monitoring, initialization and support for some methods for
156 : unreliable consumer overrun handling.
157 :
158 : The meaning of the remaining sequence numbers is application
159 : dependent. Applications should try to restrict any use of these to
160 : ones that are seq[0] cache-friendly (e.g. use for producer write
161 : oriented cases or use for rarely used cases). */
162 :
163 : FD_FN_CONST ulong const * fd_mcache_seq_laddr_const( fd_frag_meta_t const * mcache );
164 : FD_FN_CONST ulong * fd_mcache_seq_laddr ( fd_frag_meta_t * mcache );
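
/* For example, a monitor or a consumer initializing itself might read
   the producer's published lower bound as in the below (a minimal
   sketch, assuming mcache is a current local join):

     ulong const * seq      = fd_mcache_seq_laddr_const( mcache );
     ulong         seq0     = fd_mcache_seq0( mcache );
     ulong         seq_prod = fd_mcache_seq_query( seq ); ... see fd_mcache_seq_query below

     ... sequence numbers in [seq0,seq_prod) cyclic are guaranteed to
     ... have been published at this point */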
165 :
166 : /* fd_mcache_app_laddr returns the location in the caller's local address
167 : space of memory set aside for application specific usage. Assumes
168 : mcache is a current local join. The lifetime of the returned pointer
169 : is the same as the underlying join. This region has FD_MCACHE_ALIGN
170 : alignment (double cache line) and is fd_mcache_app_sz( mcache ) in
171 : size. laddr_const is a const-correct version. */
172 :
173 : FD_FN_PURE uchar const * fd_mcache_app_laddr_const( fd_frag_meta_t const * mcache );
174 : FD_FN_PURE uchar * fd_mcache_app_laddr ( fd_frag_meta_t * mcache );
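
/* For example (a minimal sketch, assuming mcache is a current local
   join), an application that wants to stash its own bookkeeping in
   the app region might do:

     uchar * app    = fd_mcache_app_laddr( mcache );
     ulong   app_sz = fd_mcache_app_sz   ( mcache );

     ... the app_sz bytes at app are laid out however the application
     ... sees fit (note fd_mcache_new zero initializes this region) */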
175 :
176 : /* fd_mcache_seq_query atomically reads the mcache's seq[0] (e.g. from
177 : fd_mcache_seq_laddr_const) to get a lower bound of where the producer
178 : is at in sequence space (in the sense that the producer guarantees it
179 : has produced all sequence numbers strictly before the return value
180 : cyclic). This is usually done at consumer startup and, for some
181 : unreliable consumer overrun handling, during consumer overrun
182 : recovery. It is strongly recommended for consumers to avoid using
183 : this as much as possible to limit cache line ping-ponging with the
184 : producer. */
185 :
186 : static inline ulong
187 3000105 : fd_mcache_seq_query( ulong const * _seq ) {
188 3000105 : FD_COMPILER_MFENCE();
189 3000105 : ulong seq = FD_VOLATILE_CONST( *_seq );
190 3000105 : FD_COMPILER_MFENCE();
191 3000105 : return seq;
192 3000105 : }
193 :
194 : /* fd_mcache_seq_update updates the mcache's seq[0] (e.g. from
195 : fd_mcache_seq_laddr) to seq, a lower bound of where the producer is
196 : currently at (in the sense that the producer has produced all
197 : sequence numbers strictly before seq cyclic). The seq passed here
198 : should be monotonically non-decreasing. This should be done
199 : moderately frequently (e.g. in background housekeeping) after the
200 : producer has moved forward in sequence space since the last update.
201 : Even more aggressively is usually fine. This should also be done
202 : when the producer is shutdown to facilitate cleanly restarting a
203 : producer and what not. This also serves as a compiler memory fence
204 : to ensure the sequence number is updated at a well defined point in
205 : the instruction stream (e.g. so that the compiler doesn't move any
206 : stores from before the update to after it). */
207 :
208 : static inline void
209 : fd_mcache_seq_update( ulong * _seq,
210 3036130 : ulong seq ) {
211 3036130 : FD_COMPILER_MFENCE();
212 3036130 : FD_VOLATILE( *_seq ) = seq;
213 3036130 : FD_COMPILER_MFENCE();
214 3036130 : }
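
/* For example, a producer's background housekeeping might advertise
   its progress as in the below (a minimal sketch; tx_seq is a
   hypothetical name for the next sequence number this producer will
   publish, i.e. all sequence numbers strictly before tx_seq cyclic
   have been published):

     ulong * seq = fd_mcache_seq_laddr( mcache );
     ...
     fd_mcache_seq_update( seq, tx_seq ); ... done lazily in housekeeping */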
215 :
216 : /* fd_mcache_line_idx returns the index of the cache line in a depth
217 : entry mcache (depth is assumed to be a power of 2) where the
218 : metadata for the frag with sequence number seq will be stored when it
219 : is in cache. Outside of startup transients, a mcache is guaranteed
220 : to exactly hold the depth most recently published sequence numbers (the act of
221 : publishing a new sequence number atomically unpublishes the oldest
222 : sequence number implicitly).
223 :
224 : FD_MCACHE_LG_INTERLEAVE is in [0,FD_MCACHE_LG_BLOCK) and controls the
225 : details of this mapping. LG_INTERLEAVE 0 indicates no interleaving.
226 : Values from 1 to LG_BLOCK-1 space out sequential frag meta data in
227 : memory to avoid false sharing between producers and fast consumers
228 : (keeping fast consumers low latency) while keeping frag meta data
229 : storage compact in memory (helping throughput of slow consumers).
230 :
231 : Specifically, at a LG_INTERLEAVE of i with s byte frag meta data,
232 : meta data storage for sequential frags is typically s*2^i bytes
233 : apart. To avoid wasting memory and bandwidth, the interleaving is
234 : implemented by doing a rotation of the least LG_BLOCK bits of the lg
235 : depth bits of the sequence number (NOTE: this imposes a requirement
236 : that mcaches have at least a depth of 2^LG_BLOCK fragments). This
237 : yields a frag sequence number to line idx mapping that avoids false
238 : sharing for fast consumers and maintains compactness, avoids TLB
239 : thrashing (even if meta data is backed by normal pages) and exploits
240 : CPU data and TLB prefetching behavior for slow consumers.
241 :
242 : How useful block interleaving is somewhat application dependent.
243 : Different values have different trade offs between optimizing for
244 : fast and slow consumers and for different sizes of meta data and
245 : different page size backing memory.
246 :
247 : Using 0 / B for FD_MCACHE_LG_INTERLEAVE / LG_BLOCK will disable meta
248 : data interleaving while still requiring mcaches be at least 2^B in
249 : size. This implicitly optimizes for slow consumers. Something like
250 : 2 / 7 (with a 32-byte sized, 32-byte aligned fd_frag_meta_t and a
251 : mcache that is at least normal page aligned) will access cached meta
252 : data in sequential blocks of 128 message fragments that are normal
253 : page size and aligned while meta data within those blocks will
254 : typically be strided at double DRAM cache line granularity. As such,
255 : fast consumers (e.g. those within 32 of the producers) will rarely
256 : have false sharing with the producers as nearby sequence numbers are
257 : on different DRAM cache line pairs. And slow consumers (e.g. ones
258 : that fall more than 128 fragments behind) will access meta data in a
259 : very DRAM cache friendly / data prefetcher / TLB friendly / bandwidth
260 : efficient manner (and without needing to load any prefilterable
261 : payload data while completely avoiding memory being written by the
262 : producer). That is, it typically has good balance of performance for
263 : both fast and slow consumers simultaneously. */
264 :
265 : #if FD_MCACHE_LG_INTERLEAVE==0
266 :
267 : FD_FN_CONST static inline ulong /* Will be in [0,depth) */
268 : fd_mcache_line_idx( ulong seq,
269 9388301983 : ulong depth ) { /* Assumed power of 2 >= BLOCK */
270 9388301983 : return seq & (depth-1UL);
271 9388301983 : }
272 :
273 : #else
274 :
275 : FD_FN_CONST static inline ulong /* Will be in [0,depth) */
276 : fd_mcache_line_idx( ulong seq,
277 : ulong depth ) { /* Assumed power of 2 >= BLOCK */
278 : ulong block_mask = FD_MCACHE_BLOCK - 1UL; /* Compile time */
279 : ulong page_mask = (depth-1UL) & (~block_mask); /* Typically compile time or loop invariant */
280 : ulong page = seq & page_mask;
281 : ulong bank = (seq << FD_MCACHE_LG_INTERLEAVE) & block_mask;
282 : ulong idx = (seq & block_mask) >> (FD_MCACHE_LG_BLOCK-FD_MCACHE_LG_INTERLEAVE);
283 : return page | bank | idx;
284 : }
285 :
286 : #endif
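
/* For example, with FD_MCACHE_LG_INTERLEAVE 0 and depth 256, sequence
   numbers simply map to lines seq & 255 (so seq 0,1,2,... land on
   lines 0,1,2,...).  With a hypothetical FD_MCACHE_LG_INTERLEAVE of 2
   and FD_MCACHE_LG_BLOCK of 7, within each block of 128 consecutive
   sequence numbers, seq 0,1,2,...,31 would map to lines 0,4,8,...,124
   and seq 32 would wrap around to line 1 (so, with 32 byte frag meta
   data, adjacent sequence numbers are stored 128 bytes apart while
   still filling the block compactly). */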
287 :
288 : /* fd_mcache_publish inserts the metadata for frag seq into the given
289 : depth entry mcache in a way compatible with FD_MCACHE_WAIT and
290 : FD_MCACHE_WAIT_SSE (but not FD_MCACHE_WAIT_AVX ... see FD_MCACHE_WAIT
291 : for more details). This implicitly evicts the metadata for the
292 : sequence number currently stored at fd_mcache_line_idx( seq, depth ).
293 : In the typical case where sequence numbers are published into the
294 : mcache sequentially, the evicted metadata is typically for frag
295 : seq-depth (cyclic). This does no error checking or the like as it is
296 : frequently used in ultra high performance contexts. This operation
297 : implies a compiler mfence to the caller. */
298 :
299 : static inline void
300 : fd_mcache_publish( fd_frag_meta_t * mcache, /* Assumed a current local join */
301 : ulong depth, /* Assumed an integer power-of-2 >= BLOCK */
302 : ulong seq,
303 : ulong sig,
304 : ulong chunk, /* Assumed in [0,UINT_MAX] */
305 : ulong sz, /* Assumed in [0,USHORT_MAX] */
306 : ulong ctl, /* Assumed in [0,USHORT_MAX] */
307 : ulong tsorig, /* Assumed in [0,UINT_MAX] */
308 12726485 : ulong tspub ) { /* Assumed in [0,UINT_MAX] */
309 12726485 : fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
310 12726485 : FD_COMPILER_MFENCE();
311 12726485 : meta->seq = fd_seq_dec( seq, 1UL );
312 12726485 : FD_COMPILER_MFENCE();
313 12726485 : meta->sig = sig;
314 12726485 : meta->chunk = (uint )chunk;
315 12726485 : meta->sz = (ushort)sz;
316 12726485 : meta->ctl = (ushort)ctl;
317 12726485 : meta->tsorig = (uint )tsorig;
318 12726485 : meta->tspub = (uint )tspub;
319 12726485 : FD_COMPILER_MFENCE();
320 12726485 : meta->seq = seq;
321 12726485 : FD_COMPILER_MFENCE();
322 12726485 : }
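
/* For example, a producer run loop might publish the metadata for the
   frag it just finished writing as in the below (a minimal sketch;
   sig, chunk, sz, ctl, tsorig and tspub are assumed to have been
   computed by the producer for this frag and tx_seq is the producer's
   current sequence number):

     fd_mcache_publish( mcache, depth, tx_seq, sig, chunk, sz, ctl, tsorig, tspub );
     tx_seq = fd_seq_inc( tx_seq, 1UL );

   with fd_mcache_seq_update( seq, tx_seq ) called lazily in the
   producer's housekeeping (see above). */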
323 :
324 : #if FD_HAS_SSE
325 :
326 : /* fd_mcache_publish_sse is a SSE implementation of fd_mcache_publish.
327 : It is compatible with FD_MCACHE_WAIT and FD_MCACHE_WAIT_SSE. */
328 :
329 : static inline void
330 : fd_mcache_publish_sse( fd_frag_meta_t * mcache, /* Assumed a current local join */
331 : ulong depth, /* Assumed an integer power-of-2 >= BLOCK */
332 : ulong seq,
333 : ulong sig,
334 : ulong chunk, /* Assumed in [0,UINT_MAX] */
335 : ulong sz, /* Assumed in [0,USHORT_MAX] */
336 : ulong ctl, /* Assumed in [0,USHORT_MAX] */
337 : ulong tsorig, /* Assumed in [0,UINT_MAX] */
338 0 : ulong tspub ) { /* Assumed in [0,UINT_MAX] */
339 0 : fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
340 0 : __m128i meta_sse0 = fd_frag_meta_sse0( fd_seq_dec( seq, 1UL ), sig );
341 0 : __m128i meta_sse1 = fd_frag_meta_sse1( chunk, sz, ctl, tsorig, tspub );
342 0 : FD_COMPILER_MFENCE();
343 0 : _mm_store_si128( &meta->sse0, meta_sse0 );
344 0 : FD_COMPILER_MFENCE();
345 0 : _mm_store_si128( &meta->sse1, meta_sse1 );
346 0 : FD_COMPILER_MFENCE();
347 0 : meta->seq = seq;
348 0 : FD_COMPILER_MFENCE();
349 0 : }
350 :
351 : #endif
352 :
353 : #if FD_HAS_AVX
354 :
355 : /* fd_mcache_publish_avx is an AVX implementation of fd_mcache_publish.
356 : It is compatible with FD_MCACHE_WAIT, FD_MCACHE_WAIT_SSE and
357 : FD_MCACHE_WAIT_AVX. It requires a target for which aligned AVX
358 : stores are guaranteed atomic under the hood (see below for more
359 : details). */
360 :
361 : static inline void
362 : fd_mcache_publish_avx( fd_frag_meta_t * mcache, /* Assumed a current local join */
363 : ulong depth, /* Assumed an integer power-of-2 >= BLOCK */
364 : ulong seq,
365 : ulong sig,
366 : ulong chunk, /* Assumed in [0,UINT_MAX] */
367 : ulong sz, /* Assumed in [0,USHORT_MAX] */
368 : ulong ctl, /* Assumed in [0,USHORT_MAX] */
369 : ulong tsorig, /* Assumed in [0,UINT_MAX] */
370 65874 : ulong tspub ) { /* Assumed in [0,UINT_MAX] */
371 65874 : fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
372 65874 : __m256i meta_avx = fd_frag_meta_avx( seq, sig, chunk, sz, ctl, tsorig, tspub );
373 65874 : FD_COMPILER_MFENCE();
374 65874 : _mm256_store_si256( &meta->avx, meta_avx );
375 65874 : FD_COMPILER_MFENCE();
376 65874 : }
377 :
378 : #endif
379 :
380 : /* FD_MCACHE_WAIT does a bounded wait for a producer to transmit a
381 : particular frag.
382 :
383 : meta (fd_frag_meta_t * compatible) is the location on the caller
384 : where the wait should save the found metadata. This typically
385 : points to a stack temporary.
386 :
387 : mline (fd_frag_meta_t const * compatible) will be
388 : mcache + fd_mcache_line_idx( seq_expected, depth )
389 : when the wait does not time out. This is the location where the
390 : caller can verify (after any speculative processing of seq_expected)
391 : the producer did not clobber the consumer during the processing.
392 :
393 : seq_found (ulong compatible) will be the sequence number found at
394 : mline when the wait does not time out. This will be seq_expected
395 : on a successful wait.
396 :
397 : seq_diff (long compatible) will be how many sequence numbers
398 : seq_found is ahead of seq_expected when the wait does not time out
399 : fd_seq_diff( seq_found, seq_expected )
400 : This will be zero on a successful wait. This will be positive
401 : otherwise and a lower bound of how far behind the consumer is from
402 : the producer (and seq_found will typically be a reasonably recently
403 : produced sequence number).
404 :
405 : poll_max (ulong compatible) is the number of times FD_MCACHE_WAIT
406 : will poll the mcache of the given depth for seq_expected before
407 : timing out. poll_max should be positive on input. (Note: using
408 : ULONG_MAX for poll_max practically turns this into a blocking wait as
409 : this would take hundreds of years to complete on realistic platforms.)
410 : If poll_max is zero on completion of the WAIT, the wait timed out.
411 :
412 : mcache (fd_frag_meta_t const * compatible) is a current local join to
413 : the mcache the producer uses to cache metadata for the frags it is
414 : producing.
415 :
416 : depth (a ulong compatible power of two of at least FD_MCACHE_BLOCK)
417 : is the number of entries in mcache.
418 :
419 : seq_expected (ulong compatible) is the sequence number to wait to be
420 : produced.
421 :
422 : On completion of the WAIT, if poll_max is zero, the WAIT timed out
423 : and none of the other outputs (meta, mline, seq_found, seq_diff)
424 : should be trusted. If poll_max is non-zero, it will be the original
425 : poll_max value decremented by the number of polls it took for the
426 : WAIT to complete and the WAIT did not time out.
427 :
428 : When the WAIT did not timeout, mline, seq_found and seq_diff can be
429 : trusted. If seq_diff is positive, the caller has fallen more than
430 : depth behind the producer such that metadata for frag seq_expected is
431 : no longer available via the mcache. IMPORTANT! *META MIGHT NOT BE
432 : VALID FOR SEQ_FOUND WHEN CONSUMER HAS FALLEN BEHIND (e.g. if the
433 : producer is paused after it starts writing metadata but before it has
434 : completed writing it ... an unreliable overrun consumer that reads
435 : the metadata while the producer is paused will observe metadata that
436 : is a mix of the new metadata and old metadata with a bogus sequence
437 : number on it). seq_diff is a lower bound of how far the caller has
438 : fallen behind the producer and seq_found is a lower bound of where
439 : producer is currently at.
440 :
441 : Otherwise, the caller is within depth of the producer and *meta will
442 : be a local copy of the desired metadata.
443 :
444 : TL;DR Typical usage:
445 :
446 : ... Example HPC receiver run loop setup
447 :
448 : ulong poll_max = ... number of polls until next housekeeping (positive)
449 : fd_frag_meta_t const * mcache = ... local join to producer's mcache
450 : ulong depth = ... producer's mcache depth
451 : ulong rx_seq = ... next sequence number to receive from producer
452 :
453 : ... Example HPC receiver run loop structure
454 :
455 : for(;;) {
456 :
457 : fd_frag_meta_t meta[1];
458 : fd_frag_meta_t const * mline;
459 : ulong tx_seq;
460 : long seq_diff;
461 : FD_MCACHE_WAIT( meta, mline, tx_seq, seq_diff, poll_max, mcache, depth, rx_seq );
462 :
463 : ... At this point, poll_max can be trusted and has been
464 : ... decremented by the number of polls that were done by the wait
465 : ... from its value at the start of the wait. We either timed
466 : ... out waiting, detected we've been overrun or received the
467 : ... desired meta data.
468 :
469 : if( FD_UNLIKELY( !poll_max ) ) {
470 :
471 : ... We timed out. Do background housekeeping.
472 :
473 : poll_max = ... Reload for the next housekeeping (positive and
474 : ... ideally somewhat randomized each time). Value
475 : ... depends on how aggressively the run loop needs
476 : ... to do background tasks such as
477 : ... command-and-control interactions, monitoring
478 : ... diagnostics, maintenance, etc).
479 :
480 : continue;
481 : }
482 :
483 : ... At this point, poll_max, mline, tx_seq and seq_diff can be
484 : ... trusted. We either have been overrun or received the desired
485 : ... metadata. poll_max>0 and seq_diff==fd_seq_diff(tx_seq,rx_seq).
486 :
487 : if( FD_UNLIKELY( seq_diff ) ) {
488 :
489 : ... We got overrun by the producer. tx_seq is an estimate
490 : ... (typically within depth and often much closer) of where the
491 : ... producer currently is at. Technically, this branch should
492 : ... never be exercised on reliable consumers but is a generally
493 : ... good idea regardless to detect / protect against flow
494 : ... control misconfigurations, bugs in the consumer, etc.
495 : ... Overrun handling could be as simple as "rx_seq = tx_seq;"
496 : ... here (but applications will typically do more elaborate
497 : ... application specific handling)
498 :
499 : continue;
500 : }
501 :
502 : ... We received meta data for frag rx_seq. At this point, meta,
503 : ... tx_seq, seq_diff and poll_max can be trusted. poll_max>=0UL,
504 : ... tx_seq==rx_seq and seq_diff==0L.
505 :
506 : ... Process meta->* at the run loop's leisure and speculatively
507 : ... process actual frag data as necessary here.
508 :
509 : tx_seq = fd_frag_meta_seq_query( mline );
510 : if( FD_UNLIKELY( fd_seq_ne( tx_seq, rx_seq ) ) ) {
511 :
512 : ... We got overrun by the producer while speculatively
513 : ... processing data pointed to by meta. Same considerations
514 : ... as above for overrun handling.
515 :
516 : continue;
517 : }
518 :
519 : ... Advance to the producer's next sequence number.
520 :
521 : rx_seq = fd_seq_inc( rx_seq, 1UL );
522 : }
523 :
524 : This assumes the producer either writes the entire metadata cache
525 : line atomically (on targets where aligned AVX writes are in fact
526 : atomic) or writes the metadata cache line in a particular order:
527 :
528 : FD_COMPILER_MFENCE();
529 : mcache_line->seq = fd_seq_dec( seq, 1UL ); // atomically marks cache line as in the process of writing seq
530 : // This implicitly atomically evicts frag metadata for cache line
531 : // seq-depth cycle
532 : FD_COMPILER_MFENCE();
533 : ... update the actual cache line body without changing mcache_line->seq ...
534 : FD_COMPILER_MFENCE();
535 : mcache_line->seq = seq; // atomically marks metadata for frag seq as available for consumers
536 : FD_COMPILER_MFENCE();
537 :
538 : Note that the above writes can be SSE accelerated on AVX platforms (where
539 : aligned SSE writes are guaranteed to be atomic) as:
540 :
541 : FD_COMPILER_MFENCE();
542 : _mm_store_si128( &mcache_line->sse0, fd_frag_meta_sse0( fd_seq_dec( seq, 1UL ), sig ) );
543 : FD_COMPILER_MFENCE();
544 : _mm_store_si128( &mcache_line->sse1, fd_frag_meta_sse1( chunk, sz, ctl, tsorig, tspub ) );
545 : FD_COMPILER_MFENCE();
546 : mcache_line->seq = seq;
547 : FD_COMPILER_MFENCE();
548 :
549 : Note that the above uses no expensive atomic operations or hardware
550 : memory fences under the hood as these are not required for x86-style
551 : cache coherency. Specifically, Intel Architecture Software Developer
552 : Manual 3A-8-9:
553 :
554 : "Reads are not reordered with other reads."
555 :
556 : and 3A-8-10:
557 :
558 : "Writes by a single processor are observed in the same order by all
559 : processors."
560 :
561 : This makes heavy use of compiler memory fences though to ensure that
562 : compiler optimizations do not reorder how the operations are issued
563 : to CPUs (and thus also implies the operation acts as a compiler memory
564 : fence overall).
565 :
566 : Non-x86 platforms that use different cache coherency models may
567 : require modification of the below to use more explicit fencing or
568 : what not.
569 :
570 : The below is implemented as a macro to facilitate use in ultra high
571 : performance run loops and support multiple return values. This macro
572 : is robust (e.g. it evaluates its arguments a minimal number of times). */
573 :
574 0 : #define FD_MCACHE_WAIT( meta, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do { \
575 0 : ulong _fd_mcache_wait_seq_expected = (seq_expected); \
576 0 : fd_frag_meta_t const * _fd_mcache_wait_mline = (mcache) \
577 0 : + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
578 0 : fd_frag_meta_t * _fd_mcache_wait_meta = (meta); \
579 0 : ulong _fd_mcache_wait_seq_found; \
580 0 : long _fd_mcache_wait_seq_diff; \
581 0 : ulong _fd_mcache_wait_poll_max = (poll_max); \
582 0 : for(;;) { \
583 0 : FD_COMPILER_MFENCE(); \
584 0 : _fd_mcache_wait_seq_found = _fd_mcache_wait_mline->seq; /* atomic */ \
585 0 : FD_COMPILER_MFENCE(); \
586 0 : *_fd_mcache_wait_meta = *_fd_mcache_wait_mline; /* probably non-atomic, typically fast L1 cache hit */ \
587 0 : FD_COMPILER_MFENCE(); \
588 0 : ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic, typically fast L1 cache hit */ \
589 0 : FD_COMPILER_MFENCE(); \
590 0 : _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected ); \
591 0 : int _fd_mcache_wait_done = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
592 0 : | (!--_fd_mcache_wait_poll_max); \
593 0 : FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */ \
594 0 : if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */ \
595 0 : FD_SPIN_PAUSE(); \
596 0 : } \
597 0 : (mline) = _fd_mcache_wait_mline; \
598 0 : (seq_found) = _fd_mcache_wait_seq_found; \
599 0 : (seq_diff) = _fd_mcache_wait_seq_diff; \
600 0 : (poll_max) = _fd_mcache_wait_poll_max; \
601 0 : } while(0)
602 :
603 : /* FD_MCACHE_WAIT_REG: similar to FD_MCACHE_WAIT but uses (nominally)
604 : registers to hold the metadata instead of a local buffer. */
605 :
606 : #define FD_MCACHE_WAIT_REG( sig, chunk, sz, ctl, tsorig, tspub, mline, seq_found, seq_diff, poll_max, \
607 153631907 : mcache, depth, seq_expected ) do { \
608 153631907 : ulong _fd_mcache_wait_seq_expected = (seq_expected); \
609 153631907 : fd_frag_meta_t const * _fd_mcache_wait_mline = (mcache) \
610 153631907 : + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
611 153631907 : ulong _fd_mcache_wait_poll_max = (poll_max); \
612 153631907 : ulong _fd_mcache_wait_sig; \
613 153631907 : ulong _fd_mcache_wait_chunk; \
614 153631907 : ulong _fd_mcache_wait_sz; \
615 153631907 : ulong _fd_mcache_wait_ctl; \
616 153631907 : ulong _fd_mcache_wait_tsorig; \
617 153631907 : ulong _fd_mcache_wait_tspub; \
618 153631907 : ulong _fd_mcache_wait_seq_found; \
619 153631907 : long _fd_mcache_wait_seq_diff; \
620 3317208878 : for(;;) { \
621 3317208878 : FD_COMPILER_MFENCE(); \
622 3317208878 : _fd_mcache_wait_seq_found = _fd_mcache_wait_mline->seq; /* atomic */ \
623 3317208878 : FD_COMPILER_MFENCE(); \
624 3317208878 : _fd_mcache_wait_sig = _fd_mcache_wait_mline->sig; \
625 3317208878 : _fd_mcache_wait_chunk = (ulong)_fd_mcache_wait_mline->chunk; \
626 3317208878 : _fd_mcache_wait_sz = (ulong)_fd_mcache_wait_mline->sz; \
627 3317208878 : _fd_mcache_wait_ctl = (ulong)_fd_mcache_wait_mline->ctl; \
628 3317208878 : _fd_mcache_wait_tsorig = (ulong)_fd_mcache_wait_mline->tsorig; \
629 3317208878 : _fd_mcache_wait_tspub = (ulong)_fd_mcache_wait_mline->tspub; \
630 3317208878 : FD_COMPILER_MFENCE(); \
631 3317208878 : ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic, typically fast L1 cache hit */ \
632 3317208878 : FD_COMPILER_MFENCE(); \
633 3317208878 : _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected ); \
634 3317208878 : int _fd_mcache_wait_done = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
635 3317208878 : | (!--_fd_mcache_wait_poll_max); \
636 3317208878 : FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */ \
637 3317208878 : if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */ \
638 3317208878 : FD_SPIN_PAUSE(); \
639 3163576971 : } \
640 153631907 : (sig) = _fd_mcache_wait_sig; \
641 153631907 : (chunk) = _fd_mcache_wait_chunk; \
642 153631907 : (sz) = _fd_mcache_wait_sz; \
643 153631907 : (ctl) = _fd_mcache_wait_ctl; \
644 153631907 : (tsorig) = _fd_mcache_wait_tsorig; \
645 153631907 : (tspub) = _fd_mcache_wait_tspub; \
646 153631907 : (mline) = _fd_mcache_wait_mline; \
647 153631907 : (seq_found) = _fd_mcache_wait_seq_found; \
648 153631907 : (seq_diff) = _fd_mcache_wait_seq_diff; \
649 153631907 : (poll_max) = _fd_mcache_wait_poll_max; \
650 153631907 : } while(0)
651 :
652 : #if FD_HAS_AVX
653 :
654 : /* FD_MCACHE_WAIT_SSE: similar to FD_MCACHE_WAIT but uses a pair of SSE
655 : registers to hold the metadata instead of a local buffer. This is
656 : only valid on targets with the FD_HAS_AVX capability (see
657 : fd_tango_base.h for details on Intel's atomicity guarantees). */
658 :
659 : #define FD_MCACHE_WAIT_SSE( meta_sse0, meta_sse1, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do { \
660 : ulong _fd_mcache_wait_seq_expected = (seq_expected); \
661 : fd_frag_meta_t const * _fd_mcache_wait_mline = (mcache) \
662 : + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
663 : __m128i _fd_mcache_wait_meta_sse0; \
664 : __m128i _fd_mcache_wait_meta_sse1; \
665 : ulong _fd_mcache_wait_seq_found; \
666 : long _fd_mcache_wait_seq_diff; \
667 : ulong _fd_mcache_wait_poll_max = (poll_max); \
668 : for(;;) { \
669 : FD_COMPILER_MFENCE(); \
670 : _fd_mcache_wait_meta_sse0 = _mm_load_si128( &_fd_mcache_wait_mline->sse0 ); /* atomic */ \
671 : FD_COMPILER_MFENCE(); \
672 : _fd_mcache_wait_meta_sse1 = _mm_load_si128( &_fd_mcache_wait_mline->sse1 ); /* atomic, typ fast L1 hit */ \
673 : FD_COMPILER_MFENCE(); \
674 : ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic, typically fast L1 cache hit */ \
675 : FD_COMPILER_MFENCE(); \
676 : _fd_mcache_wait_seq_found = fd_frag_meta_sse0_seq( _fd_mcache_wait_meta_sse0 ); \
677 : _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected ); \
678 : int _fd_mcache_wait_done = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
679 : | (!--_fd_mcache_wait_poll_max); \
680 : FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */ \
681 : if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */ \
682 : FD_SPIN_PAUSE(); \
683 : } \
684 : (meta_sse0) = _fd_mcache_wait_meta_sse0; \
685 : (meta_sse1) = _fd_mcache_wait_meta_sse1; \
686 : (mline) = _fd_mcache_wait_mline; \
687 : (seq_found) = _fd_mcache_wait_seq_found; \
688 : (seq_diff) = _fd_mcache_wait_seq_diff; \
689 : (poll_max) = _fd_mcache_wait_poll_max; \
690 : } while(0)
691 :
692 : /* FD_MCACHE_WAIT_AVX: similar to FD_MCACHE_WAIT_SSE but uses a single
693 : AVX register to hold the found metadata instead of a local buffer.
694 : This is only valid for targets that have atomic AVX load / stores
695 : (not guaranteed across all AVX supporting CPUs and Intel is
696 : deliberately vague about which ones do have it) and a producer that
697 : similarly uses atomic AVX writes for metadata publication. On the
698 : overrun case here, meta_avx will in fact be the metadata for the
699 : overrun sequence number. */
700 :
701 : #define FD_MCACHE_WAIT_AVX( meta_avx, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do { \
702 : ulong _fd_mcache_wait_seq_expected = (seq_expected); \
703 : fd_frag_meta_t const * _fd_mcache_wait_mline = (mcache) \
704 : + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
705 : __m256i _fd_mcache_wait_meta_avx; \
706 : ulong _fd_mcache_wait_seq_found; \
707 : long _fd_mcache_wait_seq_diff; \
708 : ulong _fd_mcache_wait_poll_max = (poll_max); \
709 : for(;;) { \
710 : FD_COMPILER_MFENCE(); \
711 : _fd_mcache_wait_meta_avx = _mm256_load_si256( &_fd_mcache_wait_mline->avx ); /* atomic */ \
712 : FD_COMPILER_MFENCE(); \
713 : _fd_mcache_wait_seq_found = fd_frag_meta_avx_seq( _fd_mcache_wait_meta_avx ); \
714 : _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected ); \
715 : int _fd_mcache_wait_done = (_fd_mcache_wait_seq_diff>=0L) | (!--_fd_mcache_wait_poll_max); \
716 : FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */ \
717 : if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */ \
718 : FD_SPIN_PAUSE(); \
719 : } \
720 : (meta_avx) = _fd_mcache_wait_meta_avx; \
721 : (mline) = _fd_mcache_wait_mline; \
722 : (seq_found) = _fd_mcache_wait_seq_found; \
723 : (seq_diff) = _fd_mcache_wait_seq_diff; \
724 : (poll_max) = _fd_mcache_wait_poll_max; \
725 : } while(0)
726 :
727 : #endif
728 :
729 : /* fd_mcache_query returns seq_query if seq_query is still in the mcache
730 : (assumed to be a current local mcache join) with depth entries (depth
731 : is assumed to be an integer power of two of at least
732 : FD_MCACHE_BLOCK). It will return a sequence number before seq_query
733 : if the seq_query has not yet been published. It will return a
734 : sequence after seq_query if seq_query is no longer available in the
735 : mcache. In this last case, seq_query will be typically be within
736 : depth of the most recently published sequence number as of some point
737 : in time between when the call was made and the call returned (in many
738 : common uses, this is typically very very close to most recently
739 : published sequence number). This acts as a compiler memory fence. */
740 :
741 : static inline ulong
742 : fd_mcache_query( fd_frag_meta_t const * mcache,
743 : ulong depth,
744 9000000 : ulong seq_query ) {
745 9000000 : return fd_frag_meta_seq_query( mcache + fd_mcache_line_idx( seq_query, depth ) );
746 9000000 : }
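
/* For example, an unreliable consumer could use fd_mcache_query to
   test whether frag rx_seq is currently cached (a minimal sketch):

     ulong seq_found = fd_mcache_query( mcache, depth, rx_seq );
     if( FD_UNLIKELY( fd_seq_ne( seq_found, rx_seq ) ) ) {
       ... rx_seq is either not yet published (seq_found is before
       ... rx_seq cyclic) or has already been evicted (seq_found is
       ... after rx_seq cyclic); handle accordingly
     } */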
747 :
748 : FD_PROTOTYPES_END
749 :
750 : #endif /* HEADER_fd_src_tango_mcache_fd_mcache_h */
751 :
|