LCOV - code coverage report
Current view: top level - tango/mcache - fd_mcache.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 85 125 68.0 %
Date: 2024-11-13 11:58:15 Functions: 23 805 2.9 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_tango_mcache_fd_mcache_h
       2             : #define HEADER_fd_src_tango_mcache_fd_mcache_h
       3             : 
       4             : #include "../fd_tango_base.h"
       5             : 
       6             : /* FD_MCACHE_{ALIGN,FOOTPRINT} specify the alignment and footprint
       7             :    needed for a mcache with depth entries and an application region of
       8             :    size app_sz.  ALIGN is at least FD_FRAG_META_ALIGN and recommended to
       9             :    be at least double cache line to mitigate various kinds of false
      10             :    sharing.  depth and app_sz are assumed to be valid (i.e. depth is an
      11             :    integer power of 2 of at least FD_MCACHE_BLOCK and the combination
      12             :    will not require a footprint larger than ULONG_MAX).  These are
      13             :    provided to facilitate compile time mcache declarations. */
      14             : 
      15       12273 : #define FD_MCACHE_ALIGN (128UL)
      16             : #define FD_MCACHE_FOOTPRINT( depth, app_sz )                                                              \
      17             :   FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \
      18             :     FD_MCACHE_ALIGN, 128UL                           ), /* hdr  */                                        \
      19             :     FD_MCACHE_ALIGN, FD_MCACHE_SEQ_CNT*sizeof(ulong) ), /* seq  */                                        \
      20             :     FD_MCACHE_ALIGN, (depth)*sizeof(fd_frag_meta_t)  ), /* meta */                                        \
      21             :     FD_MCACHE_ALIGN, (app_sz)                        ), /* app  */                                        \
      22             :     FD_MCACHE_ALIGN )
      23             : 
      24             : /* FD_MCACHE_SEQ_CNT specifies the number of entries in the mcache's seq
      25             :    storage region.  It is aligned FD_MCACHE_ALIGN.  Multiples of 16 have
      26             :    good Feng Shui.  seq[0] has special meaning; see below for details. */
      27             : 
      28         102 : #define FD_MCACHE_SEQ_CNT (16UL)
      29             : 
      30             : /* FD_MCACHE_{LG_BLOCK,LG_INTERLEAVE,BLOCK} specifies how recent
      31             :    fragment meta data should be packed into mcaches.  LG_BLOCK should be
   in [1,64).  LG_INTERLEAVE should be in [0,FD_MCACHE_LG_BLOCK).  BLOCK ==
      33             :    2^LG_BLOCK.  See below for more details. */
      34             : 
      35             : #define FD_MCACHE_LG_BLOCK      (7)
      36             : #define FD_MCACHE_LG_INTERLEAVE (0)
      37             : #define FD_MCACHE_BLOCK         (128UL) /* == 2^FD_MCACHE_LG_BLOCK, explicit to workaround compiler limitations */
      38             : 
      39             : FD_PROTOTYPES_BEGIN
      40             : 
      41             : /* Construction API */
      42             : 
      43             : /* fd_mcache_{align,footprint} return the required alignment and
      44             :    footprint of a memory region suitable for use as mcache with depth
      45             :    entries.  align returns FD_MCACHE_ALIGN.  If depth is invalid (e.g.
      46             :    not an integer power-of-2 >= FD_MCACHE_BLOCK or the footprint is
      47             :    larger than ULONG_MAX), footprint will silently return 0 (and thus
      48             :    can be used by the caller to validate mcache configuration
      49             :    parameters). */
      50             : 
      51             : FD_FN_CONST ulong
      52             : fd_mcache_align( void );
      53             : 
      54             : FD_FN_CONST ulong
      55             : fd_mcache_footprint( ulong depth,
      56             :                      ulong app_sz );
      57             : 
      58             : /* fd_mcache_new formats an unused memory region for use as a mcache.
      59             :    shmem is a non-NULL pointer to this region in the local address space
      60             :    with the required footprint and alignment.  depth is the number of
      61             :    cache entries (should be an integer power of 2 >= FD_MCACHE_BLOCK).
      62             :    The mcache will also have an app_sz byte application region for
      63             :    application specific usage.  seq0 is the initial fragment sequence
      64             :    number a producer should use for this mcache.
      65             : 
   The cache entries will be initialized such that all queries for any
      67             :    sequence number will fail immediately after creation.  They will
      68             :    further be initialized such that for any consumer initialized to
      69             :    start receiving a sequence number at or after seq0 will think it is
      70             :    ahead of the producer (such that it will wait for its sequence number
      71             :    cleanly instead of immediately trying to recover a gap).  Conversely,
      72             :    consumers initialized to start receiving a sequence number before
   seq0 will think they are behind the producer (thus realize it has been
      74             :    incorrectly initialized and can recover appropriately).  Anybody who
      75             :    looks at the mcache entries directly will also see the entries are
   initialized to have zero sz (such that they shouldn't try to dereference
      77             :    any fragment payloads), have the SOM and EOM bits set (so they
      78             :    shouldn't try to interpret the entry as part of some message spread
      79             :    over multiple fragments) and have the ERR bit set (so they don't
      80             :    think there is any validity to the meta data or payload).
      81             : 
      82             :    The application region will be initialized to zero.
      83             : 
      84             :    Returns shmem (and the memory region it points to will be formatted
      85             :    as a mcache, caller is not joined) on success and NULL on failure
      86             :    (logs details).  Reasons for failure include obviously bad shmem or
      87             :    bad depth. */
      88             : 
      89             : void *
      90             : fd_mcache_new( void * shmem,
      91             :                ulong  depth,
      92             :                ulong  app_sz,
      93             :                ulong  seq0 );
      94             : 
      95             : /* fd_mcache_join joins the caller to the mcache.  shmcache points to
      96             :    the first byte of the memory region backing the mcache in the
      97             :    caller's address space.
      98             : 
      99             :    Returns a pointer in the local address space to the mcache's entries
     100             :    on success (IMPORTANT! THIS IS NOT JUST A CAST OF SHMCACHE) and NULL
     101             :    on failure (logs details).  Reasons for failure are that shmcache is
     102             :    obviously not a pointer to memory region holding a mcache.  Every
     103             :    successful join should have a matching leave.  The lifetime of the
     104             :    join is until the matching leave or thread group is terminated.
     105             : 
     106             :    Entries are indexed [0,depth) and the mapping from sequence number to
     107             :    depth is nontrivial (see below for accessors and mapping functions).
   There are no restrictions on the number of joins overall and a single
     109             :    thread can join multiple times (all joins to the same shmcache laddr
     110             :    will return same mcache laddr). */
     111             : 
     112             : fd_frag_meta_t *
     113             : fd_mcache_join( void * shmcache );
     114             : 
     115             : /* fd_mcache_leave leaves a current local join.  Returns a pointer to
     116             :    the underlying shared memory region on success (IMPORTANT!  THIS IS
     117             :    NOT JUST A CAST OF MCACHE) and NULL on failure (logs details).
     118             :    Reasons for failure include mcache is NULL. */
     119             : 
     120             : void *
     121             : fd_mcache_leave( fd_frag_meta_t const * mcache );
     122             : 
     123             : /* fd_mcache_delete unformats a memory region used as a mcache.  Assumes
     124             :    nobody is joined to the region.  Returns a pointer to the underlying
     125             :    shared memory region or NULL if used obviously in error (e.g.
     126             :    shmcache is obviously not a mcache ...  logs details).  The ownership
     127             :    of the memory region is transferred to the caller. */
     128             : 
     129             : void *
     130             : fd_mcache_delete( void * shmcache );
     131             : 
     132             : /* Accessor API */
     133             : 
/* fd_mcache_{depth,app_sz,seq0} return the values corresponding to those used
     135             :    at the mcache's construction.  Assume mcache is a current local join. */
     136             : 
     137             : FD_FN_PURE ulong fd_mcache_depth ( fd_frag_meta_t const * mcache );
     138             : FD_FN_PURE ulong fd_mcache_app_sz( fd_frag_meta_t const * mcache );
     139             : FD_FN_PURE ulong fd_mcache_seq0  ( fd_frag_meta_t const * mcache );
     140             : 
     141             : /* fd_mcache_seq_laddr returns location in the caller's local address
     142             :    space of mcache's sequence array.  This array is indexed
     143             :    [0,FD_MCACHE_SEQ_CNT) with FD_MCACHE_ALIGN alignment (double cache
     144             :    line).  laddr_const is a const correct version.  Assumes mcache is a
     145             :    current local join.  The lifetime of the returned pointer is the same
     146             :    as the underlying join.
     147             : 
     148             :    seq[0] has special meaning.  Specifically, sequence numbers in
     149             :    [seq0,seq[0]) cyclic are guaranteed to have been published.  seq[0]
     150             :    is not strictly atomically updated by the producer when it publishes
     151             :    so seq[0] can lag the most recently published sequence number
     152             :    somewhat.  As seq[0] is moderately to aggressively frequently updated
     153             :    by the mcache's producer (depending on the application), this is on
     154             :    its own cache line pair to avoid false sharing.  seq[0] is mostly
     155             :    used for monitoring, initialization and support for some methods for
     156             :    unreliable consumer overrun handling.
     157             : 
     158             :    The meaning of the remaining sequence numbers is application
     159             :    dependent.  Application should try to restrict any use of these to
     160             :    ones that are seq[0] cache-friendly (e.g. use for producer write
     161             :    oriented cases or use for rarely used cases). */
     162             : 
     163             : FD_FN_CONST ulong const * fd_mcache_seq_laddr_const( fd_frag_meta_t const * mcache );
     164             : FD_FN_CONST ulong *       fd_mcache_seq_laddr      ( fd_frag_meta_t *       mcache );
     165             : 
     166             : /* fd_mcache_app_laddr returns location in the caller's local address
     167             :    space of memory set aside for application specific usage.  Assumes
     168             :    mcache is a current local join.  The lifetime of the returned pointer
     169             :    is the same as the underlying join.  This region has FD_MCACHE_ALIGN
     170             :    alignment (double cache line) and is fd_mcache_app_sz( mcache ) in
     171             :    size.  laddr_const is a const-correct version. */
     172             : 
     173             : FD_FN_PURE uchar const * fd_mcache_app_laddr_const( fd_frag_meta_t const * mcache );
     174             : FD_FN_PURE uchar *       fd_mcache_app_laddr      ( fd_frag_meta_t *       mcache );
     175             : 
     176             : /* fd_mcache_seq_query atomically reads the mcache's seq[0] (e.g. from
     177             :    fd_mcache_seq_laddr_const) to get a lower bound of where the producer
     178             :    is at in sequence space (in the sense that the producer guarantees it
     179             :    has produced all sequence numbers strictly before the return value
     180             :    cyclic).  This is usually done at consumer startup and, for some
     181             :    unreliable consumer overrun handling, during consumer overrun
     182             :    recovery.  It is strongly recommended for consumers to avoid using
     183             :    this as much as possible to limit cache line ping-ponging with the
     184             :    producer. */
     185             : 
static inline ulong
fd_mcache_seq_query( ulong const * _seq ) {
  FD_COMPILER_MFENCE();                   /* don't let the compiler move earlier memory ops past the read */
  ulong seq = FD_VOLATILE_CONST( *_seq ); /* volatile forces an actual load of *_seq at this point */
  FD_COMPILER_MFENCE();                   /* don't let the compiler move later memory ops before the read */
  return seq;
}
     193             : 
     194             : /* fd_mcache_seq_update updates the mcache's seq[0] (e.g. from
   fd_mcache_seq_laddr) with a lower bound of where the producer is
   currently at (in the sense that the producer has
     197             :    produced all sequence numbers strictly before seq cyclic).  This
     198             :    should be monotonically non-decreasing.  This should be done
     199             :    moderately frequently (e.g. in background housekeeping) after the
     200             :    producer has moved forward in sequence space since the last update.
     201             :    Even more aggressively is usually fine.  This should also be done
     202             :    when the producer is shutdown to facilitate cleanly restarting a
     203             :    producer and what not.  This also serves as a compiler memory fence
     204             :    to ensure the sequence number is updated at a well defined point in
     205             :    the instruction stream (e.g. so that compiler doesn't move any stores
     206             :    from before the update to after the above). */
     207             : 
static inline void
fd_mcache_seq_update( ulong * _seq,
                      ulong   seq ) {
  FD_COMPILER_MFENCE();       /* pin earlier stores (e.g. published frag metadata) before the update */
  FD_VOLATILE( *_seq ) = seq; /* volatile forces an actual store to *_seq at this point */
  FD_COMPILER_MFENCE();       /* pin the update before any later stores */
}
     215             : 
     216             : /* fd_mcache_line_idx returns the index of the cache line in a depth
     217             :    entry mcache (depth is assumed to be a power of 2) where the
     218             :    metadata for the frag with sequence number seq will be stored when it
     219             :    is in cache.  Outside of startup transients, a mcache is guaranteed
     220             :    to exactly hold the depth most recently sequence numbers (the act of
     221             :    publishing a new sequence number atomically unpublishes the oldest
     222             :    sequence number implicitly).
     223             : 
     224             :    FD_MCACHE_LG_INTERLEAVE is in [0,FD_MCACHE_LG_BLOCK) and controls the
     225             :    details of this mapping.  LG_INTERLEAVE 0 indicates no interleaving.
     226             :    Values from 1 to LG_BLOCK space out sequential frag meta data in
     227             :    memory to avoid false sharing between producers and fast consumers to
     228             :    keep fast consumers low latency while keeping frag meta data storage
     229             :    compact in memory to help throughput of slow consumers.
     230             : 
     231             :    Specifically, at a LG_INTERLEAVE of i with s byte frag meta data,
     232             :    meta data storage for sequential frags is typically s*2^i bytes
     233             :    apart.  To avoid wasting memory and bandwidth, the interleaving is
     234             :    implemented by doing a rotation of the least LG_BLOCK bits of the lg
     235             :    depth bits of the sequence number (NOTE: this imposes a requirement
     236             :    that mcaches have at least a depth of 2^LG_BLOCK fragments).  This
     237             :    yields a frag sequence number to line idx mapping that avoids false
     238             :    sharing for fast consumers and maintains compactness, avoids TLB
     239             :    thrashing (even if meta data is backed by normal pages) and exploits
     240             :    CPU data and TLB prefetching behavior for slow consumers.
     241             : 
     242             :    How useful block interleaving is somewhat application dependent.
     243             :    Different values have different trade offs between optimizing for
     244             :    fast and slow consumers and for different sizes of meta data and
     245             :    different page size backing memory.
     246             : 
     247             :    Using 0 / B for FD_MCACHE_LG_INTERLEAVE / LG_BLOCK will disable meta
     248             :    data interleaving while still requiring mcaches be at least 2^B in
     249             :    size.  This implicitly optimizes for slow consumers.  Something like
     250             :    2 / 7 (with a 32-byte size 32-byte aligned fd_frag_meta_t and a
     251             :    mcache that is at least normal page aligned) will access cached meta
     252             :    data in sequential blocks of 128 message fragments that are normal
     253             :    page size and aligned while meta data within those blocks will
     254             :    typically be strided at double DRAM cache line granularity.  As such,
     255             :    fast consumers (e.g. those within 32 of the producers) will rarely
     256             :    have false sharing with the producers as nearby sequence numbers are
     257             :    on different DRAM cache line pairs.  And slow consumers (e.g. ones
     258             :    that fall more than 128 fragments behind) will access meta data in a
     259             :    very DRAM cache friendly / data prefetcher / TLB friendly / bandwidth
     260             :    efficient manner (and without needing to load any prefilterable
     261             :    payload data while completely avoiding memory being written by the
     262             :    producer).  That is, it typically has good balance of performance for
     263             :    both fast and slow consumers simultaneously. */
     264             : 
     265             : #if FD_MCACHE_LG_INTERLEAVE==0
     266             : 
     267             : FD_FN_CONST static inline ulong /* Will be in [0,depth) */
     268             : fd_mcache_line_idx( ulong seq,
     269  9388301983 :                     ulong depth ) { /* Assumed power of 2 >= BLOCK */
     270  9388301983 :   return seq & (depth-1UL);
     271  9388301983 : }
     272             : 
     273             : #else
     274             : 
     275             : FD_FN_CONST static inline ulong /* Will be in [0,depth) */
     276             : fd_mcache_line_idx( ulong seq,
     277             :                     ulong depth ) { /* Assumed power of 2 >= BLOCK */
     278             :   ulong block_mask = FD_MCACHE_BLOCK - 1UL; /* Compile time */
     279             :   ulong page_mask  = (depth-1UL) & (~block_mask);    /* Typically compile time or loop invariant */
     280             :   ulong page = seq & page_mask;
     281             :   ulong bank = (seq << FD_MCACHE_LG_INTERLEAVE) & block_mask;
     282             :   ulong idx  = (seq & block_mask) >> (FD_MCACHE_LG_BLOCK-FD_MCACHE_LG_INTERLEAVE);
     283             :   return page | bank | idx;
     284             : }
     285             : 
     286             : #endif
     287             : 
     288             : /* fd_mcache_publish inserts the metadata for frag seq into the given
     289             :    depth entry mcache in a way compatible with FD_MCACHE_WAIT and
     290             :    FD_MCACHE_WAIT_SSE (but not FD_MCACHE_WAIT_AVX ... see FD_MCACHE_WAIT
     291             :    for more details).  This implicitly evicts the metadata for the
     292             :    sequence number currently stored at fd_mcache_line_idx( seq, depth ).
     293             :    In the typical case where sequence numbers are published into the
     294             :    mcache sequentially, the evicted metadata is typically for frag
     295             :    seq-depth (cyclic).  This does no error checking or the like as it is
     296             :    frequently used in ultra high performance contexts.  This operation
     297             :    implies a compiler mfence to the caller. */
     298             : 
static inline void
fd_mcache_publish( fd_frag_meta_t * mcache,   /* Assumed a current local join */
                   ulong            depth,    /* Assumed an integer power-of-2 >= BLOCK */
                   ulong            seq,
                   ulong            sig,
                   ulong            chunk,    /* Assumed in [0,UINT_MAX] */
                   ulong            sz,       /* Assumed in [0,USHORT_MAX] */
                   ulong            ctl,      /* Assumed in [0,USHORT_MAX] */
                   ulong            tsorig,   /* Assumed in [0,UINT_MAX] */
                   ulong            tspub ) { /* Assumed in [0,UINT_MAX] */
  fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
  FD_COMPILER_MFENCE();
  /* Tag the line with the preceding sequence number while the body is
     being written; a reader polling this line for seq will thus not
     match until the final store below makes the entry visible. */
  meta->seq    = fd_seq_dec( seq, 1UL );
  FD_COMPILER_MFENCE();
  meta->sig    =         sig;
  meta->chunk  = (uint  )chunk;
  meta->sz     = (ushort)sz;
  meta->ctl    = (ushort)ctl;
  meta->tsorig = (uint  )tsorig;
  meta->tspub  = (uint  )tspub;
  FD_COMPILER_MFENCE();
  /* Publish: expose the completed entry under its real sequence number. */
  meta->seq    = seq;
  FD_COMPILER_MFENCE();
}
     323             : 
     324             : #if FD_HAS_SSE
     325             : 
     326             : /* fd_mcache_publish_sse is a SSE implementation of fd_mcache_publish.
     327             :    It is compatible with FD_MCACHE_WAIT and FD_MCACHE_WAIT_SSE. */
     328             : 
static inline void
fd_mcache_publish_sse( fd_frag_meta_t * mcache,   /* Assumed a current local join */
                       ulong            depth,    /* Assumed an integer power-of-2 >= BLOCK */
                       ulong            seq,
                       ulong            sig,
                       ulong            chunk,    /* Assumed in [0,UINT_MAX] */
                       ulong            sz,       /* Assumed in [0,USHORT_MAX] */
                       ulong            ctl,      /* Assumed in [0,USHORT_MAX] */
                       ulong            tsorig,   /* Assumed in [0,UINT_MAX] */
                       ulong            tspub ) { /* Assumed in [0,UINT_MAX] */
  fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
  /* Pack the metadata into two 16-byte halves.  The first half is built
     with seq-1 so the entry does not look valid for seq until the final
     store below (same protocol as fd_mcache_publish). */
  __m128i meta_sse0 = fd_frag_meta_sse0( fd_seq_dec( seq, 1UL ), sig );
  __m128i meta_sse1 = fd_frag_meta_sse1( chunk, sz, ctl, tsorig, tspub );
  FD_COMPILER_MFENCE();
  _mm_store_si128( &meta->sse0, meta_sse0 ); /* write first half (seq-1 tag + sig) */
  FD_COMPILER_MFENCE();
  _mm_store_si128( &meta->sse1, meta_sse1 ); /* write second half */
  FD_COMPILER_MFENCE();
  meta->seq = seq;                           /* publish under the real sequence number */
  FD_COMPILER_MFENCE();
}
     350             : 
     351             : #endif
     352             : 
     353             : #if FD_HAS_AVX
     354             : 
     355             : /* fd_mcache_publish_avx is an AVX implementation of fd_mcache_publish.
     356             :    It is compatible with FD_MCACHE_WAIT, FD_MCACHE_WAIT_SSE and
     357             :    FD_MCACHE_WAIT_AVX.  It requires a target for which aligned AVX
     358             :    stores are guaranteed atomic under the hood (see below for more
     359             :    details). */
     360             : 
static inline void
fd_mcache_publish_avx( fd_frag_meta_t * mcache,   /* Assumed a current local join */
                       ulong            depth,    /* Assumed an integer power-of-2 >= BLOCK */
                       ulong            seq,
                       ulong            sig,
                       ulong            chunk,    /* Assumed in [0,UINT_MAX] */
                       ulong            sz,       /* Assumed in [0,USHORT_MAX] */
                       ulong            ctl,      /* Assumed in [0,USHORT_MAX] */
                       ulong            tsorig,   /* Assumed in [0,UINT_MAX] */
                       ulong            tspub ) { /* Assumed in [0,UINT_MAX] */
  fd_frag_meta_t * meta = mcache + fd_mcache_line_idx( seq, depth );
  __m256i meta_avx = fd_frag_meta_avx( seq, sig, chunk, sz, ctl, tsorig, tspub );
  FD_COMPILER_MFENCE();
  /* A single aligned 32-byte store publishes the whole entry at once,
     so no seq-1 tagging pass is needed (relies on the target making
     aligned AVX stores atomic, per the requirement documented above). */
  _mm256_store_si256( &meta->avx, meta_avx );
  FD_COMPILER_MFENCE();
}
     377             : 
     378             : #endif
     379             : 
     380             : /* FD_MCACHE_WAIT does a bounded wait for a producer to transmit a
     381             :    particular frag.
     382             : 
     383             :    meta (fd_frag_meta_t * compatible) is the location on the caller
     384             :    where the wait should save the found metadata.  This typically
     385             :    points to a stack temporary.
     386             : 
     387             :    mline (fd_frag_meta_t const * compatible) will be
     388             :      mcache + fd_mcache_line_idx( seq_expected, depth )
     389             :    when the wait does not time out.  This is the location where the
     390             :    caller can verify (after any speculative processing of seq_expected)
     391             :    the producer did not clobber the consumer during the processing.
     392             : 
     393             :    seq_found (ulong compatible) will be the sequence number found at
     394             :    mline when the wait does not time out.  This will be seq_expected
     395             :    on a successful wait.
     396             : 
     397             :    seq_diff (long compatible) will be how many sequence numbers ahead
     398             :    of seq_expected when the wait does not time out
     399             :      fd_seq_diff( seq_found, seq_expected )
     400             :    This will be zero on a successful wait.  This will be positive
     401             :    otherwise and a lower bound of how far behind the consumer is from
     402             :    the producer (and seq_found will typically be a reasonably recently
     403             :    produced sequence number).
     404             : 
     405             :    poll_max (ulong compatible) is the number of times FD_MCACHE_WAIT
     406             :    will poll the mcache of the given depth for seq_expected before
     407             :    timing out.  poll_max should be positive on input.  (Note: using
     408             :    ULONG_MAX for poll_max practically turns this into a blocking wait as
     409             :    this take hundreds of years to complete on realistic platforms.)
   If poll_max is zero on completion of the wait, the wait timed out.
     411             : 
     412             :    mcache (fd_frag_meta_t const * compatible) is a current local join to
     413             :    the mcache the producer uses to cache metadata for the frags it is
     414             :    producing.
     415             : 
     416             :    depth (a ulong compatible power of two of at least FD_MCACHE_BLOCK)
     417             :    is the number of entries in mcache.
     418             : 
     419             :    seq_expected (ulong compatible) is the sequence number to wait to be
     420             :    produced.
     421             : 
     422             :    On completion of the WAIT, if poll_max is zero, the WAIT timed out
     423             :    and none of the other outputs (meta, mline, seq_found, seq_diff)
     424             :    should be trusted.  If poll_max is non-zero, it will be the original
     425             :    poll_max value decremented by the number of polls it took for the
     426             :    WAIT to complete and the WAIT did not timeout.
     427             : 
     428             :    When the WAIT did not timeout, mline, seq_found and seq_diff can be
     429             :    trusted.  If seq_diff is positive, the caller has fallen more than
     430             :    depth behind the producer such that metadata for frag seq_expected is
     431             :    no longer available via the mcache.  IMPORTANT!  *META MIGHT NOT BE
     432             :    VALID FOR SEQ_FOUND WHEN CONSUMER HAS FALLEN BEHIND (e.g. if the
     433             :    producer is paused after it starts writing metadata but before it has
     434             :    completed writing it ... an unreliable overrun consumer that reads
     435             :    the metadata while the producer is paused will observe metadata that
     436             :    is a mix of the new metadata and old metadata with a bogus sequence
     437             :    number on it).  seq_diff is a lower bound of how far the caller has
     438             :    fallen behind the producer and seq_found is a lower bound of where
     439             :    producer is currently at.
     440             : 
     441             :    Otherwise, the caller is within depth of the producer and *meta will
     442             :    be a local copy of the desired metadata.
     443             : 
     444             :    TL;DR  Typical usage:
     445             : 
     446             :      ... Example HPC receiver run loop setup
     447             : 
     448             :      ulong                  poll_max = ... number of polls until next housekeeping (positive)
     449             :      fd_frag_meta_t const * mcache   = ... local join to producer's mcache
     450             :      ulong                  depth    = ... producer's mcache depth
     451             :      ulong                  rx_seq   = ... next sequence number to receive from producer
     452             : 
     453             :      ... Example HPC receiver run loop structure
     454             : 
     455             :      for(;;) {
     456             : 
     457             :        fd_frag_meta_t         meta[1];
     458             :        fd_frag_meta_t const * mline;
     459             :        ulong                  tx_seq;
     460             :        long                   seq_diff;
     461             :        FD_MCACHE_WAIT( meta, mline, tx_seq, seq_diff, poll_max, mcache, depth, rx_seq );
     462             : 
     463             :        ... At this point, poll_max can be trusted and has been
     464             :        ... decremented the number of polls that were done by the wait
     465             :        ... from its value at the start of the wait.  We either timed
     466             :        ... out waiting, detected we've been overrun or received the
     467             :        ... desired meta data.
     468             : 
     469             :        if( FD_UNLIKELY( !poll_max ) ) {
     470             : 
     471             :          ... We timed out.  Do background housekeeping.
     472             : 
     473             :          poll_max = ... Reload for the next housekeeping (positive and
     474             :                     ... ideally somewhat randomized each time).  Value
     475             :                     ... depends on how aggressively the run loop needs
     476             :                     ... to do background tasks such as
     477             :                     ... command-and-control interactions, monitoring
     478             :                     ... diagnostics, maintenance, etc).
     479             : 
     480             :          continue;
     481             :        }
     482             : 
     483             :        ... At this point, poll_max, mline, tx_seq and seq_diff can be
     484             :        ... trusted.  We either have been overrun or received the desired
     485             :        ... metadata.  poll_max>0 and seq_diff==fd_seq_diff(tx_seq,rx_seq).
     486             : 
     487             :        if( FD_UNLIKELY( seq_diff ) ) {
     488             : 
     489             :          ... We got overrun by the producer.  tx_seq is an estimate
     490             :          ... (typically within depth and often much closer) of where the
     491             :          ... producer currently is at.  Technically, this branch should
     492             :          ... never be exercised on reliable consumers but is a generally
     493             :          ... good idea regardless to detect / protect against flow
     494             :          ... control misconfigurations, bugs in the consumer, etc.
     495             :          ... Overrun handling could be as simple as "rx_seq = tx_seq;"
     496             :          ... here (but applications will typically do more elaborate
     497             :          ... application specific handling)
     498             : 
     499             :          continue;
     500             :        }
     501             : 
     502             :        ... We received meta data for frag rx_seq.  At this point, meta,
     503             :        ... tx_seq, seq_diff and poll_max can be trusted.  poll_max>=0UL,
     504             :        ... tx_seq==rx_seq and seq_diff==0L.
     505             : 
     506             :        ... Process meta->* at the run loop's leisure and speculatively
     507             :        ... process actual frag data as necessary here.
     508             : 
     509             :        tx_seq = fd_frag_meta_seq_query( mline );
     510             :        if( FD_UNLIKELY( fd_seq_ne( tx_seq, rx_seq ) ) ) {
     511             : 
     512             :          ... We got overrun by the producer while speculatively
     513             :          ... processing data pointed to by meta.  Same considerations
     514             :          ... as above for overrun handling.
     515             : 
     516             :          continue;
     517             :        }
     518             : 
     519             :        ... Advance to the producer's next sequence number.
     520             : 
     521             :        rx_seq = fd_seq_inc( rx_seq, 1UL );
     522             :      }
     523             : 
     524             :    This assumes the producer either writes the entire metadata cache
     525             :    line atomically (on targets where aligned AVX writes are in fact
     526             :    atomic) or writes the metadata cache line in a particular order:
     527             : 
     528             :      FD_COMPILER_MFENCE();
     529             :      mcache_line->seq = fd_seq_dec( seq, 1UL ); // atomically marks cache line as in the process of writing seq
     530             :                                                 // This implicitly atomically evicts frag metadata for cache line
     531             :                                                 // seq-depth cycle
     532             :      FD_COMPILER_MFENCE();
     533             :      ... update the actual cache line body without changing mcache_line->seq ...
     534             :      FD_COMPILER_MFENCE();
     535             :      mcache_line->seq = seq; // atomically marks metadata for frag seq as available for consumers
     536             :      FD_COMPILER_MFENCE();
     537             : 
     538             :    Note that above writes can be SSE accelerated on AVX platforms (where
     539             :    aligned SSE writes are guaranteed to be atomic) as:
     540             : 
     541             :      FD_COMPILER_MFENCE();
     542             :      _mm_store_si128( &mcache_line->sse0, fd_frag_meta_sse0( fd_seq_dec( seq, 1UL ), sig );
     543             :      FD_COMPILER_MFENCE();
     544             :      _mm_store_si128( &mcache_line->sse1, fd_frag_meta_sse1( chunk, sz, ctl, tsorig, tspub );
     545             :      FD_COMPILER_MFENCE();
     546             :      mcache_line->seq = seq;
     547             :      FD_COMPILER_MFENCE();
     548             : 
     549             :    Note that the above uses no expensive atomic operations or hardware
     550             :    memory fences under the hood as these are not required for x86-style
     551             :    cache coherency.  Specifically, Intel Architecture Software Developer
     552             :    Manual 3A-8-9:
     553             : 
     554             :      "Reads are not reordered with other reads."
     555             : 
     556             :    and 3A-8-10:
     557             : 
     558             :      "Writes by a single processor are observed in the same order by all
     559             :      processors."
     560             : 
      561             :    This makes heavy use of compiler memory fences though to ensure that
     562             :    compiler optimizations do not reorder how the operations are issued
     563             :    to CPUs (and thus also imply the operation acts as a compiler memory
     564             :    fence overall).
     565             : 
     566             :    Non-x86 platforms that use different cache coherency models may
     567             :    require modification of the below to use more explicit fencing or
     568             :    what not.
     569             : 
     570             :    The below is implemented as a macro to facilitate use in ultra high
     571             :    performance run loops and support multiple return values.  This macro
      572             :    is robust (e.g. it evaluates its arguments a minimal number of times). */
     573             : 
#define FD_MCACHE_WAIT( meta, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do {                    \
    ulong                  _fd_mcache_wait_seq_expected = (seq_expected);                                                 \
    fd_frag_meta_t const * _fd_mcache_wait_mline        = (mcache)                                                        \
                                                        + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) );    \
    fd_frag_meta_t *       _fd_mcache_wait_meta         = (meta);                                                         \
    ulong                  _fd_mcache_wait_seq_found;                                                                     \
    long                   _fd_mcache_wait_seq_diff;                                                                      \
    ulong                  _fd_mcache_wait_poll_max     = (poll_max);                                                     \
    for(;;) {                                                                                                             \
      FD_COMPILER_MFENCE();                                                                                               \
      _fd_mcache_wait_seq_found = _fd_mcache_wait_mline->seq; /* atomic read of seq before copying the line body */       \
      FD_COMPILER_MFENCE();                                                                                               \
      *_fd_mcache_wait_meta = *_fd_mcache_wait_mline; /* probably non-atomic, typically fast L1 cache hit */              \
      FD_COMPILER_MFENCE();                                                                                               \
      ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic re-read; differs from seq_found iff the  */  \
      FD_COMPILER_MFENCE();                                        /* copy above was torn by a producer publication   */  \
      _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected );                  \
      int _fd_mcache_wait_done = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
                               | (!--_fd_mcache_wait_poll_max); /* done = untorn copy at/past seq_expected, or out of polls */ \
      FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */               \
      if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */       \
      FD_SPIN_PAUSE();                                                                                                    \
    }                                                                                                                     \
    (mline)     = _fd_mcache_wait_mline;                                                                                  \
    (seq_found) = _fd_mcache_wait_seq_found;                                                                              \
    (seq_diff)  = _fd_mcache_wait_seq_diff;                                                                               \
    (poll_max)  = _fd_mcache_wait_poll_max;                                                                               \
  } while(0)
     602             : 
     603             : /* FD_MCACHE_WAIT_REG: similar to FD_MCACHE_WAIT but uses (nominally)
     604             :    registers to hold the metadata instead of a local buffer. */
     605             : 
#define FD_MCACHE_WAIT_REG( sig, chunk, sz, ctl, tsorig, tspub, mline, seq_found, seq_diff, poll_max,                     \
                            mcache, depth, seq_expected ) do {                                                            \
    ulong                  _fd_mcache_wait_seq_expected = (seq_expected);                                                 \
    fd_frag_meta_t const * _fd_mcache_wait_mline        = (mcache)                                                        \
                                                        + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) );    \
    ulong                  _fd_mcache_wait_poll_max     = (poll_max);                                                     \
    ulong                  _fd_mcache_wait_sig;                                                                           \
    ulong                  _fd_mcache_wait_chunk;                                                                         \
    ulong                  _fd_mcache_wait_sz;                                                                            \
    ulong                  _fd_mcache_wait_ctl;                                                                           \
    ulong                  _fd_mcache_wait_tsorig;                                                                        \
    ulong                  _fd_mcache_wait_tspub;                                                                         \
    ulong                  _fd_mcache_wait_seq_found;                                                                     \
    long                   _fd_mcache_wait_seq_diff;                                                                      \
    for(;;) {                                                                                                             \
      FD_COMPILER_MFENCE();                                                                                               \
      _fd_mcache_wait_seq_found = _fd_mcache_wait_mline->seq; /* atomic read of seq before the field loads below */       \
      FD_COMPILER_MFENCE();                                                                                               \
      _fd_mcache_wait_sig       =        _fd_mcache_wait_mline->sig;    /* speculative field loads; may be torn by  */    \
      _fd_mcache_wait_chunk     = (ulong)_fd_mcache_wait_mline->chunk;  /* a concurrent publication (detected via   */    \
      _fd_mcache_wait_sz        = (ulong)_fd_mcache_wait_mline->sz;     /* the seq re-read below)                   */    \
      _fd_mcache_wait_ctl       = (ulong)_fd_mcache_wait_mline->ctl;                                                      \
      _fd_mcache_wait_tsorig    = (ulong)_fd_mcache_wait_mline->tsorig;                                                   \
      _fd_mcache_wait_tspub     = (ulong)_fd_mcache_wait_mline->tspub;                                                    \
      FD_COMPILER_MFENCE();                                                                                               \
      ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic, typically fast L1 cache hit */              \
      FD_COMPILER_MFENCE();                                                                                               \
      _fd_mcache_wait_seq_diff = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected );                  \
      int _fd_mcache_wait_done = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
                               | (!--_fd_mcache_wait_poll_max); /* done = untorn loads at/past seq_expected, or out of polls */ \
      FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */               \
      if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */       \
      FD_SPIN_PAUSE();                                                                                                    \
    }                                                                                                                     \
    (sig)       = _fd_mcache_wait_sig;                                                                                    \
    (chunk)     = _fd_mcache_wait_chunk;                                                                                  \
    (sz)        = _fd_mcache_wait_sz;                                                                                     \
    (ctl)       = _fd_mcache_wait_ctl;                                                                                    \
    (tsorig)    = _fd_mcache_wait_tsorig;                                                                                 \
    (tspub)     = _fd_mcache_wait_tspub;                                                                                  \
    (mline)     = _fd_mcache_wait_mline;                                                                                  \
    (seq_found) = _fd_mcache_wait_seq_found;                                                                              \
    (seq_diff)  = _fd_mcache_wait_seq_diff;                                                                               \
    (poll_max)  = _fd_mcache_wait_poll_max;                                                                               \
  } while(0)
     651             : 
     652             : #if FD_HAS_AVX
     653             : 
     654             : /* FD_MCACHE_WAIT_SSE: similar to FD_MCACHE_WAIT but uses a pair of SSE
     655             :    registers to hold the metadata instead of a local buffer.  This is
     656             :    only valid on targets with the FD_HAS_AVX capability (see
     657             :    fd_tango_base.h for details on Intel's atomicity guarantees). */
     658             : 
#define FD_MCACHE_WAIT_SSE( meta_sse0, meta_sse1, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do { \
    ulong                  _fd_mcache_wait_seq_expected = (seq_expected);                                                  \
    fd_frag_meta_t const * _fd_mcache_wait_mline        = (mcache)                                                         \
                                                        + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) );     \
    __m128i                _fd_mcache_wait_meta_sse0;                                                                      \
    __m128i                _fd_mcache_wait_meta_sse1;                                                                      \
    ulong                  _fd_mcache_wait_seq_found;                                                                      \
    long                   _fd_mcache_wait_seq_diff;                                                                       \
    ulong                  _fd_mcache_wait_poll_max     = (poll_max);                                                      \
    for(;;) {                                                                                                              \
      FD_COMPILER_MFENCE();                                                                                                \
      _fd_mcache_wait_meta_sse0 = _mm_load_si128( &_fd_mcache_wait_mline->sse0 ); /* atomic, carries seq (and sig) */      \
      FD_COMPILER_MFENCE();                                                                                                \
      _fd_mcache_wait_meta_sse1 = _mm_load_si128( &_fd_mcache_wait_mline->sse1 ); /* atomic, typ fast L1 hit */            \
      FD_COMPILER_MFENCE();                                                                                                \
      ulong _fd_mcache_wait_seq_test = _fd_mcache_wait_mline->seq; /* atomic re-read; detects tearing between the two */   \
      FD_COMPILER_MFENCE();                                        /* 128-bit loads above                             */   \
      _fd_mcache_wait_seq_found = fd_frag_meta_sse0_seq( _fd_mcache_wait_meta_sse0 );                                      \
      _fd_mcache_wait_seq_diff  = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected );                  \
      int _fd_mcache_wait_done  = ((_fd_mcache_wait_seq_found==_fd_mcache_wait_seq_test) & (_fd_mcache_wait_seq_diff>=0L)) \
                                | (!--_fd_mcache_wait_poll_max); /* done = untorn pair at/past seq_expected, or out of polls */ \
      FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */                \
      if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */        \
      FD_SPIN_PAUSE();                                                                                                     \
    }                                                                                                                      \
    (meta_sse0) = _fd_mcache_wait_meta_sse0;                                                                               \
    (meta_sse1) = _fd_mcache_wait_meta_sse1;                                                                               \
    (mline)     = _fd_mcache_wait_mline;                                                                                   \
    (seq_found) = _fd_mcache_wait_seq_found;                                                                               \
    (seq_diff)  = _fd_mcache_wait_seq_diff;                                                                                \
    (poll_max)  = _fd_mcache_wait_poll_max;                                                                                \
  } while(0)
     691             : 
     692             : /* FD_MCACHE_WAIT_AVX: similar to FD_MCACHE_WAIT_SSE but uses a single
     693             :    AVX register to hold the found metadata instead of a local buffer.
     694             :    This is only valid for targets that have atomic AVX load / stores
     695             :    (not guaranteed across all AVX supporting CPUs and Intel is
     696             :    deliberately vague about which ones do have it) and a producer that
     697             :    similarly uses atomic AVX writes for metadata publication.  On the
     698             :    overrun case here, meta_avx will in fact be the metadata for the
     699             :    overrun sequence number. */
     700             : 
#define FD_MCACHE_WAIT_AVX( meta_avx, mline, seq_found, seq_diff, poll_max, mcache, depth, seq_expected ) do {         \
    ulong                  _fd_mcache_wait_seq_expected = (seq_expected);                                              \
    fd_frag_meta_t const * _fd_mcache_wait_mline        = (mcache)                                                     \
                                                        + fd_mcache_line_idx( _fd_mcache_wait_seq_expected, (depth) ); \
    __m256i                _fd_mcache_wait_meta_avx;                                                                   \
    ulong                  _fd_mcache_wait_seq_found;                                                                  \
    long                   _fd_mcache_wait_seq_diff;                                                                   \
    ulong                  _fd_mcache_wait_poll_max     = (poll_max);                                                  \
    for(;;) {                                                                                                          \
      FD_COMPILER_MFENCE();                                                                                            \
      _fd_mcache_wait_meta_avx  = _mm256_load_si256( &_fd_mcache_wait_mline->avx ); /* atomic (see note above); the */ \
      FD_COMPILER_MFENCE();                          /* whole line incl. seq is read in one shot so, unlike the     */ \
                                                     /* other WAIT variants, no seq re-read tearing check is needed */ \
      _fd_mcache_wait_seq_found = fd_frag_meta_avx_seq( _fd_mcache_wait_meta_avx );                                    \
      _fd_mcache_wait_seq_diff  = fd_seq_diff( _fd_mcache_wait_seq_found, _fd_mcache_wait_seq_expected );              \
      int _fd_mcache_wait_done  = (_fd_mcache_wait_seq_diff>=0L) | (!--_fd_mcache_wait_poll_max);                      \
      FD_COMPILER_FORGET( _fd_mcache_wait_done ); /* inhibit compiler from turning this into branch nest */            \
      if( FD_LIKELY( _fd_mcache_wait_done ) ) break; /* opt for exit, single exit to help spin_pause cpu hinting */    \
      FD_SPIN_PAUSE();                                                                                                 \
    }                                                                                                                  \
    (meta_avx)  = _fd_mcache_wait_meta_avx;                                                                            \
    (mline)     = _fd_mcache_wait_mline;                                                                               \
    (seq_found) = _fd_mcache_wait_seq_found;                                                                           \
    (seq_diff)  = _fd_mcache_wait_seq_diff;                                                                            \
    (poll_max)  = _fd_mcache_wait_poll_max;                                                                            \
  } while(0)
     726             : 
     727             : #endif
     728             : 
     729             : /* fd_mcache_query returns seq_query if seq_query is still in the mcache
     730             :    (assumed to be a current local mcache join) with depth entries (depth
     731             :    is assumed to be an integer power of two of at least
     732             :    FD_MCACHE_BLOCK).  It will return a sequence number before seq_query
     733             :    if the seq_query has not yet been published.  It will return a
     734             :    sequence after seq_query if seq_query is no longer available in the
     735             :    mcache.  In this last case, seq_query will be typically be within
     736             :    depth of the most recently published sequence number as of some point
     737             :    in time between when the call was made and the call returned (in many
     738             :    common uses, this is typically very very close to most recently
     739             :    published sequence number).  This acts as a compiler memory fence. */
     740             : 
     741             : static inline ulong
     742             : fd_mcache_query( fd_frag_meta_t const * mcache,
     743             :                  ulong                  depth,
     744     9000000 :                  ulong                  seq_query ) {
     745     9000000 :   return fd_frag_meta_seq_query( mcache + fd_mcache_line_idx( seq_query, depth ) );
     746     9000000 : }
     747             : 
     748             : FD_PROTOTYPES_END
     749             : 
     750             : #endif /* HEADER_fd_src_tango_mcache_fd_mcache_h */
     751             : 

Generated by: LCOV version 1.14