#ifndef HEADER_fd_src_tango_fd_tango_base_h
#define HEADER_fd_src_tango_fd_tango_base_h

/* Tango messaging concepts:

   - Each message comes from a single local origin.  Each origin has a
     13-bit id that uniquely identifies it within a set of message
     producers and consumers for the lifetime of the set.  Origins
     typically include a mixture of network receiving devices, local
     message publishers, etc.  Applications might restrict the set of
     origins / add additional context / structure to origin ids as
     needed.

   - Messages are partitioned into one or more disjoint fragments.  The
     number of message payload bytes in a message fragment is in
     [0,2^16).  That is, message fragment size is any 16-bit unsigned
     int (thus bounded and variable).  Zero sized fragments are
     legitimate (e.g. one use case for this is heartbeating a stalled
     send of a large multi-fragment message).  Note that this is large
     enough to allow a maximum size UDP payload to be published in a
     single message fragment.  Applications might choose to impose
     additional limitations on message fragmentation.

   - Each fragment has a 64-bit sequence number that is unique over a
     (potentially dynamic) set of communicating message producers and
     consumers for the lifetime of that set.  Note that the use of a
     64-bit sequence number means that sequence number reuse is not an
     issue practically (it would take hundreds of years even at wildly
     unrealistic messaging rates from producers to consumers).  Note
     also that it is possible to use a smaller sequence number and deal
     with the implications of sequence number reuse via a number of
     standard techniques (epochs, TCP timestamp style, etc ... possibly
     with some minor additional constraints).  This is not done here
     for code simplicity / robustness / flexibility.

   - Message fragment sequence numbers increase sequentially with no
     gaps over the set of all producers for the set's lifetime.  As
     such, if a consumer encounters a gap in fragment sequence numbers,
     it knows it was overrun and has lost a message fragment (but
     typically that consumer does not know the origin of the lost
     fragment and needs to react accordingly).

   - The fragment sequence numbers within a single message increase
     monotonically but not necessarily sequentially, as the fragments
     of messages from different origins may be interleaved in fragment
     sequence number.

   - Each fragment is timestamped according to when its origin first
     started producing it (tsorig) and when it was first made available
     to consumers (tspub).  As these are used mostly for monitoring and
     diagnostic purposes, they are stored in a temporally and/or
     precision compressed representation to free up room for other
     metadata.

   - tsorig is measured on the origin's wallclock and tspub is measured
     on the consumer facing publisher's wallclock (these are often the
     same wallclock).  As such, tsorig will be monotonically increasing
     over fragments from the same origin and tspub will be
     monotonically increasing over all fragments from all origins.

   - The wallclocks used for the timestamping should be reasonably well
     synchronized in the sense described in util/log.  As such,
     timestamps measured by the same wallclock will be exactly
     spatially comparable and approximately temporally comparable and
     timestamps measured by different wallclocks will be both
     approximately spatially and temporally comparable.  Applications
     might choose to use things like preexisting host globally
     synchronized hardware tickcounters (e.g. RDTSC) for these instead
     of the system wallclock to reduce overheads.

   - Message fragments are distributed strictly in order.  There is no
     inherent limit to the number of fragments in a message.
     Applications might impose additional restrictions as appropriate
     for their needs.

   - To facilitate message reassembly, each fragment has a set of
     control bits that specify message boundaries and other conditions
     that might occur during message distribution (see the reassembly
     sketch following this comment):

     * SOM ("start-of-message"): This indicates this fragment starts a
       message from the fragment's origin.

     * EOM ("end-of-message"): This indicates this fragment ends a
       message from the fragment's origin.  If a consumer sees all the
       fragment sequence numbers from the sequence number of an SOM
       fragment from an origin to the sequence number of an EOM
       fragment from that origin inclusive, it knows that it has
       received all fragments for that message without loss from that
       origin.

     * ERR ("error"): This indicates that the _entire_ message to which
       the fragment belongs should be considered corrupt (e.g. CRC
       checks that happen at the very end of network packet reception
       are the typical reason for this and these inherently cannot be
       checked until the last fragment).

   - To facilitate high performance message distribution, each fragment
     has a 64-bit message signature.  How the signature is used is
     application defined.  A typical use case is to have the first
     fragment of a message signify (in an application dependent way)
     which consumers are definitely known a priori to be uninterested
     in the message (such that those consumers don't have to spend any
     bandwidth or compute to reassemble or parse message payloads while
     still preserving common sequencing and ordering of all messages
     between all consumers).

   - For similar reasons, recent message fragments are typically stored
     in two separate caches: a fragment metadata cache ("mcache", which
     behaves like a hybrid of a ring and a direct mapped cache ... it
     maps recently published fragment sequence numbers to fragment
     metadata) and a fragment payload cache (which is more flexibly
     allocated at "chunk" granularity as per the capabilities and needs
     of the individual origins). */
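
/* Illustrative consumer-side reassembly sketch (not part of this API;
   fd_frag_meta_t and its accessors are declared below while frag_rx,
   msg_reset, msg_append and msg_commit are hypothetical application
   hooks).  The consumer tracks the next expected fragment sequence
   number; a gap means it was overrun and any partially reassembled
   message must be dropped.  For clarity, this tracks a single message
   stream; a multi-origin consumer would keep this state per
   fd_frag_meta_ctl_orig( ctl ):

     ulong seq_next = ...;  (next fragment sequence number expected)
     int   in_msg   = 0;    (currently reassembling a message?)

     for(;;) {
       fd_frag_meta_t meta[1];
       ulong seq = frag_rx( meta );                        (receive next fragment's metadata)
       if( fd_seq_ne( seq, seq_next ) ) {                  (gap -> overrun)
         in_msg = 0; msg_reset();
       }
       seq_next  = fd_seq_inc( seq, 1UL );
       ulong ctl = (ulong)meta->ctl;
       if( fd_frag_meta_ctl_som( ctl ) ) { in_msg = 1; msg_reset(); }
       if( !in_msg ) continue;                             (lost or never saw the SOM)
       msg_append( meta );                                 (copy payload out of the payload cache)
       if( fd_frag_meta_ctl_eom( ctl ) ) {
         if( !fd_frag_meta_ctl_err( ctl ) ) msg_commit();  (message received intact)
         in_msg = 0;
       }
     }
*/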

#include "../util/fd_util.h"

#if FD_HAS_SSE /* also covers FD_HAS_AVX */
#include <x86intrin.h>
#endif

/* FD_CHUNK_{LG_SZ,ALIGN,FOOTPRINT,SZ} describe the granularity of
   message fragment payload allocations.  ALIGN==FOOTPRINT==SZ==2^LG_SZ
   and this is recommended to be something like a cache line
   practically. */

#define FD_CHUNK_LG_SZ     (6)
#define FD_CHUNK_ALIGN     (64UL) /* == 2^FD_CHUNK_LG_SZ, explicit to workaround compiler limitations */
#define FD_CHUNK_FOOTPRINT (64UL) /* " */
#define FD_CHUNK_SZ        (64UL) /* " */

/* FD_FRAG_META_{LG_SZ,ALIGN,FOOTPRINT,SZ} describe the coarse layout
   of message fragment metadata structures.
   sizeof(fd_frag_meta_t)==ALIGN==FOOTPRINT==SZ==2^LG_SZ.  Recommend
   this to be something like a positive integer multiple or an integer
   power of two divisor of a cache line size. */

#define FD_FRAG_META_LG_SZ     (5)
#define FD_FRAG_META_ALIGN     (32UL) /* == 2^FD_FRAG_META_LG_SZ, explicit to workaround compiler limitations */
#define FD_FRAG_META_FOOTPRINT (32UL) /* " */
#define FD_FRAG_META_SZ        (32UL) /* " */

/* FD_FRAG_META_ORIG_MAX specifies the maximum number of message
   origins that are supported.  Origin ids are in
   [0,FD_FRAG_META_ORIG_MAX). */

#define FD_FRAG_META_ORIG_MAX (8192UL)

/* fd_frag_meta_t specifies the message fragment metadata. */

union __attribute__((aligned(FD_FRAG_META_ALIGN))) fd_frag_meta {

  struct {

    /* First aligned SSE word ... these are strictly updated atomically */

    ulong  seq;    /* naturally atomic r/w, frag sequence number */
    ulong  sig;    /* naturally atomic r/w, application defined message signature for fast consumer side filtering;
                      performance is best if this is updated atomically with seq */

    /* Second aligned SSE word ... these are typically updated
       atomically but there is no guarantee both SSE words are jointly
       updated atomically. */

    uint   chunk;  /* naturally atomic r/w, compressed relative location of the frag's first byte in the data region */
    ushort sz;     /* naturally atomic r/w, frag size in bytes */
    ushort ctl;    /* naturally atomic r/w, message reassembly control bits (origin / clock domain, SOM/EOM/ERR flags) */
    uint   tsorig; /* naturally atomic r/w, message diagnostic compressed timestamps */
    uint   tspub;  /* naturally atomic r/w, " */

  };

  /* Intel architecture manual 3A section 8.1.1 (April 2022):

       Processors that enumerate support for Intel AVX (by setting
       the feature flag CPUID.01H:ECX.AVX[bit 28]) guarantee that the
       16-byte memory operations performed by the following
       instructions will always be carried out atomically:

       - MOVAPD, MOVAPS, and MOVDQA.
       - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
       - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
         EVEX.128 and k0 (masking disabled).

       (Note that these instructions require the linear addresses of
       their memory operands to be 16-byte aligned.)

     That is, accesses to "sse0" and "sse1" below are atomic when AVX
     support is available given the overall structure alignment,
     appropriate intrinsics and what not.  Accesses to avx are likely
     atomic on many x86 platforms but this is not guaranteed and as
     such should not be assumed. */

# if FD_HAS_SSE
  struct {
    __m128i sse0; /* naturally atomic r/w, covers seq and sig */
    __m128i sse1; /* naturally atomic r/w, covers chunk, sz, ctl, tsorig and tspub */
  };
# endif

# if FD_HAS_AVX
  __m256i avx; /* Possibly non-atomic but can hold the metadata in a single register */
# endif

};

typedef union fd_frag_meta fd_frag_meta_t;

FD_PROTOTYPES_BEGIN

/* fd_seq_{lt,le,eq,ne,ge,gt} compare 64-bit sequence numbers with
   proper handling of sequence number wrapping (if, for example, we
   decide to randomize the initial sequence numbers used by an
   application for security reasons and by chance pick a sequence
   number near 2^64 such that wrapping through 0 occurs).  That is,
   sequence number reuse is not an issue practically in a real world
   application but sequence number wrapping is if we want to support
   things like initial sequence number randomization for security.

   fd_seq_{inc,dec} returns the result of incrementing / decrementing
   sequence number a delta times.

   fd_seq_diff returns how many sequence numbers a is ahead of b.
   Positive / negative values mean a is in the future / past of b.
   Zero indicates a and b are the same.  See the example after these
   functions.

   In general, operations on sequence numbers are strongly encouraged
   to use these functions as doing so facilitates updating code to
   accommodate things like changing the width of a sequence number. */

FD_FN_CONST static inline int fd_seq_lt( ulong a, ulong b ) { return ((long)(a-b))< 0L; }
FD_FN_CONST static inline int fd_seq_le( ulong a, ulong b ) { return ((long)(a-b))<=0L; }
FD_FN_CONST static inline int fd_seq_eq( ulong a, ulong b ) { return a==b; }
FD_FN_CONST static inline int fd_seq_ne( ulong a, ulong b ) { return a!=b; }
FD_FN_CONST static inline int fd_seq_ge( ulong a, ulong b ) { return ((long)(a-b))>=0L; }
FD_FN_CONST static inline int fd_seq_gt( ulong a, ulong b ) { return ((long)(a-b))> 0L; }

FD_FN_CONST static inline ulong fd_seq_inc( ulong a, ulong delta ) { return a+delta; }
FD_FN_CONST static inline ulong fd_seq_dec( ulong a, ulong delta ) { return a-delta; }

FD_FN_CONST static inline long fd_seq_diff( ulong a, ulong b ) { return (long)(a-b); }
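
/* For example (an illustrative sketch of the wrap handling above):
   with a = 1UL and b = ULONG_MAX (i.e. b is 2 behind a after wrapping
   through 0), plain comparison gets the ordering backwards while the
   fd_seq APIs do not:

     fd_seq_gt  ( 1UL, ULONG_MAX );  (returns 1 ... a is ahead of b)
     fd_seq_diff( 1UL, ULONG_MAX );  (returns 2L ... a is 2 ahead of b)
     fd_seq_inc ( ULONG_MAX, 2UL );  (returns 1UL)

   versus (1UL > ULONG_MAX), which is 0. */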

/* fd_chunk_to_laddr: returns a pointer in the local address space to
   the first byte of the chunk with the given compressed relative
   address chunk, given the pointer in the local address space of the
   chunk whose index is 0 (chunk0).  fd_chunk_to_laddr_const is for
   const-correctness.

   fd_laddr_to_chunk: vice versa.  See the example after these
   functions. */

FD_FN_CONST static inline void *        /* Will be aligned FD_CHUNK_ALIGN and in [ chunk0, chunk0 + FD_CHUNK_SZ*(UINT_MAX+1) ) */
fd_chunk_to_laddr( void * chunk0,       /* Assumed aligned FD_CHUNK_ALIGN */
                   ulong  chunk ) {     /* Assumed in [0,UINT_MAX] */
  return (void *)(((ulong)chunk0) + (chunk << FD_CHUNK_LG_SZ));
}

FD_FN_CONST static inline void const *
fd_chunk_to_laddr_const( void const * chunk0,
                         ulong        chunk ) {
  return (void const *)(((ulong)chunk0) + (chunk << FD_CHUNK_LG_SZ));
}

FD_FN_CONST static inline ulong           /* Will be in [0,UINT_MAX] */
fd_laddr_to_chunk( void const * chunk0,   /* Assumed aligned FD_CHUNK_ALIGN */
                   void const * laddr ) { /* Assumed aligned FD_CHUNK_ALIGN and in [ chunk0, chunk0 + FD_CHUNK_SZ*(UINT_MAX+1) ) */
  return (((ulong)laddr)-((ulong)chunk0)) >> FD_CHUNK_LG_SZ;
}
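
/* For example (illustrative; base is a hypothetical FD_CHUNK_ALIGN
   aligned data region): chunk indices scale by FD_CHUNK_SZ (64 bytes),
   so the two conversions round trip exactly on chunk aligned
   addresses:

     uchar * base  = ...;                               (64 byte aligned data region)
     void  * laddr = fd_chunk_to_laddr( base, 3UL );    (== base + 3*64 == base + 192)
     ulong   chunk = fd_laddr_to_chunk( base, laddr );  (== 3UL)
*/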

/* fd_frag_meta_seq_query returns the sequence number pointed to by
   meta as atomically observed at some point in time between when the
   call was made and when the call returned.  Assumes meta is valid.
   This acts as a compiler memory fence. */

static inline ulong
fd_frag_meta_seq_query( fd_frag_meta_t const * meta ) { /* Assumed non-NULL */
  FD_COMPILER_MFENCE();
  ulong seq = FD_VOLATILE_CONST( meta->seq );
  FD_COMPILER_MFENCE();
  return seq;
}
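
/* Illustrative overrun detection sketch (not part of this API): since
   a producer can overwrite a metadata cache line at any time, a
   consumer typically reads a fragment speculatively and then verifies
   the sequence number afterward; if the sequence number changed, the
   read was clobbered and should be treated as an overrun / retried:

     ulong seq = fd_frag_meta_seq_query( meta );
     ... speculatively process the fragment metadata / payload here ...
     if( FD_UNLIKELY( fd_seq_ne( fd_frag_meta_seq_query( meta ), seq ) ) ) {
       ... overrun while reading ... handle accordingly ...
     }
*/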

#if FD_HAS_SSE

/* fd_frag_meta_seq_sig_query returns the sequence number and signature
   pointed to by meta in one atomic read, with the same semantics as
   fd_frag_meta_seq_query. */

static inline __m128i
fd_frag_meta_seq_sig_query( fd_frag_meta_t const * meta ) { /* Assumed non-NULL */
  FD_COMPILER_MFENCE();
  __m128i sse0 = _mm_load_si128( &meta->sse0 );
  FD_COMPILER_MFENCE();
  return sse0;
}

#endif

/* fd_frag_meta_ctl, fd_frag_meta_ctl_{orig,som,eom,err} pack and
   unpack the fd_frag message reassembly control bits. */

FD_FN_CONST static inline ulong /* In [0,2^16) */
fd_frag_meta_ctl( ulong orig,   /* Assumed in [0,FD_FRAG_META_ORIG_MAX) */
                  int   som,    /* 0 for false, non-zero for true */
                  int   eom,    /* 0 for false, non-zero for true */
                  int   err ) { /* 0 for false, non-zero for true */
  return ((ulong)!!som) | (((ulong)!!eom)<<1) | (((ulong)!!err)<<2) | (orig<<3);
}

FD_FN_CONST static inline ulong fd_frag_meta_ctl_orig( ulong ctl ) { return ctl>>3; }
FD_FN_CONST static inline int   fd_frag_meta_ctl_som ( ulong ctl ) { return (int)( ctl     & 1UL); }
FD_FN_CONST static inline int   fd_frag_meta_ctl_eom ( ulong ctl ) { return (int)((ctl>>1) & 1UL); }
FD_FN_CONST static inline int   fd_frag_meta_ctl_err ( ulong ctl ) { return (int)((ctl>>2) & 1UL); }
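
/* For example (illustrative): a fragment from origin 7 that both
   starts and ends its message (a single fragment message) packs and
   unpacks as:

     ulong ctl = fd_frag_meta_ctl( 7UL, 1, 1, 0 );  (== 0x3bUL ... (7<<3) | EOM | SOM)
     fd_frag_meta_ctl_orig( ctl );                  (== 7UL)
     fd_frag_meta_ctl_som ( ctl );                  (== 1)
     fd_frag_meta_ctl_eom ( ctl );                  (== 1)
     fd_frag_meta_ctl_err ( ctl );                  (== 0)
*/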

#if FD_HAS_SSE

FD_FN_CONST static inline __m128i
fd_frag_meta_sse0( ulong seq,
                   ulong sig ) {
  return _mm_set_epi64x( (long)sig, (long)seq ); /* Backward Intel ... sigh */
}

FD_FN_CONST static inline ulong fd_frag_meta_sse0_seq( __m128i sse0 ) { return (ulong)_mm_extract_epi64( sse0, 0 ); }
FD_FN_CONST static inline ulong fd_frag_meta_sse0_sig( __m128i sse0 ) { return (ulong)_mm_extract_epi64( sse0, 1 ); }

FD_FN_CONST static inline __m128i
fd_frag_meta_sse1( ulong chunk,    /* Assumed 32-bit */
                   ulong sz,       /* Assumed 16-bit */
                   ulong ctl,      /* Assumed 16-bit */
                   ulong tsorig,   /* Assumed 32-bit */
                   ulong tspub ) { /* Assumed 32-bit */
  return _mm_set_epi64x( (long)(tsorig | (tspub<<32)),
                         (long)(chunk | (sz<<32) | (ctl<<48)) ); /* Backward Intel ... sigh */
}

FD_FN_CONST static inline ulong fd_frag_meta_sse1_chunk ( __m128i sse1 ) { return (ulong)(uint  )_mm_extract_epi32( sse1, 0 ); }
FD_FN_CONST static inline ulong fd_frag_meta_sse1_sz    ( __m128i sse1 ) { return (ulong)(ushort)_mm_extract_epi16( sse1, 2 ); }
FD_FN_CONST static inline ulong fd_frag_meta_sse1_ctl   ( __m128i sse1 ) { return (ulong)(ushort)_mm_extract_epi16( sse1, 3 ); }
FD_FN_CONST static inline ulong fd_frag_meta_sse1_tsorig( __m128i sse1 ) { return (ulong)(uint  )_mm_extract_epi32( sse1, 2 ); }
FD_FN_CONST static inline ulong fd_frag_meta_sse1_tspub ( __m128i sse1 ) { return (ulong)(uint  )_mm_extract_epi32( sse1, 3 ); }
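
/* Illustrative publish ordering sketch (not part of this API; one
   plausible way a producer might exploit the atomicity notes above):
   since sse0 and sse1 are individually but not jointly atomic, a
   producer can store the non-seq half first and publish seq last so a
   consumer never observes the new sequence number paired with stale
   metadata:

     FD_COMPILER_MFENCE();
     _mm_store_si128( &meta->sse1, fd_frag_meta_sse1( chunk, sz, ctl, tsorig, tspub ) );
     FD_COMPILER_MFENCE();
     _mm_store_si128( &meta->sse0, fd_frag_meta_sse0( seq, sig ) );
     FD_COMPILER_MFENCE();
*/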

#endif

#if FD_HAS_AVX

FD_FN_CONST static inline __m256i
fd_frag_meta_avx( ulong seq,
                  ulong sig,
                  ulong chunk,    /* Assumed 32-bit */
                  ulong sz,       /* Assumed 16-bit */
                  ulong ctl,      /* Assumed 16-bit */
                  ulong tsorig,   /* Assumed 32-bit */
                  ulong tspub ) { /* Assumed 32-bit */
  return _mm256_set_epi64x( (long)(tsorig | (tspub<<32)),
                            (long)(chunk | (sz<<32) | (ctl<<48)),
                            (long)sig,
                            (long)seq ); /* Backward Intel ... sigh */
}

FD_FN_CONST static inline ulong fd_frag_meta_avx_seq   ( __m256i avx ) { return (ulong)        _mm256_extract_epi64( avx,  0 ); }
FD_FN_CONST static inline ulong fd_frag_meta_avx_sig   ( __m256i avx ) { return (ulong)        _mm256_extract_epi64( avx,  1 ); }
FD_FN_CONST static inline ulong fd_frag_meta_avx_chunk ( __m256i avx ) { return (ulong)(uint  )_mm256_extract_epi32( avx,  4 ); }
FD_FN_CONST static inline ulong fd_frag_meta_avx_sz    ( __m256i avx ) { return (ulong)(ushort)_mm256_extract_epi16( avx, 10 ); }
FD_FN_CONST static inline ulong fd_frag_meta_avx_ctl   ( __m256i avx ) { return (ulong)(ushort)_mm256_extract_epi16( avx, 11 ); }
FD_FN_CONST static inline ulong fd_frag_meta_avx_tsorig( __m256i avx ) { return (ulong)(uint  )_mm256_extract_epi32( avx,  6 ); }
FD_FN_CONST static inline ulong fd_frag_meta_avx_tspub ( __m256i avx ) { return (ulong)(uint  )_mm256_extract_epi32( avx,  7 ); }

#endif

/* fd_frag_meta_ts_{comp,decomp}: Given the longs ts and tsref that are
   reasonably close to each other (|ts-tsref| < 2^31 ... about +/-2.1
   seconds if ts and tsref are reasonably well synchronized
   fd_log_wallclock measurements), this pair of functions can quickly
   and losslessly compress / decompress ts by a factor of 2 exactly
   using tsref as the compressor / decompressor "state". */

FD_FN_CONST static inline ulong /* In [0,UINT_MAX] */
fd_frag_meta_ts_comp( long ts ) {
  return (ulong)(uint)ts;
}

FD_FN_CONST static inline long
fd_frag_meta_ts_decomp( ulong tscomp, /* In [0,UINT_MAX] */
                        long  tsref ) {
  ulong msb = ((ulong)tsref) + fd_ulong_mask_lsb(31) - tscomp;
  return (long)((msb & ~fd_ulong_mask_lsb(32)) | tscomp);
}
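
/* For example (illustrative): with a tsref within ~2.1 seconds of the
   original measurement, the compressed 32-bit timestamp decompresses
   back to the original 64-bit value:

     long  ts     = fd_log_wallclock();          (e.g. 0x1234567890abcdefL)
     ulong tscomp = fd_frag_meta_ts_comp( ts );  (== 0x90abcdefUL ... low 32 bits)
     long  tsref  = fd_log_wallclock();          (shortly after ts)
     fd_frag_meta_ts_decomp( tscomp, tsref );    (== ts exactly)
*/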

FD_PROTOTYPES_END

#endif /* HEADER_fd_src_tango_fd_tango_base_h */