Line data Source code
1 : #include "fd_stem.h"
2 :
3 : /* fd_stem provides services to multiplex multiple streams of input
4 : fragments and present them to a mix of reliable and unreliable
5 : consumers as though they were generated by multiple different
6 : multi-stream producers. The code can be included to generate
7 : a definition of stem_run which can be called as a tile main run
8 : loop.
9 :
10 : The template supports various callback functions which can be
11 : defined like #define STEM_CALLBACK_BEFORE_FRAG before_frag to
12 : tune the behavior of the stem_run loop. The callbacks are:
13 :
14 : DURING_HOUSEKEEPING
15 : Is called during the housekeeping routine, which happens infrequently
16 : on a schedule determined by the stem (based on the lazy parameter,
17 : see fd_tempo.h for more information). It is appropriate to do
18 : slightly expensive things here that wouldn't be OK to do in the main
19 : loop, like updating sequence numbers that are shared with other tiles
20 : (e.g. synchronization information), or sending batched information
21 : somewhere. The ctx is a user-provided context object from when the
22 : stem was initialized.
23 :
24 : METRICS_WRITE
25 : By convention, tiles may wish to accumulate high traffic metrics
26 : locally so they don't cause a lot of cache coherency traffic, and
27 : then periodically publish them to external observers. This callback
28 : is here to support that use case. It occurs infrequently during the
29 : housekeeping loop, and is called inside a compiler fence to ensure
30 : the writes do not get reordered, which may be important for observers
31 : or monitoring tools. The ctx is a user-provided context object from
32 : when the stem tile was initialized.
33 :
34 : BEFORE_CREDIT
35 : Is called every iteration of the stem run loop, whether there is a
36 : new frag ready to receive or not. This callback is also still
37 : invoked even if the stem is backpressured and cannot read any new
38 : fragments while waiting for downstream consumers to catch up. This
39 : callback is useful for things that need to occur even if no new frags
40 : are being handled. For example, servicing network connections could
41 : happen here. The ctx is a user-provided context object from when the
42 : stem tile was initialized. The stem is the stem which is invoking
43 : this callback. The stem should only be used for calling
44 : fd_stem_publish to publish a fragment to downstream consumers.
45 :
46 : The charge_busy argument is 0 by default, and should be set to 1 if
47 : the before_credit function is doing work that should be accounted for
48 : as part of the tiles busy indicator.
49 :
50 : AFTER_CREDIT
51 : Is called every iteration of the stem run loop, whether there is a
52 : new frag ready to receive or not, except in cases where the stem is
53 : backpressured by a downstream consumer and would not be able to
54 : publish. The callback might be used for publishing new fragments to
55 : downstream consumers in the main loop which are not in response to an
56 : incoming fragment. For example, code that collects incoming
57 : fragments over a period of 1 second and joins them together before
58 : publishing a large block fragment downstream, would publish the block
59 : here. The ctx is a user-provided context object from when the stem
60 : tile was initialized. The stem is the stem which is invoking this
61 : callback. The stem should only be used for calling fd_stem_publish to
62 : publish a fragment to downstream consumers.
63 :
64 : The opt_poll_in argument determines if the stem should proceed with
65 : checking for new fragments to consumer, or should `continue` the main
66 : stem loop to do credit checking again. This could be used if the
67 : after_credit function publishes, and the flow control needs to be
68 : checked again. By default, opt_poll_in is true and the stem will
69 : poll for fragments right away without rerunning the loop or checking
70 : for credits.
71 :
72 : The charge_busy argument is 0 by default, and should be set to 1 if
73 : the after_credit function is doing work that should be accounted for
74 : as part of the tiles busy indicator.
75 :
76 : BEFORE_FRAG
77 : Is called immediately whenever a new fragment has been detected that
78 : was published by an upstream producer. The signature and sequence
79 : number (sig and seq) provided as arguments are read atomically from
80 : shared memory, so must both match each other from the published
81 : fragment (aka. they will not be torn or partially overwritten).
82 : in_idx is an index in [0, num_ins) indicating which producer
83 : published the fragment. No fragment data has been read yet here, nor
84 : has other metadata, for example the size or timestamps of the
85 : fragment. Mainly this callback is useful for deciding whether to
86 : filter the fragment based on its signature. If the return value is
87 : non-zero, the frag will be skipped completely, no fragment data will
88 : be read, and the in will be advanced so that we now wait for the next
89 : fragment. The ctx is a user-provided context object from when the
90 : stem tile was initialized.
91 :
92 : DURING_FRAG
93 : Is called after the stem has received a new frag from an in, but
94 : before the stem has checked that it was overrun. This callback is
95 : not invoked if the stem is backpressured, as it would not try and
96 : read a frag from an in in the first place (instead, leaving it on the
97 : in mcache to backpressure the upstream producer). in_idx will be the
98 : index of the in that the frag was received from. If the producer of
99 : the frags is respecting flow control, it is safe to read frag data in
100 : any of the callbacks, but it is suggested to copy or read frag data
101 : within this callback, as if the producer does not respect flow
102 : control, the frag may be torn or corrupt due to an overrun by the
103 : reader. If the frag being read from has been overwritten while this
104 : callback is running, the frag will be ignored and the stem will not
105 : call the process function. Instead it will recover from the overrun
106 : and continue with new frags. This function cannot fail. The ctx is a
107 : user-provided context object from when the stem tile was initialized.
108 : seq, sig, chunk, and sz are the respective fields from the mcache
109 : fragment that was received. If the producer is not respecting flow
110 : control, these may be corrupt or torn and should not be trusted,
111 : except for seq which is read atomically.
112 :
113 : AFTER_FRAG
114 : Is is called immediately after the DURING_FRAG, along with an
115 : additional check that the reader was not overrun while handling the
116 : frag. If the reader was overrun, the frag is abandoned and this
117 : function is not called. This callback is not invoked if the stem is
118 : backpressured, as it would not read a frag in the first place.
119 : in_idx will be the index of the in that the frag was received from.
120 : You should not read the frag data directly here, as it might still
121 : get overrun, instead it should be copied out of the frag during the
122 : read callback if needed later. This function cannot fail. The ctx is
123 : a user-provided context object from when the stem tile was
124 : initialized. stem should only be used for calling fd_stem_publish to
125 : publish a fragment to downstream consumers. seq is the sequence
126 : number of the fragment that was read from the input mcache. sig,
127 : chunk, sz, and tsorig are the respective fields from the mcache
128 : fragment that was received. If the producer is not respecting flow
129 : control, these may be corrupt or torn and should not be trusted. */
130 :
131 : #if !FD_HAS_SSE
132 : #error "fd_stem requires SSE"
133 : #endif
134 :
135 : #if !FD_HAS_ALLOCA
136 : #error "fd_stem requires alloca"
137 : #endif
138 :
139 : #include "../topo/fd_topo.h"
140 : #include "../metrics/fd_metrics.h"
141 : #include "../../tango/fd_tango.h"
142 :
143 : #ifndef STEM_NAME
144 : #define STEM_NAME stem
145 : #endif
146 0 : #define STEM_(n) FD_EXPAND_THEN_CONCAT3(STEM_NAME,_,n)
147 :
148 : #ifndef STEM_BURST
149 : #error "STEM_BURST must be defined"
150 : #endif
151 :
152 : #ifndef STEM_CALLBACK_CONTEXT_TYPE
153 : #error "STEM_CALLBACK_CONTEXT_TYPE must be defined"
154 : #endif
155 :
156 : #ifndef STEM_LAZY
157 0 : #define STEM_LAZY (0L)
158 : #endif
159 :
160 : static inline void
161 0 : STEM_(in_update)( fd_stem_tile_in_t * in ) {
162 0 : fd_fseq_update( in->fseq, in->seq );
163 :
164 0 : volatile ulong * metrics = fd_metrics_link_in( fd_metrics_base_tl, in->idx );
165 :
166 0 : uint * accum = in->accum;
167 0 : ulong a0 = (ulong)accum[0]; ulong a1 = (ulong)accum[1]; ulong a2 = (ulong)accum[2];
168 0 : ulong a3 = (ulong)accum[3]; ulong a4 = (ulong)accum[4]; ulong a5 = (ulong)accum[5];
169 0 : FD_COMPILER_MFENCE();
170 0 : metrics[0] += a0; metrics[1] += a1; metrics[2] += a2;
171 0 : metrics[3] += a3; metrics[4] += a4; metrics[5] += a5;
172 0 : FD_COMPILER_MFENCE();
173 0 : accum[0] = 0U; accum[1] = 0U; accum[2] = 0U;
174 0 : accum[3] = 0U; accum[4] = 0U; accum[5] = 0U;
175 0 : }
176 :
177 : FD_FN_PURE static inline ulong
178 0 : STEM_(scratch_align)( void ) {
179 0 : return FD_STEM_SCRATCH_ALIGN;
180 0 : }
181 :
182 : FD_FN_PURE static inline ulong
183 : STEM_(scratch_footprint)( ulong in_cnt,
184 : ulong out_cnt,
185 0 : ulong cons_cnt ) {
186 0 : ulong l = FD_LAYOUT_INIT;
187 0 : l = FD_LAYOUT_APPEND( l, alignof(fd_stem_tile_in_t), in_cnt*sizeof(fd_stem_tile_in_t) ); /* in */
188 0 : l = FD_LAYOUT_APPEND( l, alignof(ulong), out_cnt*sizeof(ulong) ); /* out_depth */
189 0 : l = FD_LAYOUT_APPEND( l, alignof(ulong), out_cnt*sizeof(ulong) ); /* out_seq */
190 0 : l = FD_LAYOUT_APPEND( l, alignof(ulong const *), cons_cnt*sizeof(ulong const *) ); /* cons_fseq */
191 0 : l = FD_LAYOUT_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) ); /* cons_slow */
192 0 : l = FD_LAYOUT_APPEND( l, alignof(ulong), cons_cnt*sizeof(ulong) ); /* cons_out */
193 0 : l = FD_LAYOUT_APPEND( l, alignof(ulong), cons_cnt*sizeof(ulong) ); /* cons_seq */
194 0 : const ulong event_cnt = in_cnt + 1UL + cons_cnt;
195 0 : l = FD_LAYOUT_APPEND( l, alignof(ushort), event_cnt*sizeof(ushort) ); /* event_map */
196 0 : return FD_LAYOUT_FINI( l, STEM_(scratch_align)() );
197 0 : }
198 :
199 : static inline void
200 : STEM_(run1)( ulong in_cnt,
201 : fd_frag_meta_t const ** in_mcache,
202 : ulong ** in_fseq,
203 : ulong out_cnt,
204 : fd_frag_meta_t ** out_mcache,
205 : ulong cons_cnt,
206 : ulong * _cons_out,
207 : ulong ** _cons_fseq,
208 : ulong burst,
209 : long lazy,
210 : fd_rng_t * rng,
211 : void * scratch,
212 0 : STEM_CALLBACK_CONTEXT_TYPE * ctx ) {
213 : /* in frag stream state */
214 0 : ulong in_seq; /* current position in input poll sequence, in [0,in_cnt) */
215 0 : fd_stem_tile_in_t * in; /* in[in_seq] for in_seq in [0,in_cnt) has information about input fragment stream currently at
216 : position in_seq in the in_idx polling sequence. The ordering of this array is continuously
217 : shuffled to avoid lighthousing effects in the output fragment stream at extreme fan-in and load */
218 :
219 : /* out frag stream state */
220 0 : ulong * out_depth; /* ==fd_mcache_depth( out_mcache[out_idx] ) for out_idx in [0, out_cnt) */
221 0 : ulong * out_seq; /* next mux frag sequence number to publish for out_idx in [0, out_cnt) ]*/
222 :
223 : /* out flow control state */
224 0 : ulong cr_avail; /* number of flow control credits available to publish downstream, in [0,cr_max] */
225 0 : ulong const ** cons_fseq; /* cons_fseq[cons_idx] for cons_idx in [0,cons_cnt) is where to receive fctl credits from consumers */
226 0 : ulong ** cons_slow; /* cons_slow[cons_idx] for cons_idx in [0,cons_cnt) is where to accumulate slow events */
227 0 : ulong * cons_out; /* cons_out[cons_idx] for cons_idx in [0,cons_ct) is which out the consumer consumes from ]*/
228 0 : ulong * cons_seq; /* cons_seq [cons_idx] is the most recent observation of cons_fseq[cons_idx] */
229 :
230 : /* housekeeping state */
231 0 : ulong event_cnt; /* ==in_cnt+cons_cnt+1, total number of housekeeping events */
232 0 : ulong event_seq; /* current position in housekeeping event sequence, in [0,event_cnt) */
233 0 : ushort * event_map; /* current mapping of event_seq to event idx, event_map[ event_seq ] is next event to process */
234 0 : ulong async_min; /* minimum number of ticks between processing a housekeeping event, positive integer power of 2 */
235 :
236 : /* performance metrics */
237 0 : ulong metric_in_backp; /* is the run loop currently backpressured by one or more of the outs, in [0,1] */
238 0 : ulong metric_backp_cnt; /* Accumulates number of transitions of tile to backpressured between housekeeping events */
239 :
240 0 : ulong metric_regime_ticks[9]; /* How many ticks the tile has spent in each regime */
241 :
242 0 : if( FD_UNLIKELY( !scratch ) ) FD_LOG_ERR(( "NULL scratch" ));
243 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)scratch, STEM_(scratch_align)() ) ) ) FD_LOG_ERR(( "misaligned scratch" ));
244 :
245 : /* in_backp==1, backp_cnt==0 indicates waiting for initial credits,
246 : cleared during first housekeeping if credits available */
247 0 : metric_in_backp = 1UL;
248 0 : metric_backp_cnt = 0UL;
249 0 : memset( metric_regime_ticks, 0, sizeof( metric_regime_ticks ) );
250 :
251 : /* in frag stream init */
252 :
253 0 : in_seq = 0UL; /* First in to poll */
254 :
255 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
256 0 : in = (fd_stem_tile_in_t *)FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_stem_tile_in_t), in_cnt*sizeof(fd_stem_tile_in_t) );
257 :
258 0 : ulong min_in_depth = (ulong)LONG_MAX;
259 :
260 0 : if( FD_UNLIKELY( !!in_cnt && !in_mcache ) ) FD_LOG_ERR(( "NULL in_mcache" ));
261 0 : if( FD_UNLIKELY( !!in_cnt && !in_fseq ) ) FD_LOG_ERR(( "NULL in_fseq" ));
262 0 : if( FD_UNLIKELY( in_cnt > UINT_MAX ) ) FD_LOG_ERR(( "in_cnt too large" ));
263 0 : for( ulong in_idx=0UL; in_idx<in_cnt; in_idx++ ) {
264 :
265 0 : if( FD_UNLIKELY( !in_mcache[ in_idx ] ) ) FD_LOG_ERR(( "NULL in_mcache[%lu]", in_idx ));
266 0 : if( FD_UNLIKELY( !in_fseq [ in_idx ] ) ) FD_LOG_ERR(( "NULL in_fseq[%lu]", in_idx ));
267 :
268 0 : fd_stem_tile_in_t * this_in = &in[ in_idx ];
269 :
270 0 : this_in->mcache = in_mcache[ in_idx ];
271 0 : this_in->fseq = in_fseq [ in_idx ];
272 :
273 0 : ulong depth = fd_mcache_depth( this_in->mcache ); min_in_depth = fd_ulong_min( min_in_depth, depth );
274 0 : if( FD_UNLIKELY( depth > UINT_MAX ) ) FD_LOG_ERR(( "in_mcache[%lu] too deep", in_idx ));
275 0 : this_in->depth = (uint)depth;
276 0 : this_in->idx = (uint)in_idx;
277 0 : this_in->seq = 0UL;
278 0 : this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in->seq, this_in->depth );
279 :
280 0 : this_in->accum[0] = 0U; this_in->accum[1] = 0U; this_in->accum[2] = 0U;
281 0 : this_in->accum[3] = 0U; this_in->accum[4] = 0U; this_in->accum[5] = 0U;
282 0 : }
283 :
284 : /* out frag stream init */
285 :
286 0 : cr_avail = 0UL;
287 :
288 0 : out_depth = (ulong *)FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), out_cnt*sizeof(ulong) );
289 0 : out_seq = (ulong *)FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), out_cnt*sizeof(ulong) );
290 :
291 0 : ulong cr_max = fd_ulong_if( !out_cnt, 128UL, ULONG_MAX );
292 :
293 0 : for( ulong out_idx=0UL; out_idx<out_cnt; out_idx++ ) {
294 :
295 0 : if( FD_UNLIKELY( !out_mcache[ out_idx ] ) ) FD_LOG_ERR(( "NULL out_mcache[%lu]", out_idx ));
296 :
297 0 : out_depth[ out_idx ] = fd_mcache_depth( out_mcache[ out_idx ] );
298 0 : out_seq[ out_idx ] = 0UL;
299 :
300 0 : cr_max = fd_ulong_min( cr_max, out_depth[ out_idx ] );
301 0 : }
302 :
303 0 : cons_fseq = (ulong const **)FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong const *), cons_cnt*sizeof(ulong const *) );
304 0 : cons_slow = (ulong **) FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong *), cons_cnt*sizeof(ulong *) );
305 0 : cons_out = (ulong *) FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), cons_cnt*sizeof(ulong) );
306 0 : cons_seq = (ulong *) FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), cons_cnt*sizeof(ulong) );
307 :
308 0 : if( FD_UNLIKELY( !!cons_cnt && !_cons_fseq ) ) FD_LOG_ERR(( "NULL cons_fseq" ));
309 0 : for( ulong cons_idx=0UL; cons_idx<cons_cnt; cons_idx++ ) {
310 0 : if( FD_UNLIKELY( !_cons_fseq[ cons_idx ] ) ) FD_LOG_ERR(( "NULL cons_fseq[%lu]", cons_idx ));
311 0 : cons_fseq[ cons_idx ] = _cons_fseq[ cons_idx ];
312 0 : cons_out [ cons_idx ] = _cons_out [ cons_idx ];
313 0 : cons_slow[ cons_idx ] = (ulong*)(fd_metrics_link_out( fd_metrics_base_tl, cons_idx ) + FD_METRICS_COUNTER_LINK_SLOW_COUNT_OFF);
314 0 : cons_seq [ cons_idx ] = fd_fseq_query( _cons_fseq[ cons_idx ] );
315 0 : }
316 :
317 : /* housekeeping init */
318 :
319 0 : if( lazy<=0L ) lazy = fd_tempo_lazy_default( cr_max );
320 0 : FD_LOG_INFO(( "Configuring housekeeping (lazy %li ns)", lazy ));
321 :
322 : /* Initialize the initial event sequence to immediately update
323 : cr_avail on the first run loop iteration and then update all the
324 : ins accordingly. */
325 :
326 0 : event_cnt = in_cnt + 1UL + cons_cnt;
327 0 : event_map = (ushort *)FD_SCRATCH_ALLOC_APPEND( l, alignof(ushort), event_cnt*sizeof(ushort) );
328 0 : event_seq = 0UL; event_map[ event_seq++ ] = (ushort)cons_cnt;
329 0 : for( ulong in_idx=0UL; in_idx< in_cnt; in_idx++ ) event_map[ event_seq++ ] = (ushort)(in_idx+cons_cnt+1UL);
330 0 : for( ulong cons_idx=0UL; cons_idx<cons_cnt; cons_idx++ ) event_map[ event_seq++ ] = (ushort)cons_idx;
331 0 : event_seq = 0UL;
332 :
333 0 : async_min = fd_tempo_async_min( lazy, event_cnt, (float)fd_tempo_tick_per_ns( NULL ) );
334 0 : if( FD_UNLIKELY( !async_min ) ) FD_LOG_ERR(( "bad lazy %lu %lu", (ulong)lazy, event_cnt ));
335 :
336 0 : FD_LOG_INFO(( "Running stem" ));
337 0 : FD_MGAUGE_SET( TILE, STATUS, 1UL );
338 0 : long then = fd_tickcount();
339 0 : long now = then;
340 0 : for(;;) {
341 :
342 : /* Do housekeeping at a low rate in the background */
343 :
344 0 : ulong housekeeping_ticks = 0UL;
345 0 : if( FD_UNLIKELY( (now-then)>=0L ) ) {
346 0 : ulong event_idx = (ulong)event_map[ event_seq ];
347 :
348 : /* Do the next async event. event_idx:
349 : <out_cnt - receive credits from out event_idx
350 : ==out_cnt - housekeeping
351 : >out_cnt - send credits to in event_idx - out_cnt - 1.
352 : Branch hints and order are optimized for the case:
353 : out_cnt >~ in_cnt >~ 1. */
354 :
355 0 : if( FD_LIKELY( event_idx<cons_cnt ) ) { /* cons fctl for cons cons_idx */
356 0 : ulong cons_idx = event_idx;
357 :
358 : /* Receive flow control credits from this out. */
359 0 : cons_seq[ cons_idx ] = fd_fseq_query( cons_fseq[ cons_idx ] );
360 :
361 0 : } else if( FD_LIKELY( event_idx>cons_cnt ) ) { /* in fctl for in in_idx */
362 0 : ulong in_idx = event_idx - cons_cnt - 1UL;
363 :
364 : /* Send flow control credits and drain flow control diagnostics
365 : for in_idx. */
366 :
367 0 : STEM_(in_update)( &in[ in_idx ] );
368 :
369 0 : } else { /* event_idx==cons_cnt, housekeeping event */
370 :
371 : /* Update metrics counters to external viewers */
372 0 : FD_COMPILER_MFENCE();
373 0 : FD_MGAUGE_SET( TILE, HEARTBEAT, (ulong)now );
374 0 : FD_MGAUGE_SET( TILE, IN_BACKPRESSURE, metric_in_backp );
375 0 : FD_MCNT_INC ( TILE, BACKPRESSURE_COUNT, metric_backp_cnt );
376 0 : FD_MCNT_ENUM_COPY( TILE, REGIME_DURATION_NANOS, metric_regime_ticks );
377 : #ifdef STEM_CALLBACK_METRICS_WRITE
378 0 : STEM_CALLBACK_METRICS_WRITE( ctx );
379 : #endif
380 0 : FD_COMPILER_MFENCE();
381 0 : metric_backp_cnt = 0UL;
382 :
383 : /* Receive flow control credits */
384 0 : if( FD_LIKELY( cr_avail<cr_max ) ) {
385 0 : ulong slowest_cons = ULONG_MAX;
386 0 : cr_avail = cr_max;
387 0 : for( ulong cons_idx=0UL; cons_idx<cons_cnt; cons_idx++ ) {
388 0 : ulong cons_cr_avail = (ulong)fd_long_max( (long)cr_max-fd_long_max( fd_seq_diff( out_seq[ cons_out[ cons_idx ] ], cons_seq[ cons_idx ] ), 0L ), 0L );
389 0 : slowest_cons = fd_ulong_if( cons_cr_avail<cr_avail, cons_idx, slowest_cons );
390 0 : cr_avail = fd_ulong_min( cons_cr_avail, cr_avail );
391 0 : }
392 :
393 : /* See notes above about use of quasi-atomic diagnostic accum */
394 0 : if( FD_LIKELY( slowest_cons!=ULONG_MAX ) ) {
395 0 : FD_COMPILER_MFENCE();
396 0 : (*cons_slow[ slowest_cons ]) += metric_in_backp;
397 0 : FD_COMPILER_MFENCE();
398 0 : }
399 0 : }
400 :
401 : #ifdef STEM_CALLBACK_DURING_HOUSEKEEPING
402 0 : STEM_CALLBACK_DURING_HOUSEKEEPING( ctx );
403 : #else
404 : (void)ctx;
405 : #endif
406 0 : }
407 :
408 : /* Select which event to do next (randomized round robin) and
409 : reload the housekeeping timer. */
410 :
411 0 : event_seq++;
412 0 : if( FD_UNLIKELY( event_seq>=event_cnt ) ) {
413 0 : event_seq = 0UL;
414 :
415 : /* Randomize the order of event processing for the next event
416 : event_cnt events to avoid lighthousing effects causing input
417 : credit starvation at extreme fan in/fan out, extreme in load
418 : and high credit return laziness. */
419 :
420 0 : ulong swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)event_cnt );
421 0 : ushort map_tmp = event_map[ swap_idx ];
422 0 : event_map[ swap_idx ] = event_map[ 0 ];
423 0 : event_map[ 0 ] = map_tmp;
424 :
425 : /* We also do the same with the ins to prevent there being a
426 : correlated order frag origins from different inputs
427 : downstream at extreme fan in and extreme in load. */
428 :
429 0 : if( FD_LIKELY( in_cnt>1UL ) ) {
430 0 : swap_idx = (ulong)fd_rng_uint_roll( rng, (uint)in_cnt );
431 0 : fd_stem_tile_in_t in_tmp;
432 0 : in_tmp = in[ swap_idx ];
433 0 : in[ swap_idx ] = in[ 0 ];
434 0 : in[ 0 ] = in_tmp;
435 0 : }
436 0 : }
437 :
438 : /* Reload housekeeping timer */
439 0 : then = now + (long)fd_tempo_async_reload( rng, async_min );
440 0 : long next = fd_tickcount();
441 0 : housekeeping_ticks = (ulong)(next - now);
442 0 : now = next;
443 0 : }
444 :
445 : #if defined(STEM_CALLBACK_BEFORE_CREDIT) || defined(STEM_CALLBACK_AFTER_CREDIT) || defined(STEM_CALLBACK_AFTER_FRAG)
446 : fd_stem_context_t stem = {
447 : .mcaches = out_mcache,
448 : .depths = out_depth,
449 : .seqs = out_seq,
450 :
451 : .cr_avail = &cr_avail,
452 : .cr_decrement_amount = fd_ulong_if( out_cnt>0UL, 1UL, 0UL ),
453 : };
454 : #endif
455 :
456 : #ifdef STEM_CALLBACK_BEFORE_CREDIT
457 : int charge_busy_before = 0;
458 0 : STEM_CALLBACK_BEFORE_CREDIT( ctx, &stem, &charge_busy_before );
459 : #endif
460 :
461 : /* Check if we are backpressured. If so, count any transition into
462 : a backpressured regime and spin to wait for flow control credits
463 : to return. We don't do a fully atomic update here as it is only
464 : diagnostic and it will still be correct in the usual case where
465 : individual diagnostic counters aren't used by writers in
466 : different threads of execution. We only count the transition
467 : from not backpressured to backpressured. */
468 :
469 0 : if( FD_UNLIKELY( cr_avail<burst ) ) {
470 0 : metric_backp_cnt += (ulong)!metric_in_backp;
471 0 : metric_in_backp = 1UL;
472 0 : FD_SPIN_PAUSE();
473 0 : metric_regime_ticks[2] += housekeeping_ticks;
474 0 : long next = fd_tickcount();
475 0 : metric_regime_ticks[5] += (ulong)(next - now);
476 0 : now = next;
477 0 : continue;
478 0 : }
479 0 : metric_in_backp = 0UL;
480 :
481 : #ifdef STEM_CALLBACK_AFTER_CREDIT
482 : int poll_in = 1;
483 : int charge_busy_after = 0;
484 0 : STEM_CALLBACK_AFTER_CREDIT( ctx, &stem, &poll_in, &charge_busy_after );
485 0 : if( FD_UNLIKELY( !poll_in ) ) {
486 0 : metric_regime_ticks[1] += housekeeping_ticks;
487 0 : long next = fd_tickcount();
488 0 : metric_regime_ticks[4] += (ulong)(next - now);
489 0 : now = next;
490 0 : continue;
491 0 : }
492 0 : #endif
493 :
494 : /* Select which in to poll next (randomized round robin) */
495 :
496 0 : if( FD_UNLIKELY( !in_cnt ) ) {
497 0 : metric_regime_ticks[0] += housekeeping_ticks;
498 0 : long next = fd_tickcount();
499 0 : metric_regime_ticks[3] += (ulong)(next - now);
500 0 : now = next;
501 0 : continue;
502 0 : }
503 :
504 0 : ulong prefrag_ticks = 0UL;
505 : #if defined(STEM_CALLBACK_BEFORE_CREDIT) && defined(STEM_CALLBACK_AFTER_CREDIT)
506 0 : if( FD_LIKELY( charge_busy_before || charge_busy_after ) ) {
507 : #elif defined(STEM_CALLBACK_BEFORE_CREDIT)
508 0 : if( FD_LIKELY( charge_busy_before ) ) {
509 : #elif defined(STEM_CALLBACK_AFTER_CREDIT)
510 0 : if( FD_LIKELY( charge_busy_after ) ) {
511 0 : #endif
512 :
513 : #if defined(STEM_CALLBACK_BEFORE_CREDIT) || defined(STEM_CALLBACK_AFTER_CREDIT)
514 0 : long prefrag_next = fd_tickcount();
515 0 : prefrag_ticks = (ulong)(prefrag_next - now);
516 0 : now = prefrag_next;
517 0 : }
518 : #endif
519 :
520 0 : fd_stem_tile_in_t * this_in = &in[ in_seq ];
521 0 : in_seq++;
522 0 : if( in_seq>=in_cnt ) in_seq = 0UL; /* cmov */
523 :
524 : /* Check if this in has any new fragments to mux */
525 :
526 0 : ulong this_in_seq = this_in->seq;
527 0 : fd_frag_meta_t const * this_in_mline = this_in->mline; /* Already at appropriate line for this_in_seq */
528 :
529 0 : __m128i seq_sig = fd_frag_meta_seq_sig_query( this_in_mline );
530 0 : #if FD_USING_CLANG
531 : /* TODO: Clang optimizes extremely aggressively which breaks the
532 : atomicity expected by seq_sig_query. In particular, it replaces
533 : the sequence query with a second load (immediately following
534 : vector load). The signature query a few lines down is still an
535 : extract from the vector which then means that effectively the
536 : signature is loaded before the sequence number.
537 : Adding this clobbers of the vector prevents this optimization by
538 : forcing the seq query to be an extract, but we probably want a
539 : better long term solution. */
540 0 : __asm__( "" : "+x"(seq_sig) );
541 0 : #endif
542 0 : ulong seq_found = fd_frag_meta_sse0_seq( seq_sig );
543 :
544 0 : long diff = fd_seq_diff( this_in_seq, seq_found );
545 0 : if( FD_UNLIKELY( diff ) ) { /* Caught up or overrun, optimize for new frag case */
546 0 : ulong * housekeeping_regime = &metric_regime_ticks[0];
547 0 : ulong * prefrag_regime = &metric_regime_ticks[3];
548 0 : ulong * finish_regime = &metric_regime_ticks[6];
549 0 : if( FD_UNLIKELY( diff<0L ) ) { /* Overrun (impossible if in is honoring our flow control) */
550 0 : this_in->seq = seq_found; /* Resume from here (probably reasonably current, could query in mcache sync directly instead) */
551 0 : housekeeping_regime = &metric_regime_ticks[1];
552 0 : prefrag_regime = &metric_regime_ticks[4];
553 0 : finish_regime = &metric_regime_ticks[7];
554 0 : this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++;
555 0 : this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-diff);
556 0 : }
557 : /* Don't bother with spin as polling multiple locations */
558 0 : *housekeeping_regime += housekeeping_ticks;
559 0 : *prefrag_regime += prefrag_ticks;
560 0 : long next = fd_tickcount();
561 0 : *finish_regime += (ulong)(next - now);
562 0 : now = next;
563 0 : continue;
564 0 : }
565 :
566 0 : ulong sig = fd_frag_meta_sse0_sig( seq_sig ); (void)sig;
567 : #ifdef STEM_CALLBACK_BEFORE_FRAG
568 0 : int filter = STEM_CALLBACK_BEFORE_FRAG( ctx, (ulong)this_in->idx, seq_found, sig );
569 0 : if( FD_UNLIKELY( filter<0 ) ) {
570 0 : metric_regime_ticks[1] += housekeeping_ticks;
571 0 : metric_regime_ticks[4] += prefrag_ticks;
572 0 : long next = fd_tickcount();
573 0 : metric_regime_ticks[7] += (ulong)(next - now);
574 0 : now = next;
575 0 : continue;
576 0 : } else if( FD_UNLIKELY( filter>0 ) ) {
577 0 : this_in->accum[ FD_METRICS_COUNTER_LINK_FILTERED_COUNT_OFF ]++;
578 0 : this_in->accum[ FD_METRICS_COUNTER_LINK_FILTERED_SIZE_BYTES_OFF ] += (uint)this_in_mline->sz; /* TODO: This might be overrun ... ? Not loaded atomically */
579 :
580 : this_in_seq = fd_seq_inc( this_in_seq, 1UL );
581 : this_in->seq = this_in_seq;
582 : this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth );
583 :
584 0 : metric_regime_ticks[1] += housekeeping_ticks;
585 0 : metric_regime_ticks[4] += prefrag_ticks;
586 0 : long next = fd_tickcount();
587 0 : metric_regime_ticks[7] += (ulong)(next - now);
588 0 : now = next;
589 0 : continue;
590 0 : }
591 0 : #endif
592 :
593 : /* We have a new fragment to mux. Try to load it. This attempt
594 : should always be successful if in producers are honoring our flow
595 : control. Since we can cheaply detect if there are
596 : misconfigurations (should be an L1 cache hit / predictable branch
597 : in the properly configured case), we do so anyway. Note that if
598 : we are on a platform where AVX is atomic, this could be replaced
599 : by a flat AVX load of the metadata and an extraction of the found
600 : sequence number for higher performance. */
601 0 : FD_COMPILER_MFENCE();
602 0 : ulong chunk = (ulong)this_in_mline->chunk; (void)chunk;
603 0 : ulong sz = (ulong)this_in_mline->sz; (void)sz;
604 0 : ulong ctl = (ulong)this_in_mline->ctl; (void)ctl;
605 0 : ulong tsorig = (ulong)this_in_mline->tsorig; (void)tsorig;
606 :
607 : #ifdef STEM_CALLBACK_DURING_FRAG
608 0 : STEM_CALLBACK_DURING_FRAG( ctx, (ulong)this_in->idx, seq_found, sig, chunk, sz );
609 : #endif
610 :
611 0 : FD_COMPILER_MFENCE();
612 0 : ulong seq_test = this_in_mline->seq;
613 0 : FD_COMPILER_MFENCE();
614 :
615 0 : if( FD_UNLIKELY( fd_seq_ne( seq_test, seq_found ) ) ) { /* Overrun while reading (impossible if this_in honoring our fctl) */
616 0 : this_in->seq = seq_test; /* Resume from here (probably reasonably current, could query in mcache sync instead) */
617 0 : fd_metrics_link_in( fd_metrics_base_tl, this_in->idx )[ FD_METRICS_COUNTER_LINK_OVERRUN_READING_COUNT_OFF ]++; /* No local accum since extremely rare, faster to use smaller cache line */
618 0 : fd_metrics_link_in( fd_metrics_base_tl, this_in->idx )[ FD_METRICS_COUNTER_LINK_OVERRUN_READING_FRAG_COUNT_OFF ] += (uint)fd_seq_diff( seq_test, seq_found ); /* No local accum since extremely rare, faster to use smaller cache line */
619 : /* Don't bother with spin as polling multiple locations */
620 0 : metric_regime_ticks[1] += housekeeping_ticks;
621 0 : metric_regime_ticks[4] += prefrag_ticks;
622 0 : long next = fd_tickcount();
623 0 : metric_regime_ticks[7] += (ulong)(next - now);
624 0 : now = next;
625 0 : continue;
626 0 : }
627 :
628 : #ifdef STEM_CALLBACK_AFTER_FRAG
629 0 : STEM_CALLBACK_AFTER_FRAG( ctx, (ulong)this_in->idx, seq_found, sig, sz, tsorig, &stem );
630 0 : #endif
631 :
632 : /* Windup for the next in poll and accumulate diagnostics */
633 :
634 0 : this_in_seq = fd_seq_inc( this_in_seq, 1UL );
635 0 : this_in->seq = this_in_seq;
636 0 : this_in->mline = this_in->mcache + fd_mcache_line_idx( this_in_seq, this_in->depth );
637 :
638 0 : this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_COUNT_OFF ]++;
639 0 : this_in->accum[ FD_METRICS_COUNTER_LINK_CONSUMED_SIZE_BYTES_OFF ] += (uint)sz;
640 :
641 0 : metric_regime_ticks[1] += housekeeping_ticks;
642 0 : metric_regime_ticks[4] += prefrag_ticks;
643 0 : long next = fd_tickcount();
644 0 : metric_regime_ticks[7] += (ulong)(next - now);
645 0 : now = next;
646 0 : }
647 0 : }
648 :
649 : FD_FN_UNUSED static void
650 : STEM_(run)( fd_topo_t * topo,
651 0 : fd_topo_tile_t * tile ) {
652 0 : const fd_frag_meta_t * in_mcache[ FD_TOPO_MAX_LINKS ];
653 0 : ulong * in_fseq[ FD_TOPO_MAX_TILE_IN_LINKS ];
654 :
655 0 : ulong polled_in_cnt = 0UL;
656 0 : for( ulong i=0UL; i<tile->in_cnt; i++ ) {
657 0 : if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue;
658 :
659 0 : in_mcache[ polled_in_cnt ] = topo->links[ tile->in_link_id[ i ] ].mcache;
660 0 : FD_TEST( in_mcache[ polled_in_cnt ] );
661 0 : in_fseq[ polled_in_cnt ] = tile->in_link_fseq[ i ];
662 0 : FD_TEST( in_fseq[ polled_in_cnt ] );
663 0 : polled_in_cnt += 1;
664 0 : }
665 :
666 0 : fd_frag_meta_t * out_mcache[ FD_TOPO_MAX_LINKS ];
667 0 : for( ulong i=0UL; i<tile->out_cnt; i++ ) {
668 0 : out_mcache[ i ] = topo->links[ tile->out_link_id[ i ] ].mcache;
669 0 : FD_TEST( out_mcache[ i ] );
670 0 : }
671 :
672 0 : ulong reliable_cons_cnt = 0UL;
673 0 : ulong cons_out[ FD_TOPO_MAX_LINKS ];
674 0 : ulong * cons_fseq[ FD_TOPO_MAX_LINKS ];
675 0 : for( ulong i=0UL; i<topo->tile_cnt; i++ ) {
676 0 : fd_topo_tile_t * consumer_tile = &topo->tiles[ i ];
677 0 : for( ulong j=0UL; j<consumer_tile->in_cnt; j++ ) {
678 0 : for( ulong k=0UL; k<tile->out_cnt; k++ ) {
679 0 : if( FD_UNLIKELY( consumer_tile->in_link_id[ j ]==tile->out_link_id[ k ] && consumer_tile->in_link_reliable[ j ] ) ) {
680 0 : cons_out[ reliable_cons_cnt ] = k;
681 0 : cons_fseq[ reliable_cons_cnt ] = consumer_tile->in_link_fseq[ j ];
682 0 : FD_TEST( cons_fseq[ reliable_cons_cnt ] );
683 0 : reliable_cons_cnt++;
684 : /* Need to test this, since each link may connect to many outs,
685 : you could construct a topology which has more than this
686 : consumers of links. */
687 0 : FD_TEST( reliable_cons_cnt<FD_TOPO_MAX_LINKS );
688 0 : }
689 0 : }
690 0 : }
691 0 : }
692 :
693 0 : fd_rng_t rng[1];
694 0 : FD_TEST( fd_rng_join( fd_rng_new( rng, 0, 0UL ) ) );
695 :
696 0 : STEM_CALLBACK_CONTEXT_TYPE * ctx = (STEM_CALLBACK_CONTEXT_TYPE*)fd_ulong_align_up( (ulong)fd_topo_obj_laddr( topo, tile->tile_obj_id ), STEM_CALLBACK_CONTEXT_ALIGN );
697 :
698 0 : STEM_(run1)( polled_in_cnt,
699 0 : in_mcache,
700 0 : in_fseq,
701 0 : tile->out_cnt,
702 0 : out_mcache,
703 0 : reliable_cons_cnt,
704 0 : cons_out,
705 0 : cons_fseq,
706 0 : STEM_BURST,
707 0 : STEM_LAZY,
708 0 : rng,
709 0 : fd_alloca( FD_STEM_SCRATCH_ALIGN, STEM_(scratch_footprint)( polled_in_cnt, tile->out_cnt, reliable_cons_cnt ) ),
710 0 : ctx );
711 0 : }
712 :
713 : #undef STEM_NAME
714 : #undef STEM_
715 : #undef STEM_BURST
716 : #undef STEM_CALLBACK_CONTEXT_TYPE
717 : #undef STEM_LAZY
718 : #undef STEM_CALLBACK_DURING_HOUSEKEEPING
719 : #undef STEM_CALLBACK_METRICS_WRITE
720 : #undef STEM_CALLBACK_BEFORE_CREDIT
721 : #undef STEM_CALLBACK_AFTER_CREDIT
722 : #undef STEM_CALLBACK_BEFORE_FRAG
723 : #undef STEM_CALLBACK_DURING_FRAG
724 : #undef STEM_CALLBACK_AFTER_FRAG
|