Line data Source code
1 : #include "../../../../disco/tiles.h"
2 :
3 : #include "generated/quic_seccomp.h"
4 :
5 : #include "../../../../disco/metrics/fd_metrics.h"
6 : #include "../../../../waltz/quic/fd_quic.h"
7 : #include "../../../../waltz/xdp/fd_xsk_aio.h"
8 : #include "../../../../waltz/xdp/fd_xsk.h"
9 : #include "../../../../waltz/ip/fd_netlink.h"
10 : #include "../../../../disco/quic/fd_tpu.h"
11 :
12 : #include <linux/unistd.h>
13 : #include <sys/random.h>
14 :
15 : /* fd_quic provides a TPU server tile.
16 :
17 : This tile handles incoming transactions that clients request to be
18 : included in blocks. Supported protocols currently include TPU/UDP
19 : and TPU/QUIC.
20 :
21 : The fd_quic tile acts as a plain old Tango producer writing to a cnc
22 : and an mcache. The tile will defragment multi-packet TPU streams
23 : coming in from QUIC, such that each mcache/dcache pair forms a
24 : complete txn. This requires the dcache mtu to be at least that of
25 : the largest allowed serialized txn size.
26 :
27 : QUIC tiles don't service network devices directly, but rely on
28 :    packets being received by net tiles and forwarded on via a mux
29 : (multiplexer). An arbitrary number of QUIC tiles can be run. Each
30 : UDP flow must stick to one QUIC tile. */
31 :
/* fd_quic_ctx_t is the per-tile state of the QUIC tile.  It lives in
   the tile's scratch region and is wired up in unprivileged_init. */
typedef struct {
  fd_tpu_reasm_t * reasm;          /* txn reassembly state shared with the verify out link */

  fd_stem_context_t * stem;        /* cached in before_credit so QUIC callbacks can publish */

  fd_quic_t *      quic;           /* the QUIC engine instance */
  const fd_aio_t * quic_rx_aio;    /* aio for feeding received packets into fd_quic */
  fd_aio_t         quic_tx_aio[1]; /* aio fd_quic uses to send packets (quic_tx_aio_send) */

# define ED25519_PRIV_KEY_SZ (32)
# define ED25519_PUB_KEY_SZ  (32)
  uchar tls_priv_key[ ED25519_PRIV_KEY_SZ ]; /* random identity key generated at init */
  uchar tls_pub_key [ ED25519_PUB_KEY_SZ  ]; /* derived public key, copied into the QUIC config */
  fd_sha512_t sha512[1]; /* used for signing */

  uchar buffer[ FD_NET_MTU ]; /* staging copy of the in-flight incoming frag (see during_frag) */

  ulong conn_seq; /* current quic connection sequence number */

  ulong round_robin_cnt; /* number of QUIC tiles; flows are sharded by sig hash */
  ulong round_robin_id;  /* this tile's index in [0,round_robin_cnt) */

  /* Bounds used to validate incoming frag chunks */
  fd_wksp_t * in_mem;
  ulong       in_chunk0;
  ulong       in_wmark;

  /* Outgoing mcache to the net tile; managed manually, not by stem
     (hence the periodic sync update in during_housekeeping) */
  fd_frag_meta_t * net_out_mcache;
  ulong *          net_out_sync;
  ulong            net_out_depth;
  ulong            net_out_seq;

  fd_wksp_t * net_out_mem;
  ulong       net_out_chunk0;
  ulong       net_out_wmark;
  ulong       net_out_chunk;

  fd_wksp_t * verify_out_mem; /* base address for publishing reassembled txns downstream */

  struct {
    ulong legacy_reasm_append [ FD_METRICS_COUNTER_QUIC_TILE_NON_QUIC_REASSEMBLY_APPEND_CNT  ];
    ulong legacy_reasm_publish[ FD_METRICS_COUNTER_QUIC_TILE_NON_QUIC_REASSEMBLY_PUBLISH_CNT ];

    ulong reasm_append [ FD_METRICS_COUNTER_QUIC_TILE_REASSEMBLY_APPEND_CNT  ];
    ulong reasm_publish[ FD_METRICS_COUNTER_QUIC_TILE_REASSEMBLY_PUBLISH_CNT ];
  } metrics;
} fd_quic_ctx_t;
78 :
79 : FD_FN_CONST static inline fd_quic_limits_t
80 3 : quic_limits( fd_topo_tile_t const * tile ) {
81 3 : fd_quic_limits_t limits = {
82 3 : .conn_cnt = tile->quic.max_concurrent_connections,
83 3 : .handshake_cnt = tile->quic.max_concurrent_handshakes,
84 :
85 : /* fd_quic will not issue nor use any new connection IDs after
86 : completing a handshake. Connection migration is not supported
87 : either. */
88 3 : .conn_id_cnt = FD_QUIC_MIN_CONN_ID_CNT,
89 3 : .inflight_pkt_cnt = tile->quic.max_inflight_quic_packets,
90 3 : .tx_buf_sz = 0,
91 3 : .rx_stream_cnt = tile->quic.max_concurrent_streams_per_connection,
92 3 : .stream_pool_cnt = tile->quic.max_concurrent_streams_per_connection * tile->quic.max_concurrent_connections,
93 3 : };
94 3 : return limits;
95 3 : }
96 :
/* scratch_align returns the required alignment of this tile's scratch
   region (one 4 KiB page). */
FD_FN_CONST static inline ulong
scratch_align( void ) {
  return 4096UL;
}
101 :
/* scratch_footprint returns the total scratch bytes this tile needs:
   the context struct, one fd_aio, and the fd_quic instance sized by the
   tile's configured limits.  The layout order here must match the
   FD_SCRATCH_ALLOC_APPEND sequence in unprivileged_init. */
FD_FN_PURE static inline ulong
scratch_footprint( fd_topo_tile_t const * tile ) {
  fd_quic_limits_t limits = quic_limits( tile );
  ulong l = FD_LAYOUT_INIT;
  l = FD_LAYOUT_APPEND( l, alignof( fd_quic_ctx_t ), sizeof( fd_quic_ctx_t ) );
  l = FD_LAYOUT_APPEND( l, fd_aio_align(),           fd_aio_footprint() );
  l = FD_LAYOUT_APPEND( l, fd_quic_align(),          fd_quic_footprint( &limits ) );
  return FD_LAYOUT_FINI( l, scratch_align() );
}
111 :
112 : /* legacy_stream_notify is called for transactions sent via TPU/UDP. For
113 : now both QUIC and non-QUIC transactions are accepted, with traffic
114 : type determined by port.
115 :
116 : UDP transactions must fit in one packet and cannot be fragmented, and
117 : notify here means the entire packet was received. */
118 :
119 : static void
120 : legacy_stream_notify( fd_quic_ctx_t * ctx,
121 : uchar * packet,
122 0 : ulong packet_sz ) {
123 :
124 0 : fd_stem_context_t * stem = ctx->stem;
125 :
126 0 : uint tsorig = (uint)fd_frag_meta_ts_comp( fd_tickcount() );
127 0 : fd_tpu_reasm_slot_t * slot = fd_tpu_reasm_prepare( ctx->reasm, tsorig );
128 :
129 0 : int add_err = fd_tpu_reasm_append( ctx->reasm, slot, packet, packet_sz, 0UL );
130 0 : ctx->metrics.legacy_reasm_append[ add_err ]++;
131 0 : if( FD_UNLIKELY( add_err!=FD_TPU_REASM_SUCCESS ) ) return;
132 :
133 0 : uint tspub = (uint)fd_frag_meta_ts_comp( fd_tickcount() );
134 0 : void * base = ctx->verify_out_mem;
135 0 : ulong seq = stem->seqs[0];
136 :
137 0 : int pub_err = fd_tpu_reasm_publish( ctx->reasm, slot, stem->mcaches[0], base, seq, tspub );
138 0 : ctx->metrics.legacy_reasm_publish[ pub_err ]++;
139 0 : if( FD_UNLIKELY( pub_err!=FD_TPU_REASM_SUCCESS ) ) return;
140 :
141 0 : fd_stem_advance( stem, 0UL );
142 0 : }
143 :
144 : /* Because of the separate mcache for publishing network fragments
145 : back to networking tiles, which is not managed by the mux, we
146 : need to periodically update the sync. */
static void
during_housekeeping( fd_quic_ctx_t * ctx ) {
  /* Publish our current tx sequence number so net tile consumers of the
     manually managed net_out mcache can track our progress. */
  fd_mcache_seq_update( ctx->net_out_sync, ctx->net_out_seq );
}
151 :
152 : /* This tile always publishes messages downstream, even if there are
153 : no credits available. It ignores the flow control of the downstream
154 : verify tile. This is OK as the verify tile is written to expect
155 : this behavior, and enables the QUIC tile to publish as fast as it
156 : can. It would currently be difficult trying to backpressure further
157 : up the stack to the network itself. */
static inline void
before_credit( fd_quic_ctx_t *     ctx,
               fd_stem_context_t * stem,
               int *               charge_busy ) {
  /* Cache the stem context; the QUIC callbacks (quic_stream_notify et
     al.) don't receive it as a parameter but publish through it. */
  ctx->stem = stem;

  /* Publishes to mcache via callbacks */
  *charge_busy = fd_quic_service( ctx->quic );
}
167 :
/* metrics_write copies this tile's local counters and the fd_quic
   engine's internal metrics into the shared metrics region. */
static inline void
metrics_write( fd_quic_ctx_t * ctx ) {
  /* Tile-local reassembly counters */
  FD_MCNT_ENUM_COPY( QUIC_TILE, NON_QUIC_REASSEMBLY_APPEND,  ctx->metrics.legacy_reasm_append );
  FD_MCNT_ENUM_COPY( QUIC_TILE, NON_QUIC_REASSEMBLY_PUBLISH, ctx->metrics.legacy_reasm_publish );
  FD_MCNT_ENUM_COPY( QUIC_TILE, REASSEMBLY_APPEND,           ctx->metrics.reasm_append );
  FD_MCNT_ENUM_COPY( QUIC_TILE, REASSEMBLY_PUBLISH,          ctx->metrics.reasm_publish );

  /* Engine-level network traffic counters */
  FD_MCNT_SET( QUIC, RECEIVED_PACKETS, ctx->quic->metrics.net_rx_pkt_cnt );
  FD_MCNT_SET( QUIC, RECEIVED_BYTES,   ctx->quic->metrics.net_rx_byte_cnt );
  FD_MCNT_SET( QUIC, SENT_PACKETS,     ctx->quic->metrics.net_tx_pkt_cnt );
  FD_MCNT_SET( QUIC, SENT_BYTES,       ctx->quic->metrics.net_tx_byte_cnt );

  /* Connection lifecycle */
  FD_MGAUGE_SET( QUIC, CONNECTIONS_ACTIVE,    ctx->quic->metrics.conn_active_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_CREATED,     ctx->quic->metrics.conn_created_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_CLOSED,      ctx->quic->metrics.conn_closed_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_ABORTED,     ctx->quic->metrics.conn_aborted_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_TIMED_OUT,   ctx->quic->metrics.conn_timeout_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_RETRIED,     ctx->quic->metrics.conn_retry_cnt );

  FD_MCNT_SET( QUIC, CONNECTION_ERROR_NO_SLOTS,   ctx->quic->metrics.conn_err_no_slots_cnt );
  FD_MCNT_SET( QUIC, CONNECTION_ERROR_TLS_FAIL,   ctx->quic->metrics.conn_err_tls_fail_cnt );
  FD_MCNT_SET( QUIC, CONNECTION_ERROR_RETRY_FAIL, ctx->quic->metrics.conn_err_retry_fail_cnt );

  /* Handshakes */
  FD_MCNT_SET( QUIC, HANDSHAKES_CREATED,         ctx->quic->metrics.hs_created_cnt );
  FD_MCNT_SET( QUIC, HANDSHAKE_ERROR_ALLOC_FAIL, ctx->quic->metrics.hs_err_alloc_fail_cnt );

  /* Streams */
  FD_MCNT_SET( QUIC, STREAM_OPENED,       ctx->quic->metrics.stream_opened_cnt );
  FD_MCNT_ENUM_COPY( QUIC, STREAM_CLOSED, ctx->quic->metrics.stream_closed_cnt );
  FD_MGAUGE_SET( QUIC, STREAM_ACTIVE,     ctx->quic->metrics.stream_active_cnt );

  FD_MCNT_SET( QUIC, STREAM_RECEIVED_EVENTS, ctx->quic->metrics.stream_rx_event_cnt );
  FD_MCNT_SET( QUIC, STREAM_RECEIVED_BYTES,  ctx->quic->metrics.stream_rx_byte_cnt );

  FD_MCNT_ENUM_COPY( QUIC, RECEIVED_FRAMES, ctx->quic->metrics.frame_rx_cnt );
}
203 :
204 : static int
205 : before_frag( fd_quic_ctx_t * ctx,
206 : ulong in_idx,
207 : ulong seq,
208 0 : ulong sig ) {
209 0 : (void)in_idx;
210 0 : (void)seq;
211 :
212 0 : ulong proto = fd_disco_netmux_sig_proto( sig );
213 0 : if( FD_UNLIKELY( proto!=DST_PROTO_TPU_UDP && proto!=DST_PROTO_TPU_QUIC ) ) return 1;
214 :
215 0 : ulong hash = fd_disco_netmux_sig_hash( sig );
216 0 : if( FD_UNLIKELY( (hash % ctx->round_robin_cnt) != ctx->round_robin_id ) ) return 1;
217 :
218 0 : return 0;
219 0 : }
220 :
221 : static void
222 : during_frag( fd_quic_ctx_t * ctx,
223 : ulong in_idx,
224 : ulong seq,
225 : ulong sig,
226 : ulong chunk,
227 0 : ulong sz ) {
228 0 : (void)in_idx;
229 0 : (void)seq;
230 0 : (void)sig;
231 :
232 0 : if( FD_UNLIKELY( chunk<ctx->in_chunk0 || chunk>ctx->in_wmark || sz > FD_NET_MTU ) )
233 0 : FD_LOG_ERR(( "chunk %lu %lu corrupt, not in range [%lu,%lu]", chunk, sz, ctx->in_chunk0, ctx->in_wmark ));
234 :
235 0 : uchar * src = (uchar *)fd_chunk_to_laddr( ctx->in_mem, chunk );
236 0 : fd_memcpy( ctx->buffer, src, sz ); /* TODO: Eliminate copy... fd_aio needs refactoring */
237 0 : }
238 :
239 : static void
240 : after_frag( fd_quic_ctx_t * ctx,
241 : ulong in_idx,
242 : ulong seq,
243 : ulong sig,
244 : ulong chunk,
245 : ulong sz,
246 : ulong tsorig,
247 0 : fd_stem_context_t * stem ) {
248 0 : (void)in_idx;
249 0 : (void)seq;
250 0 : (void)chunk;
251 0 : (void)tsorig;
252 0 : (void)stem;
253 :
254 0 : ulong proto = fd_disco_netmux_sig_proto( sig );
255 :
256 0 : if( FD_LIKELY( proto==DST_PROTO_TPU_QUIC ) ) {
257 0 : fd_aio_pkt_info_t pkt = { .buf = ctx->buffer, .buf_sz = (ushort)sz };
258 0 : fd_aio_send( ctx->quic_rx_aio, &pkt, 1, NULL, 1 );
259 0 : } else if( FD_LIKELY( proto==DST_PROTO_TPU_UDP ) ) {
260 0 : ulong network_hdr_sz = fd_disco_netmux_sig_hdr_sz( sig );
261 0 : if( FD_UNLIKELY( sz<=network_hdr_sz ) ) {
262 : /* Transaction not valid if the packet isn't large enough for the network
263 : headers. */
264 0 : FD_MCNT_INC( QUIC_TILE, NON_QUIC_PACKET_TOO_SMALL, 1UL );
265 0 : return;
266 0 : }
267 :
268 0 : ulong data_sz = sz - network_hdr_sz;
269 0 : if( FD_UNLIKELY( data_sz<FD_TXN_MIN_SERIALIZED_SZ ) ) {
270 : /* Smaller than the smallest possible transaction */
271 0 : FD_MCNT_INC( QUIC_TILE, NON_QUIC_PACKET_TOO_SMALL, 1UL );
272 0 : return;
273 0 : }
274 :
275 0 : if( FD_UNLIKELY( data_sz>FD_TPU_MTU ) ) {
276 : /* Transaction couldn't possibly be valid if it's longer than transaction
277 : MTU so drop it. This is not required, as the txn will fail to parse,
278 : but it's a nice short circuit. */
279 0 : FD_MCNT_INC( QUIC_TILE, NON_QUIC_PACKET_TOO_LARGE, 1UL );
280 0 : return;
281 0 : }
282 :
283 0 : legacy_stream_notify( ctx, ctx->buffer+network_hdr_sz, data_sz );
284 0 : }
285 0 : }
286 :
287 : /* quic_now is called by the QUIC engine to get the current timestamp in
288 : UNIX time. */
289 :
static ulong
quic_now( void * ctx ) {
  (void)ctx; /* no per-tile state needed; wallclock is process-global */
  return (ulong)fd_log_wallclock();
}
295 :
296 : /* quic_conn_new is invoked by the QUIC engine whenever a new connection
297 : is being established. */
static void
quic_conn_new( fd_quic_conn_t * conn,
               void *           _ctx ) {
  fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx;

  /* Tag the connection with a tile-local monotonically increasing id;
     reassembly slots are validated against it to detect clobbering. */
  conn->local_conn_id = ++ctx->conn_seq;
}
305 :
306 : /* quic_stream_new is called back by the QUIC engine whenever an open
307 : connection creates a new stream, at the time this is called, both the
308 : client and server must have agreed to open the stream. In case the
309 : client has opened this stream, it is assumed that the QUIC
310 : implementation has verified that the client has the necessary stream
311 : quota to do so. */
312 :
static void
quic_stream_new( fd_quic_stream_t * stream,
                 void *             _ctx ) {

  /* Load QUIC state */

  fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx;

  ulong conn_id   = stream->conn->local_conn_id;
  ulong stream_id = stream->stream_id;

  /* Acquire reassembly slot.  fd_tpu_reasm_prepare may recycle a slot
     already in use by another stream; the (conn_id,stream_id) stamp
     below is how later callbacks detect that clobbering. */

  uint tsorig = (uint)fd_frag_meta_ts_comp( fd_tickcount() );
  fd_tpu_reasm_slot_t * slot = fd_tpu_reasm_prepare( ctx->reasm, tsorig );

  slot->conn_id   = conn_id;
  slot->stream_id = stream_id;

  /* Wire up with QUIC stream so receive/notify callbacks can find the
     slot without a lookup */

  stream->context = slot;
}
339 :
340 : /* quic_stream_receive is called back by the QUIC engine when any stream
341 : in any connection being serviced receives new data. Currently we
342 : simply copy received data out of the xsk (network device memory) into
343 : a local dcache. */
344 :
345 : static void
346 : quic_stream_receive( fd_quic_stream_t * stream,
347 : void * stream_ctx,
348 : uchar const * data,
349 : ulong data_sz,
350 : ulong offset,
351 0 : int fin ) {
352 :
353 0 : (void)fin; /* TODO instantly publish if offset==0UL && fin */
354 :
355 : /* Load TPU state */
356 :
357 0 : fd_quic_t * quic = stream->conn->quic;
358 0 : fd_quic_ctx_t * quic_ctx = quic->cb.quic_ctx;
359 0 : fd_tpu_reasm_t * reasm = quic_ctx->reasm;
360 0 : fd_tpu_reasm_slot_t * slot = stream_ctx;
361 0 : fd_quic_ctx_t * ctx = quic->cb.quic_ctx;
362 :
363 : /* Check if reassembly slot is still valid */
364 :
365 0 : ulong conn_id = stream->conn->local_conn_id;
366 0 : ulong stream_id = stream->stream_id;
367 :
368 0 : if( FD_UNLIKELY( ( slot->conn_id != conn_id ) |
369 0 : ( slot->stream_id != stream_id ) ) ) {
370 0 : return; /* clobbered */
371 0 : }
372 :
373 : /* Append data into chunk, we know this is valid */
374 :
375 0 : int add_err = fd_tpu_reasm_append( reasm, slot, data, data_sz, offset );
376 0 : ctx->metrics.reasm_append[ add_err ]++;
377 0 : }
378 :
379 : /* quic_stream_notify is called back by the QUIC implementation when a
380 : stream is finished. This could either be because it completed
381 : successfully after reading valid data, or it was closed prematurely
382 : for some other reason. All streams must eventually notify.
383 :
384 : If we see a successful QUIC stream notify, it means we have received
385 : a full transaction and should publish it downstream to be verified
386 : and executed. */
387 :
static void
quic_stream_notify( fd_quic_stream_t * stream,
                    void *             stream_ctx,
                    int                type ) {

  /* Load TPU state */

  fd_quic_t *           quic   = stream->conn->quic;
  fd_quic_ctx_t *       ctx    = quic->cb.quic_ctx;
  fd_tpu_reasm_t *      reasm  = ctx->reasm;
  fd_tpu_reasm_slot_t * slot   = stream_ctx;
  fd_stem_context_t *   stem   = ctx->stem;
  fd_frag_meta_t *      mcache = stem->mcaches[0]; /* verify out link */
  void *                base   = ctx->verify_out_mem;

  /* Check if reassembly slot is still valid (it may have been recycled
     for a newer stream by fd_tpu_reasm_prepare) */

  ulong conn_id   = stream->conn->local_conn_id;
  ulong stream_id = stream->stream_id;

  if( FD_UNLIKELY( ( slot->conn_id   != conn_id   ) |
                   ( slot->stream_id != stream_id ) ) ) {
    FD_MCNT_INC( QUIC_TILE, REASSEMBLY_NOTIFY_CLOBBERED, 1UL );
    return;  /* clobbered */
  }

  /* Abort reassembly slot if QUIC stream closes non-gracefully */

  if( FD_UNLIKELY( type!=FD_QUIC_STREAM_NOTIFY_END ) ) {
    FD_MCNT_INC( QUIC_TILE, REASSEMBLY_NOTIFY_ABORTED, 1UL );
    fd_tpu_reasm_cancel( reasm, slot );
    return;  /* not a successful stream close */
  }

  /* Publish the completed transaction downstream and advance our
     sequence number on success */

  ulong seq     = stem->seqs[0];
  uint  tspub   = (uint)fd_frag_meta_ts_comp( fd_tickcount() );
  int   pub_err = fd_tpu_reasm_publish( reasm, slot, mcache, base, seq, tspub );
  ctx->metrics.reasm_publish[ pub_err ]++;
  if( FD_UNLIKELY( pub_err!=FD_TPU_REASM_SUCCESS ) ) return;

  fd_stem_advance( stem, 0UL );
}
432 :
433 : static int
434 : quic_tx_aio_send( void * _ctx,
435 : fd_aio_pkt_info_t const * batch,
436 : ulong batch_cnt,
437 : ulong * opt_batch_idx,
438 0 : int flush ) {
439 0 : (void)flush;
440 :
441 0 : fd_quic_ctx_t * ctx = (fd_quic_ctx_t *)_ctx;
442 :
443 0 : for( ulong i=0; i<batch_cnt; i++ ) {
444 0 : void * dst = fd_chunk_to_laddr( ctx->net_out_mem, ctx->net_out_chunk );
445 0 : fd_memcpy( dst, batch[ i ].buf, batch[ i ].buf_sz );
446 :
447 0 : uchar const * packet = dst;
448 0 : uchar const * packet_end = packet + batch[i].buf_sz;
449 0 : uchar const * iphdr = packet + 14U;
450 :
451 0 : uint test_ethip = ( (uint)packet[12] << 16u ) | ( (uint)packet[13] << 8u ) | (uint)packet[23];
452 0 : uint ip_dstaddr = 0;
453 0 : if( FD_LIKELY( test_ethip==0x080011 ) ) {
454 : /* IPv4 is variable-length, so lookup IHL to find start of UDP */
455 0 : uint iplen = ( ( (uint)iphdr[0] ) & 0x0FU ) * 4U;
456 0 : uchar const * udp = iphdr + iplen;
457 :
458 : /* Ignore if UDP header is too short */
459 0 : if( FD_UNLIKELY( udp+8U>packet_end ) ) {
460 0 : FD_MCNT_INC( QUIC_TILE, QUIC_PACKET_TOO_SMALL, 1UL );
461 0 : continue;
462 0 : }
463 :
464 : /* Extract IP dest addr and UDP dest port */
465 0 : ip_dstaddr = *(uint *)( iphdr+16UL );
466 0 : }
467 :
468 : /* send packets are just round-robined by sequence number, so for now
469 : just indicate where they came from so they don't bounce back */
470 0 : ulong sig = fd_disco_netmux_sig( 0U, 0U, ip_dstaddr, DST_PROTO_OUTGOING, FD_NETMUX_SIG_MIN_HDR_SZ );
471 :
472 0 : ulong tspub = (ulong)fd_frag_meta_ts_comp( fd_tickcount() );
473 0 : fd_mcache_publish( ctx->net_out_mcache,
474 0 : ctx->net_out_depth,
475 0 : ctx->net_out_seq,
476 0 : sig,
477 0 : ctx->net_out_chunk,
478 0 : batch[ i ].buf_sz,
479 0 : 0,
480 0 : 0,
481 0 : tspub );
482 :
483 0 : ctx->net_out_seq = fd_seq_inc( ctx->net_out_seq, 1UL );
484 0 : ctx->net_out_chunk = fd_dcache_compact_next( ctx->net_out_chunk, FD_NET_MTU, ctx->net_out_chunk0, ctx->net_out_wmark );
485 0 : }
486 :
487 0 : if( FD_LIKELY( opt_batch_idx ) ) {
488 0 : *opt_batch_idx = batch_cnt;
489 0 : }
490 :
491 0 : return FD_AIO_SUCCESS;
492 0 : }
493 :
static void
privileged_init( fd_topo_t *      topo,
                 fd_topo_tile_t * tile ) {
  (void)topo; (void)tile;

  /* The fd_quic implementation calls fd_log_wallclock() internally
     which itself calls clock_gettime() which on most kernels is not a
     real syscall but a virtual one in the process via the vDSO.

     The first time this virtual call is made to the vDSO it does an
     mmap(2) of some shared memory into userspace, which cannot
     happen while sandboxed so we need to ensure that initialization
     happens here. */

  fd_log_wallclock();
}
510 :
/* quic_tls_cv_sign is the QUIC TLS signing callback: it signs the
   130-byte certificate-verify payload with the tile's ed25519 identity
   key into the 64-byte signature buffer. */
static void
quic_tls_cv_sign( void *      signer_ctx,
                  uchar       signature[ static 64 ],
                  uchar const payload[ static 130 ] ) {
  fd_quic_ctx_t * ctx    = signer_ctx;
  fd_sha512_t *   sha512 = fd_sha512_join( ctx->sha512 );
  fd_ed25519_sign( signature, payload, 130UL, ctx->tls_pub_key, ctx->tls_priv_key, sha512 );
  fd_sha512_leave( sha512 );
}
520 :
static void
unprivileged_init( fd_topo_t *      topo,
                   fd_topo_tile_t * tile ) {
  void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );

  /* Validate link topology: in link 0 must be "net_quic"; out links
     must be ("quic_verify","quic_net") in that order.

     NOTE(review): the error messages below read in_link_id[1] even when
     in_cnt may be 1 — looks like a potential out-of-bounds read on the
     failure path; confirm in_link_id's storage. */

  if( FD_UNLIKELY( tile->in_cnt<1UL ||
                   strcmp( topo->links[ tile->in_link_id[ 0UL ] ].name, "net_quic" ) ) )
    FD_LOG_ERR(( "quic tile has none or unexpected input links %lu %s %s",
                 tile->in_cnt, topo->links[ tile->in_link_id[ 0 ] ].name, topo->links[ tile->in_link_id[ 1 ] ].name ));

  if( FD_UNLIKELY( tile->out_cnt!=2UL ||
                   strcmp( topo->links[ tile->out_link_id[ 0UL ] ].name, "quic_verify" ) ||
                   strcmp( topo->links[ tile->out_link_id[ 1UL ] ].name, "quic_net" )    ) )
    FD_LOG_ERR(( "quic tile has none or unexpected output links %lu %s %s",
                 tile->out_cnt, topo->links[ tile->out_link_id[ 0 ] ].name, topo->links[ tile->out_link_id[ 1 ] ].name ));

  /* NOTE(review): redundant — in_cnt>=1 is already enforced above */
  if( FD_UNLIKELY( !tile->in_cnt ) ) FD_LOG_ERR(( "quic tile in cnt is zero" ));

  /* The verify out link must match the configured depth */
  ulong depth = tile->quic.depth;
  if( topo->links[ tile->out_link_id[ 0 ] ].depth != depth )
    FD_LOG_ERR(( "quic tile in depths are not equal" ));

  /* Carve the scratch region; order must match scratch_footprint */
  FD_SCRATCH_ALLOC_INIT( l, scratch );
  fd_quic_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_quic_ctx_t ), sizeof( fd_quic_ctx_t ) );

  /* End privileged allocs */

  /* Generate a fresh ed25519 identity key for QUIC TLS */
  FD_TEST( getrandom( ctx->tls_priv_key, ED25519_PRIV_KEY_SZ, 0 )==ED25519_PRIV_KEY_SZ );
  fd_sha512_t * sha512 = fd_sha512_join( fd_sha512_new( ctx->sha512 ) );
  fd_ed25519_public_from_private( ctx->tls_pub_key, ctx->tls_priv_key, sha512 );
  fd_sha512_leave( sha512 );

  /* aio through which fd_quic transmits packets (see quic_tx_aio_send) */
  fd_aio_t * quic_tx_aio = fd_aio_join( fd_aio_new( ctx->quic_tx_aio, ctx, quic_tx_aio_send ) );
  if( FD_UNLIKELY( !quic_tx_aio ) ) FD_LOG_ERR(( "fd_aio_join failed" ));

  fd_quic_limits_t limits = quic_limits( tile );
  fd_quic_t * quic = fd_quic_join( fd_quic_new( FD_SCRATCH_ALLOC_APPEND( l, fd_quic_align(), fd_quic_footprint( &limits ) ), &limits ) );
  if( FD_UNLIKELY( !quic ) ) FD_LOG_ERR(( "fd_quic_join failed" ));

  if( FD_UNLIKELY( tile->quic.ack_delay_millis == 0 ) ) {
    FD_LOG_ERR(( "Invalid `ack_delay_millis`: must be greater than zero" ));
  }
  if( FD_UNLIKELY( tile->quic.ack_delay_millis >= tile->quic.idle_timeout_millis ) ) {
    FD_LOG_ERR(( "Invalid `ack_delay_millis`: must be lower than `idle_timeout_millis`" ));
  }

  /* Configure the QUIC server (timeouts converted from millis to ns) */
  quic->config.role                       = FD_QUIC_ROLE_SERVER;
  quic->config.net.ip_addr                = tile->quic.ip_addr;
  quic->config.net.listen_udp_port        = tile->quic.quic_transaction_listen_port;
  quic->config.idle_timeout               = tile->quic.idle_timeout_millis * 1000000UL;
  quic->config.ack_delay                  = tile->quic.ack_delay_millis * 1000000UL;
  quic->config.initial_rx_max_stream_data = FD_TXN_MTU;
  quic->config.retry                      = tile->quic.retry;
  fd_memcpy( quic->config.link.src_mac_addr, tile->quic.src_mac_addr, 6 );
  fd_memcpy( quic->config.identity_public_key, ctx->tls_pub_key, ED25519_PUB_KEY_SZ );

  quic->config.sign     = quic_tls_cv_sign;
  quic->config.sign_ctx = ctx;

  /* Register this tile's callbacks with the engine */
  quic->cb.conn_new         = quic_conn_new;
  quic->cb.conn_hs_complete = NULL;
  quic->cb.conn_final       = NULL;
  quic->cb.stream_new       = quic_stream_new;
  quic->cb.stream_receive   = quic_stream_receive;
  quic->cb.stream_notify    = quic_stream_notify;
  quic->cb.now              = quic_now;
  quic->cb.now_ctx          = NULL;
  quic->cb.quic_ctx         = ctx;

  fd_quic_set_aio_net_tx( quic, quic_tx_aio );
  if( FD_UNLIKELY( !fd_quic_init( quic ) ) ) FD_LOG_ERR(( "fd_quic_init failed" ));

  /* Put a bound on chunks we read from the input, to make sure they
     are within in the data region of the workspace. */
  fd_topo_link_t * link0 = &topo->links[ tile->in_link_id[ 0 ] ];

  /* All polled inputs must share link0's workspace and MTU so a single
     (chunk0,wmark) bound covers them */
  for( ulong i=1UL; i<tile->in_cnt; i++ ) {
    fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ];

    if( FD_UNLIKELY( !tile->in_link_poll[ i ] ) ) continue;

    if( FD_UNLIKELY( topo->objs[ link0->dcache_obj_id ].wksp_id!=topo->objs[ link->dcache_obj_id ].wksp_id ) ) FD_LOG_ERR(( "quic tile reads input from multiple workspaces" ));
    if( FD_UNLIKELY( link0->mtu!=link->mtu ) ) FD_LOG_ERR(( "quic tile reads input from multiple links with different MTUs" ));
  }

  ctx->in_mem    = topo->workspaces[ topo->objs[ link0->dcache_obj_id ].wksp_id ].wksp;
  ctx->in_chunk0 = fd_disco_compact_chunk0( ctx->in_mem );
  ctx->in_wmark  = fd_disco_compact_wmark ( ctx->in_mem, link0->mtu );

  /* Wire up the manually managed out link to the net tile */
  fd_topo_link_t * net_out = &topo->links[ tile->out_link_id[ 1 ] ];

  ctx->net_out_mcache = net_out->mcache;
  ctx->net_out_sync   = fd_mcache_seq_laddr( ctx->net_out_mcache );
  ctx->net_out_depth  = fd_mcache_depth( ctx->net_out_mcache );
  ctx->net_out_seq    = fd_mcache_seq_query( ctx->net_out_sync );
  ctx->net_out_chunk0 = fd_dcache_compact_chunk0( fd_wksp_containing( net_out->dcache ), net_out->dcache );
  ctx->net_out_mem    = topo->workspaces[ topo->objs[ net_out->dcache_obj_id ].wksp_id ].wksp;
  ctx->net_out_wmark  = fd_dcache_compact_wmark ( ctx->net_out_mem, net_out->dcache, net_out->mtu );
  ctx->net_out_chunk  = ctx->net_out_chunk0;

  /* Verify out link uses a tpu_reasm object instead of a plain dcache */
  fd_topo_link_t * verify_out = &topo->links[ tile->out_link_id[ 0 ] ];

  ctx->verify_out_mem = topo->workspaces[ topo->objs[ verify_out->reasm_obj_id ].wksp_id ].wksp;

  ctx->reasm = verify_out->reasm;
  if( FD_UNLIKELY( !ctx->reasm ) )
    FD_LOG_ERR(( "invalid tpu_reasm parameters" ));

  ctx->conn_seq = 0UL;

  ctx->quic        = quic;
  ctx->quic_rx_aio = fd_quic_get_aio_net_rx( quic );

  ctx->round_robin_cnt = fd_topo_tile_name_cnt( topo, tile->name );
  ctx->round_robin_id  = tile->kind_id;

  /* Sanity check that we didn't allocate past the computed footprint */
  ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL );
  if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
    FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) ));
}
641 :
/* populate_allowed_seccomp installs the generated seccomp filter for
   this tile (see generated/quic_seccomp.h), allowing the logfile fd. */
static ulong
populate_allowed_seccomp( fd_topo_t const *      topo,
                          fd_topo_tile_t const * tile,
                          ulong                  out_cnt,
                          struct sock_filter *   out ) {
  (void)topo;
  (void)tile;

  populate_sock_filter_policy_quic( out_cnt, out, (uint)fd_log_private_logfile_fd() );
  return sock_filter_policy_quic_instr_cnt;
}
653 :
654 : static ulong
655 : populate_allowed_fds( fd_topo_t const * topo,
656 : fd_topo_tile_t const * tile,
657 : ulong out_fds_cnt,
658 0 : int * out_fds ) {
659 0 : (void)topo;
660 0 : (void)tile;
661 :
662 0 : if( FD_UNLIKELY( out_fds_cnt<2UL ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
663 :
664 0 : ulong out_cnt = 0UL;
665 0 : out_fds[ out_cnt++ ] = 2; /* stderr */
666 0 : if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
667 0 : out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
668 0 : return out_cnt;
669 0 : }
670 :
/* Stem run-loop configuration.  STEM_BURST reserves capacity for one
   downstream publish per loop iteration (presumably the maximum number
   of frags a single callback invocation may emit — see fd_stem.c). */
#define STEM_BURST (1UL)

#define STEM_CALLBACK_CONTEXT_TYPE  fd_quic_ctx_t
#define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_quic_ctx_t)

/* Wire this tile's callbacks into the generic stem run loop */
#define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
#define STEM_CALLBACK_METRICS_WRITE       metrics_write
#define STEM_CALLBACK_BEFORE_CREDIT       before_credit
#define STEM_CALLBACK_BEFORE_FRAG         before_frag
#define STEM_CALLBACK_DURING_FRAG         during_frag
#define STEM_CALLBACK_AFTER_FRAG          after_frag

#include "../../../../disco/stem/fd_stem.c"
684 :
/* Tile descriptor exported to the topology runner. */
fd_topo_run_tile_t fd_tile_quic = {
  .name                     = "quic",
  .populate_allowed_seccomp = populate_allowed_seccomp,
  .populate_allowed_fds     = populate_allowed_fds,
  .scratch_align            = scratch_align,
  .scratch_footprint        = scratch_footprint,
  .privileged_init          = privileged_init,
  .unprivileged_init        = unprivileged_init,
  .run                      = stem_run,
};
|