Line data Source code
1 : #include "fd_quic_tile.h"
2 : #include "../metrics/fd_metrics.h"
3 : #include "../stem/fd_stem.h"
4 : #include "../topo/fd_topo.h"
5 : #include "fd_tpu.h"
6 : #include "../../waltz/quic/fd_quic_private.h"
7 : #include "generated/quic_seccomp.h"
8 : #include "../../util/io/fd_io.h"
9 : #include "../../util/net/fd_eth.h"
10 :
11 : #include <errno.h>
12 : #include <fcntl.h>
13 : #include <linux/unistd.h>
14 : #include <sys/random.h>
15 :
16 : #define OUT_IDX_VERIFY 0
17 0 : #define OUT_IDX_NET 1
18 :
19 0 : #define FD_QUIC_KEYLOG_FLUSH_INTERVAL_NS ((long)100e6)
20 :
21 : /* fd_quic_tile provides a TPU server tile.
22 :
23 : This tile handles incoming transactions that clients request to be
24 : included in blocks. Supported protocols currently include TPU/UDP
25 : and TPU/QUIC.
26 :
27 : The fd_quic tile acts as a plain old Tango producer. The tile will
28 :    defragment multi-packet TPU streams coming in from QUIC, such that
29 : each frag_meta refers to a complete txn. This requires the dcache
30 : mtu to be at least that of the largest allowed serialized txn size.
31 :
32 : QUIC tiles don't service network devices directly, but rely on net
33 : tiles to send and receive packets. An arbitrary number of QUIC tiles
34 : can be run. Each UDP flow must stick to one QUIC tile. */
35 :
36 : static inline fd_quic_limits_t
37 0 : quic_limits( fd_topo_tile_t const * tile ) {
38 0 : fd_quic_limits_t limits = {
39 0 : .conn_cnt = tile->quic.max_concurrent_connections,
40 0 : .handshake_cnt = tile->quic.max_concurrent_handshakes,
41 :
42 : /* fd_quic will not issue nor use any new connection IDs after
43 : completing a handshake. Connection migration is not supported
44 : either. */
45 0 : .conn_id_cnt = FD_QUIC_MIN_CONN_ID_CNT,
46 0 : .inflight_frame_cnt = 64UL * tile->quic.max_concurrent_connections,
47 0 : .min_inflight_frame_cnt_conn = 32UL
48 0 : };
49 0 : if( FD_UNLIKELY( !fd_quic_footprint( &limits ) ) ) {
50 0 : FD_LOG_ERR(( "Invalid QUIC limits in config" ));
51 0 : }
52 0 : return limits;
53 0 : }
54 :
55 : FD_FN_CONST static inline ulong
56 0 : scratch_align( void ) {
57 0 : return fd_ulong_max( alignof(fd_quic_ctx_t), fd_quic_align() );
58 0 : }
59 :
/* scratch_footprint returns the total scratch memory footprint for
   this tile: the tile context, the fd_quic object, and the TPU
   reassembly buffers, laid out in that order (matches the
   FD_SCRATCH_ALLOC_APPEND sequence in unprivileged_init). */

static inline ulong
scratch_footprint( fd_topo_tile_t const * tile ) {
  ulong out_depth = tile->quic.out_depth;
  ulong reasm_max = tile->quic.reasm_cnt;

  fd_quic_limits_t limits = quic_limits( tile ); /* May FD_LOG_ERR */
  ulong l = FD_LAYOUT_INIT;
  l = FD_LAYOUT_APPEND( l, alignof( fd_quic_ctx_t ), sizeof( fd_quic_ctx_t ) );
  l = FD_LAYOUT_APPEND( l, fd_quic_align(), fd_quic_footprint( &limits ) );
  l = FD_LAYOUT_APPEND( l, fd_tpu_reasm_align(), fd_tpu_reasm_footprint( out_depth, reasm_max ) );
  return FD_LAYOUT_FINI( l, scratch_align() );
}
72 :
73 : /* legacy_stream_notify is called for transactions sent via TPU/UDP. For
74 : now both QUIC and non-QUIC transactions are accepted, with traffic
75 : type determined by port.
76 :
77 : UDP transactions must fit in one packet and cannot be fragmented, and
78 : notify here means the entire packet was received. */
79 :
static void
legacy_stream_notify( fd_quic_ctx_t * ctx,
                      uchar *         packet,
                      ulong           packet_sz,
                      uint            ipv4 ) {

  /* Publish directly at the current stem sequence number; the frag is
     complete (UDP txns are never fragmented), so the fast-path reasm
     publish can be used. */
  long                tspub  = ctx->now;
  fd_tpu_reasm_t *    reasm  = ctx->reasm;
  fd_stem_context_t * stem   = ctx->stem;
  fd_frag_meta_t *    mcache = stem->mcaches[0];
  void *              base   = ctx->verify_out_mem;
  ulong               seq    = stem->seqs[0];

  int err = fd_tpu_reasm_publish_fast( reasm, packet, packet_sz, mcache, base, seq, tspub, ipv4, FD_TXN_M_TPU_SOURCE_UDP );
  if( FD_LIKELY( err==FD_TPU_REASM_SUCCESS ) ) {
    /* Only advance the stem sequence if a frag was actually published */
    fd_stem_advance( stem, 0UL );
    ctx->metrics.txns_received_udp++;
  }
}
99 :
100 : /* This tile always publishes messages downstream, even if there are
101 : no credits available. It ignores the flow control of the downstream
102 : verify tile. This is OK as the verify tile is written to expect
103 : this behavior, and enables the QUIC tile to publish as fast as it
104 : can. It would currently be difficult trying to backpressure further
105 : up the stack to the network itself. */
106 : static inline void
107 : before_credit( fd_quic_ctx_t * ctx,
108 : fd_stem_context_t * stem,
109 0 : int * charge_busy ) {
110 0 : ctx->stem = stem;
111 :
112 : /* Publishes to mcache via callbacks */
113 0 : long now = fd_clock_now( ctx->clock );
114 0 : ctx->now = now;
115 0 : *charge_busy = fd_quic_service( ctx->quic, now );
116 0 : }
117 :
/* metrics_write copies this tile's counters/gauges/histograms into the
   shared metrics region.  Tile-local counters (ctx->metrics) come
   first, followed by the counters maintained inside the fd_quic
   object. */

static inline void
metrics_write( fd_quic_ctx_t * ctx ) {
  /* Tile-level reassembly and txn accounting */
  FD_MCNT_SET  ( QUIC, TXNS_RECEIVED_UDP,       ctx->metrics.txns_received_udp );
  FD_MCNT_SET  ( QUIC, TXNS_RECEIVED_QUIC_FAST, ctx->metrics.txns_received_quic_fast );
  FD_MCNT_SET  ( QUIC, TXNS_RECEIVED_QUIC_FRAG, ctx->metrics.txns_received_quic_frag );
  FD_MCNT_SET  ( QUIC, FRAGS_OK,                ctx->metrics.frag_ok_cnt );
  FD_MCNT_SET  ( QUIC, FRAGS_GAP,               ctx->metrics.frag_gap_cnt );
  FD_MCNT_SET  ( QUIC, FRAGS_DUP,               ctx->metrics.frag_dup_cnt );
  FD_MCNT_SET  ( QUIC, TXNS_OVERRUN,            ctx->metrics.reasm_overrun );
  FD_MCNT_SET  ( QUIC, TXNS_ABANDONED,          ctx->metrics.reasm_abandoned );
  FD_MCNT_SET  ( QUIC, TXN_REASMS_STARTED,      ctx->metrics.reasm_started );
  /* reasm_active is a signed running balance; clamp at zero for the gauge */
  FD_MGAUGE_SET( QUIC, TXN_REASMS_ACTIVE, (ulong)fd_long_max( ctx->metrics.reasm_active, 0L ) );

  FD_MCNT_SET( QUIC, LEGACY_TXN_UNDERSZ, ctx->metrics.udp_pkt_too_small );
  FD_MCNT_SET( QUIC, LEGACY_TXN_OVERSZ,  ctx->metrics.udp_pkt_too_large );
  FD_MCNT_SET( QUIC, TXN_UNDERSZ,        ctx->metrics.quic_txn_too_small );
  FD_MCNT_SET( QUIC, TXN_OVERSZ,         ctx->metrics.quic_txn_too_large );

  /* Counters maintained by fd_quic itself */
  FD_MCNT_SET( QUIC, RECEIVED_PACKETS, ctx->quic->metrics.net_rx_pkt_cnt );
  FD_MCNT_SET( QUIC, RECEIVED_BYTES,   ctx->quic->metrics.net_rx_byte_cnt );
  FD_MCNT_SET( QUIC, SENT_PACKETS,     ctx->quic->metrics.net_tx_pkt_cnt );
  FD_MCNT_SET( QUIC, SENT_BYTES,       ctx->quic->metrics.net_tx_byte_cnt );
  FD_MCNT_SET( QUIC, RETRY_SENT,       ctx->quic->metrics.retry_tx_cnt );

  FD_MGAUGE_ENUM_COPY( QUIC, CONNECTIONS_STATE, ctx->quic->metrics.conn_state_cnt );
  FD_MGAUGE_SET( QUIC, CONNECTIONS_ALLOC,     ctx->quic->metrics.conn_alloc_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_CREATED,     ctx->quic->metrics.conn_created_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_CLOSED,      ctx->quic->metrics.conn_closed_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_ABORTED,     ctx->quic->metrics.conn_aborted_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_TIMED_OUT,   ctx->quic->metrics.conn_timeout_cnt );
  FD_MCNT_SET( QUIC, CONNECTIONS_RETRIED,     ctx->quic->metrics.conn_retry_cnt );

  FD_MCNT_SET( QUIC, CONNECTION_ERROR_NO_SLOTS,   ctx->quic->metrics.conn_err_no_slots_cnt );
  FD_MCNT_SET( QUIC, CONNECTION_ERROR_RETRY_FAIL, ctx->quic->metrics.conn_err_retry_fail_cnt );

  /* Packet-level error counters */
  FD_MCNT_ENUM_COPY( QUIC, PKT_CRYPTO_FAILED, ctx->quic->metrics.pkt_decrypt_fail_cnt );
  FD_MCNT_ENUM_COPY( QUIC, PKT_NO_KEY,        ctx->quic->metrics.pkt_no_key_cnt );
  FD_MCNT_ENUM_COPY( QUIC, PKT_NO_CONN,       ctx->quic->metrics.pkt_no_conn_cnt );
  FD_MCNT_ENUM_COPY( QUIC, FRAME_TX_ALLOC,    ctx->quic->metrics.frame_tx_alloc_cnt );
  FD_MCNT_SET( QUIC, PKT_NET_HEADER_INVALID,  ctx->quic->metrics.pkt_net_hdr_err_cnt );
  FD_MCNT_SET( QUIC, PKT_QUIC_HEADER_INVALID, ctx->quic->metrics.pkt_quic_hdr_err_cnt );
  FD_MCNT_SET( QUIC, PKT_UNDERSZ,             ctx->quic->metrics.pkt_undersz_cnt );
  FD_MCNT_SET( QUIC, PKT_OVERSZ,              ctx->quic->metrics.pkt_oversz_cnt );
  FD_MCNT_SET( QUIC, PKT_VERNEG,              ctx->quic->metrics.pkt_verneg_cnt );
  FD_MCNT_SET( QUIC, PKT_RETRANSMISSIONS,     ctx->quic->metrics.pkt_retransmissions_cnt );
  FD_MCNT_ENUM_COPY( QUIC, INITIAL_TOKEN_LEN, ctx->quic->metrics.initial_token_len_cnt );

  /* Handshake counters */
  FD_MCNT_SET( QUIC, HANDSHAKES_CREATED,         ctx->quic->metrics.hs_created_cnt );
  FD_MCNT_SET( QUIC, HANDSHAKE_ERROR_ALLOC_FAIL, ctx->quic->metrics.hs_err_alloc_fail_cnt );
  FD_MCNT_SET( QUIC, HANDSHAKE_EVICTED,          ctx->quic->metrics.hs_evicted_cnt );

  FD_MCNT_SET( QUIC, STREAM_RECEIVED_EVENTS, ctx->quic->metrics.stream_rx_event_cnt );
  FD_MCNT_SET( QUIC, STREAM_RECEIVED_BYTES,  ctx->quic->metrics.stream_rx_byte_cnt );

  FD_MCNT_ENUM_COPY( QUIC, RECEIVED_FRAMES,  ctx->quic->metrics.frame_rx_cnt );
  FD_MCNT_SET      ( QUIC, FRAME_FAIL_PARSE, ctx->quic->metrics.frame_rx_err_cnt );

  FD_MCNT_ENUM_COPY( QUIC, ACK_TX, ctx->quic->metrics.ack_tx );

  /* Service/receive latency histograms */
  FD_MHIST_COPY( QUIC, SERVICE_DURATION_SECONDS, ctx->quic->metrics.service_duration );
  FD_MHIST_COPY( QUIC, RECEIVE_DURATION_SECONDS, ctx->quic->metrics.receive_duration );
}
180 :
181 : static int
182 : before_frag( fd_quic_ctx_t * ctx,
183 : ulong in_idx,
184 : ulong seq,
185 0 : ulong sig ) {
186 0 : (void)in_idx;
187 0 : (void)seq;
188 :
189 0 : ulong proto = fd_disco_netmux_sig_proto( sig );
190 0 : if( FD_UNLIKELY( proto!=DST_PROTO_TPU_UDP && proto!=DST_PROTO_TPU_QUIC ) ) return 1;
191 :
192 0 : ulong hash = fd_disco_netmux_sig_hash( sig );
193 0 : if( FD_UNLIKELY( (hash % ctx->round_robin_cnt) != ctx->round_robin_id ) ) return 1;
194 :
195 0 : return 0;
196 0 : }
197 :
/* during_frag copies the frag payload out of the (potentially
   overrun-prone) net dcache into the tile-local buffer so after_frag
   can process it safely. */

static void
during_frag( fd_quic_ctx_t * ctx,
             ulong           in_idx,
             ulong           seq FD_PARAM_UNUSED,
             ulong           sig FD_PARAM_UNUSED,
             ulong           chunk,
             ulong           sz,
             ulong           ctl ) {
  /* Translate and bounds-check the chunk against this in-link's dcache */
  void const * src = fd_net_rx_translate_frag( &ctx->net_in_bounds[ in_idx ], chunk, ctl, sz );

  /* FIXME this copy could be eliminated by combining it with the decrypt operation */
  fd_memcpy( ctx->buffer, src, sz );
}
211 :
/* after_frag demuxes one accepted net frag (already copied into
   ctx->buffer by during_frag).  QUIC packets are handed to fd_quic;
   plain TPU/UDP packets are size-validated and forwarded to the
   verify link via legacy_stream_notify. */

static void
after_frag( fd_quic_ctx_t *     ctx,
            ulong               in_idx,
            ulong               seq,
            ulong               sig,
            ulong               sz,
            ulong               tsorig,
            ulong               tspub,
            fd_stem_context_t * stem ) {
  (void)in_idx;
  (void)seq;
  (void)tsorig;
  (void)tspub;
  (void)stem;

  ulong proto = fd_disco_netmux_sig_proto( sig );

  if( FD_LIKELY( proto==DST_PROTO_TPU_QUIC ) ) {
    if( FD_UNLIKELY( sz<sizeof(fd_eth_hdr_t) ) ) FD_LOG_ERR(( "QUIC packet too small" ));
    /* Strip the Ethernet header; fd_quic consumes the IP packet */
    uchar * ip_pkt = ctx->buffer + sizeof(fd_eth_hdr_t);
    ulong ip_sz = sz - sizeof(fd_eth_hdr_t);

    fd_quic_t * quic = ctx->quic;
    fd_quic_process_packet( quic, ip_pkt, ip_sz, ctx->now );
  } else if( FD_LIKELY( proto==DST_PROTO_TPU_UDP ) ) {
    ulong network_hdr_sz = fd_disco_netmux_sig_hdr_sz( sig );
    if( FD_UNLIKELY( sz<=network_hdr_sz ) ) {
      /* Transaction not valid if the packet isn't large enough for the network
         headers. */
      ctx->metrics.udp_pkt_too_small++;
      return;
    }

    ulong data_sz = sz - network_hdr_sz;
    if( FD_UNLIKELY( data_sz<FD_TXN_MIN_SERIALIZED_SZ ) ) {
      /* Smaller than the smallest possible transaction */
      ctx->metrics.udp_pkt_too_small++;
      return;
    }

    if( FD_UNLIKELY( data_sz>FD_TPU_MTU ) ) {
      /* Transaction couldn't possibly be valid if it's longer than transaction
         MTU so drop it. This is not required, as the txn will fail to parse,
         but it's a nice short circuit. */
      ctx->metrics.udp_pkt_too_large++;
      return;
    }

    /* Forward the UDP payload (network headers stripped) downstream */
    legacy_stream_notify( ctx, ctx->buffer+network_hdr_sz, data_sz, fd_disco_netmux_sig_ip( sig ) );
  }
}
263 :
264 : static void
265 : quic_conn_final( fd_quic_conn_t * conn,
266 0 : void * quic_ctx ) {
267 0 : fd_quic_ctx_t * ctx = quic_ctx;
268 0 : long abandon_cnt = fd_long_max( conn->srx->rx_streams_active, 0L );
269 0 : ctx->metrics.reasm_active -= abandon_cnt;
270 0 : ctx->metrics.reasm_abandoned += (ulong)abandon_cnt;
271 0 : }
272 :
/* quic_stream_rx is the fd_quic callback for received stream data.
   Single-frag txns (offset==0 && fin) take a fast path straight to
   publish; multi-frag txns go through the fd_tpu_reasm buffers keyed
   by (conn_uid, stream_id).  Starting a new reassembly may evict the
   oldest slot, in which case the evicted slot's owning connection (if
   it still exists) has its active-stream count decremented.  Returns
   FD_QUIC_SUCCESS for drops/accepts, FD_QUIC_FAILED only on a frag
   gap (so fd_quic can abort the stream). */

static int
quic_stream_rx( fd_quic_conn_t * conn,
                ulong            stream_id,
                ulong            offset,
                uchar const *    data,
                ulong            data_sz,
                int              fin ) {

  fd_quic_t * quic = conn->quic;
  fd_quic_state_t * state = fd_quic_get_state( quic ); /* ugly */
  fd_quic_ctx_t * ctx = quic->cb.quic_ctx;
  long tspub = ctx->now;
  fd_tpu_reasm_t * reasm = ctx->reasm;
  ulong conn_uid = fd_quic_conn_uid( conn );
  fd_stem_context_t * stem = ctx->stem;
  fd_frag_meta_t * mcache = stem->mcaches[0];
  void * base = ctx->verify_out_mem;
  ulong seq = stem->seqs[0];

  /* Would this frag extend the stream beyond the txn MTU? */
  int oversz = offset+data_sz > FD_TPU_MTU;

  if( offset==0UL && fin ) {
    /* Fast path */
    if( FD_UNLIKELY( data_sz<FD_TXN_MIN_SERIALIZED_SZ ) ) {
      ctx->metrics.quic_txn_too_small++;
      return FD_QUIC_SUCCESS; /* drop */
    }
    if( FD_UNLIKELY( oversz ) ) {
      ctx->metrics.quic_txn_too_large++;
      return FD_QUIC_SUCCESS; /* drop */
    }
    int err = fd_tpu_reasm_publish_fast( reasm, data, data_sz, mcache, base, seq, tspub, conn->peer->ip_addr, FD_TXN_M_TPU_SOURCE_QUIC );
    if( FD_LIKELY( err==FD_TPU_REASM_SUCCESS ) ) {
      fd_stem_advance( stem, 0UL );
      ctx->metrics.txns_received_quic_fast++;
    }
    return FD_QUIC_SUCCESS;
  }

  if( data_sz==0UL && !fin ) return FD_QUIC_SUCCESS; /* noop */

  /* Look up an in-progress reassembly for this stream */
  fd_tpu_reasm_slot_t * slot = fd_tpu_reasm_query( reasm, conn_uid, stream_id );

  if( !slot ) { /* start a new reassembly */
    if( offset>0 ) {
      /* First frag we see must start at offset 0 */
      ctx->metrics.frag_gap_cnt++;
      return FD_QUIC_SUCCESS;
    }
    if( data_sz==0 ) return FD_QUIC_SUCCESS; /* ignore empty */
    if( FD_UNLIKELY( oversz ) ) {
      ctx->metrics.quic_txn_too_large++;
      return FD_QUIC_SUCCESS; /* drop */
    }

    /* Was the reasm buffer we evicted busy? */
    fd_tpu_reasm_slot_t * victim = fd_tpu_reasm_peek_tail( reasm );
    int victim_busy = victim->k.state == FD_TPU_REASM_STATE_BUSY;

    /* If so, does the connection it refers to still exist?
       (Or was the buffer previously abandoned by means of conn close) */
    uint victim_cidx = fd_quic_conn_uid_idx( victim->k.conn_uid );
    uint victim_gen = fd_quic_conn_uid_gen( victim->k.conn_uid );
    fd_quic_conn_t * victim_conn = fd_quic_conn_at_idx( state, victim_cidx ); /* possibly oob */
    if( victim_busy ) {
      /* Branchless accounting: victim_exists is 0 or 1 */
      uint victim_exists = (victim_conn->conn_gen == victim_gen) &
                           (victim_conn->state == FD_QUIC_CONN_STATE_ACTIVE); /* in [0,1] */
      victim_conn->srx->rx_streams_active -= victim_exists;
      ctx->metrics.reasm_overrun += victim_exists;
      ctx->metrics.reasm_active -= victim_exists;
    }

    slot = fd_tpu_reasm_prepare( reasm, conn_uid, stream_id, tspub ); /* infallible */
    ctx->metrics.reasm_started++;
    ctx->metrics.reasm_active++;
    conn->srx->rx_streams_active++;
  } else if( slot->k.state != FD_TPU_REASM_STATE_BUSY ) {
    /* Slot was found but already published/freed: duplicate delivery */
    ctx->metrics.frag_dup_cnt++;
    return FD_QUIC_SUCCESS;
  }

  /* Append this frag to the reassembly buffer */
  int reasm_res = fd_tpu_reasm_frag( reasm, slot, data, data_sz, offset );
  if( FD_UNLIKELY( reasm_res != FD_TPU_REASM_SUCCESS ) ) {
    int is_gap = reasm_res==FD_TPU_REASM_ERR_SKIP;
    int is_oversz = reasm_res==FD_TPU_REASM_ERR_SZ;
    ctx->metrics.frag_gap_cnt += (ulong)is_gap;
    ctx->metrics.quic_txn_too_large += (ulong)is_oversz;
    /* Only a gap is reported back to fd_quic as a failure */
    return is_gap ? FD_QUIC_FAILED : FD_QUIC_SUCCESS;
  }
  ctx->metrics.frag_ok_cnt++;

  if( fin ) {
    if( FD_UNLIKELY( slot->k.sz < FD_TXN_MIN_SERIALIZED_SZ ) ) {
      ctx->metrics.quic_txn_too_small++;
      return FD_QUIC_SUCCESS; /* ignore */
    }
    int pub_err = fd_tpu_reasm_publish( reasm, slot, mcache, base, seq, tspub, conn->peer->ip_addr, FD_TXN_M_TPU_SOURCE_QUIC );
    if( FD_UNLIKELY( pub_err!=FD_TPU_REASM_SUCCESS ) ) return FD_QUIC_SUCCESS; /* unreachable */
    ulong * rcv_cnt = (offset==0UL && fin) ? &ctx->metrics.txns_received_quic_fast : &ctx->metrics.txns_received_quic_frag;
    (*rcv_cnt)++;
    ctx->metrics.reasm_active--;
    conn->srx->rx_streams_active--;

    fd_stem_advance( stem, 0UL );
  }

  return FD_QUIC_SUCCESS;
}
380 :
/* quic_tx_aio_send is the fd_aio callback fd_quic uses to transmit
   packets.  Each IP packet in the batch is prefixed with a synthetic
   Ethernet header (zeroed MACs, filled in downstream by the net tile)
   and published on the quic_net link. */

static int
quic_tx_aio_send( void *                    _ctx,
                  fd_aio_pkt_info_t const * batch,
                  ulong                     batch_cnt,
                  ulong *                   opt_batch_idx,
                  int                       flush ) {
  (void)flush;

  fd_quic_ctx_t * ctx = _ctx;

  for( ulong i=0; i<batch_cnt; i++ ) {
    /* Skip runt packets that can't even hold the minimal headers */
    if( FD_UNLIKELY( batch[ i ].buf_sz<FD_NETMUX_SIG_MIN_HDR_SZ ) ) continue;

    /* Destination IP is read out of the IPv4 header for the sig */
    uint const ip_dst = FD_LOAD( uint, batch[ i ].buf+offsetof( fd_ip4_hdr_t, daddr_c ) );
    uchar * packet_l2 = fd_chunk_to_laddr( ctx->net_out_mem, ctx->net_out_chunk );
    uchar * packet_l3 = packet_l2 + sizeof(fd_eth_hdr_t);
    /* Zero dst+src MAC (12 bytes); net tile fills them in */
    memset( packet_l2, 0, 12 );
    FD_STORE( ushort, packet_l2+offsetof( fd_eth_hdr_t, net_type ), fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) );
    fd_memcpy( packet_l3, batch[ i ].buf, batch[ i ].buf_sz );
    ulong sz_l2 = sizeof(fd_eth_hdr_t) + batch[ i ].buf_sz;

    /* send packets are just round-robined by sequence number, so for now
       just indicate where they came from so they don't bounce back */
    ulong sig = fd_disco_netmux_sig( ip_dst, 0U, ip_dst, DST_PROTO_OUTGOING, FD_NETMUX_SIG_MIN_HDR_SZ );

    ulong chunk = ctx->net_out_chunk;
    ulong ctl   = fd_frag_meta_ctl( 0UL, 1, 1, 0 );
    fd_stem_publish( ctx->stem, OUT_IDX_NET, sig, chunk, sz_l2, ctl, 0L, 0L );

    /* Advance to the next dcache chunk for the following packet */
    ctx->net_out_chunk = fd_dcache_compact_next( chunk, FD_NET_MTU, ctx->net_out_chunk0, ctx->net_out_wmark );
  }

  /* Report the whole batch as consumed (runts included) */
  if( FD_LIKELY( opt_batch_idx ) ) {
    *opt_batch_idx = batch_cnt;
  }

  return FD_AIO_SUCCESS;
}
419 :
/* quic_tls_keylog appends one SSLKEYLOGFILE-style line (plus a
   trailing newline) to the buffered keylog stream.  The stream is
   flushed lazily here only when the buffer can't hold the new line;
   periodic flushing happens in during_housekeeping. */

static void
quic_tls_keylog( void *       _ctx,
                 char const * line ) {
  fd_quic_ctx_t * ctx = _ctx;
  fd_io_buffered_ostream_t * os = &ctx->keylog_stream;

  /* Lazily flush ostream */
  ulong line_sz = strlen( line )+1;  /* +1 for the '\n' appended below */
  ulong peek_sz = fd_io_buffered_ostream_peek_sz( os );
  if( FD_UNLIKELY( peek_sz<line_sz ) ) {
    int err = fd_io_buffered_ostream_flush( os );
    if( FD_UNLIKELY( err ) ) {
      FD_LOG_ERR(( "fd_io_buffered_ostream_flush(keylog) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    }
    peek_sz = fd_io_buffered_ostream_peek_sz( os );
  }
  /* If the line still doesn't fit the buffer is fundamentally too small */
  if( FD_UNLIKELY( peek_sz<line_sz ) ) {
    FD_LOG_ERR(( "keylog buffer too small (buf_sz=%lu, line_sz=%lu)", peek_sz, line_sz ));
  }

  /* Append line */
  char * cur = fd_io_buffered_ostream_peek( os );
  cur = fd_cstr_append_text( cur, line, strlen( line ) );
  cur = fd_cstr_append_char( cur, '\n' );
  fd_io_buffered_ostream_seek( os, line_sz );
}
446 :
/* during_housekeeping performs low-frequency maintenance: clock
   recalibration when due, and a periodic flush of the keylog stream
   (if keylogging is enabled) so lines become visible to external
   consumers without waiting for the buffer to fill. */

static void
during_housekeeping( fd_quic_ctx_t * ctx ) {
  /* Recalibrate the tile clock when the recal deadline has passed */
  if( FD_UNLIKELY( ctx->recal_next <= ctx->now ) ) {
    ctx->recal_next = fd_clock_default_recal( ctx->clock );
  }

  /* wbuf is non-NULL only when the keylog stream was initialized */
  if( FD_UNLIKELY( ctx->keylog_stream.wbuf ) ) {
    long now = ctx->now = fd_clock_now( ctx->clock );
    if( FD_UNLIKELY( now > ctx->keylog_next_flush ) ) {
      int err = fd_io_buffered_ostream_flush( &ctx->keylog_stream );
      if( FD_UNLIKELY( err ) ) {
        FD_LOG_ERR(( "fd_io_buffered_ostream_flush(keylog) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
      }
      ctx->keylog_next_flush = now + FD_QUIC_KEYLOG_FLUSH_INTERVAL_NS;
    }
  }
}
464 :
/* privileged_init runs before the sandbox is applied.  It zeroes the
   tile context, opens the (optional) TLS keylog file, and warms up the
   vDSO clock path so no mmap is needed once sandboxed. */

static void
privileged_init( fd_topo_t *      topo,
                 fd_topo_tile_t * tile ) {
  fd_quic_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
  if( FD_UNLIKELY( topo->objs[ tile->tile_obj_id ].footprint < scratch_footprint( tile ) ) ) {
    FD_LOG_ERR(( "insufficient tile scratch space" ));
  }
  fd_memset( ctx, 0, sizeof(fd_quic_ctx_t) );
  ctx->keylog_fd = -1;  /* sentinel: keylogging disabled */

  /* Non-empty key_log_path enables TLS key logging (security
     sensitive; hence the WARNING below) */
  if( 0!=strcmp( tile->quic.key_log_path, "" ) ) {
    ctx->keylog_fd = open( tile->quic.key_log_path, O_WRONLY|O_CREAT|O_APPEND, 0644 );
    if( FD_UNLIKELY( ctx->keylog_fd<0 ) ) {
      FD_LOG_ERR(( "open(%s, O_WRONLY|O_CREAT|O_APPEND, 0644) failed (%i-%s)",
                   tile->quic.key_log_path, errno, fd_io_strerror( errno ) ));
    }
    fd_io_buffered_ostream_init( &ctx->keylog_stream, ctx->keylog_fd, ctx->keylog_buf, sizeof(ctx->keylog_buf) );
    FD_LOG_WARNING(( "Logging QUIC encryption keys to %s", tile->quic.key_log_path ));
  }

  /* The fd_quic implementation calls fd_log_wallclock() internally
     which itself calls clock_gettime() which on most kernels is not a
     real syscall but a virtual one in the process via. the vDSO.

     The first time this virtual call is made to the vDSO it does an
     mmap(2) of some shared memory into userspace, which cannot
     happen while sandboxed so we need to ensure that initialization
     happens here. */

  fd_log_wallclock();
}
496 :
497 : static void
498 : quic_tls_cv_sign( void * signer_ctx,
499 : uchar signature[ static 64 ],
500 0 : uchar const payload[ static 130 ] ) {
501 0 : fd_quic_ctx_t * ctx = signer_ctx;
502 0 : fd_sha512_t * sha512 = fd_sha512_join( ctx->sha512 );
503 0 : fd_ed25519_sign( signature, payload, 130UL, ctx->tls_pub_key, ctx->tls_priv_key, sha512 );
504 0 : fd_sha512_leave( sha512 );
505 0 : }
506 :
/* unprivileged_init runs after the sandbox is applied.  It validates
   the topology (in/out links), carves the scratch region into the tile
   ctx, the fd_quic object, and the reasm buffers, generates an
   ephemeral TLS identity key, and configures/initializes fd_quic. */

static void
unprivileged_init( fd_topo_t *      topo,
                   fd_topo_tile_t * tile ) {
  void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );

  /* --- Topology validation ------------------------------------- */

  if( FD_UNLIKELY( tile->in_cnt==0 ) ) {
    FD_LOG_ERR(( "quic tile has no input links" ));
  }
  if( FD_UNLIKELY( tile->in_cnt > FD_QUIC_TILE_IN_MAX ) ) {
    FD_LOG_ERR(( "quic tile has too many input links (%lu), max %lu",
                 tile->in_cnt, FD_QUIC_TILE_IN_MAX ));
  }

  /* Exactly two out links in fixed order: verify (0), net (1) */
  if( FD_UNLIKELY( tile->out_cnt!=2UL ||
                   strcmp( topo->links[ tile->out_link_id[ OUT_IDX_VERIFY ] ].name, "quic_verify" ) ||
                   strcmp( topo->links[ tile->out_link_id[ OUT_IDX_NET ] ].name, "quic_net" ) ) )
    FD_LOG_ERR(( "quic tile has none or unexpected output links %lu %s %s",
                 tile->out_cnt, topo->links[ tile->out_link_id[ 0 ] ].name, topo->links[ tile->out_link_id[ 1 ] ].name ));

  /* Config out_depth must match the actual verify link depth (the
     reasm object is sized against it) */
  ulong out_depth = topo->links[ tile->out_link_id[ 0 ] ].depth;
  if( FD_UNLIKELY( tile->quic.out_depth != out_depth ) )
    FD_LOG_ERR(( "tile->quic.out_depth (%u) does not match quic_verify link depth (%lu)",
                 tile->quic.out_depth, out_depth ));

  void * txn_dcache = topo->links[ tile->out_link_id[ 0UL ] ].dcache;
  if( FD_UNLIKELY( !txn_dcache ) ) FD_LOG_ERR(( "Missing output dcache" ));

  /* --- Scratch layout (must mirror scratch_footprint) ----------- */

  FD_SCRATCH_ALLOC_INIT( l, scratch );
  fd_quic_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_quic_ctx_t ), sizeof( fd_quic_ctx_t ) );
  FD_TEST( (ulong)ctx==(ulong)scratch );

  /* Record frag bounds for each net in-link */
  for( ulong i=0; i<tile->in_cnt; i++ ) {
    fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ];
    if( FD_UNLIKELY( 0!=strcmp( link->name, "net_quic" ) ) ) {
      FD_LOG_ERR(( "unexpected input link %s", link->name ));
    }
    fd_net_rx_bounds_init( &ctx->net_in_bounds[ i ], link->dcache );
  }

  /* Tile-local clock */
  fd_clock_t * clock = ctx->clock;
  fd_clock_default_init( clock, ctx->clock_mem );
  ctx->recal_next = fd_clock_recal_next( clock );
  ctx->now        = fd_clock_now( clock );

  /* Ephemeral Ed25519 identity key for TLS */
  if( FD_UNLIKELY( getrandom( ctx->tls_priv_key, ED25519_PRIV_KEY_SZ, 0 )!=ED25519_PRIV_KEY_SZ ) ) {
    FD_LOG_ERR(( "getrandom failed (%i-%s)", errno, fd_io_strerror( errno ) ));
  }
  fd_sha512_t * sha512 = fd_sha512_join( fd_sha512_new( ctx->sha512 ) );
  fd_ed25519_public_from_private( ctx->tls_pub_key, ctx->tls_priv_key, sha512 );
  fd_sha512_leave( sha512 );

  fd_aio_t * quic_tx_aio = fd_aio_join( fd_aio_new( ctx->quic_tx_aio, ctx, quic_tx_aio_send ) );
  if( FD_UNLIKELY( !quic_tx_aio ) ) FD_LOG_ERR(( "fd_aio_join failed" ));

  fd_quic_limits_t limits = quic_limits( tile );
  fd_quic_t * quic = fd_quic_join( fd_quic_new( FD_SCRATCH_ALLOC_APPEND( l, fd_quic_align(), fd_quic_footprint( &limits ) ), &limits ) );
  if( FD_UNLIKELY( !quic ) ) FD_LOG_ERR(( "fd_quic_new failed" ));

  /* TPU reassembly buffers backed by the verify out dcache */
  ulong  orig      = 0UL; /* fd_tango origin ID */
  ulong  reasm_max = tile->quic.reasm_cnt;
  void * reasm_mem = FD_SCRATCH_ALLOC_APPEND( l, fd_tpu_reasm_align(), fd_tpu_reasm_footprint( out_depth, reasm_max ) );
  ctx->reasm = fd_tpu_reasm_join( fd_tpu_reasm_new( reasm_mem, out_depth, reasm_max, orig, txn_dcache ) );
  if( FD_UNLIKELY( !ctx->reasm ) ) FD_LOG_ERR(( "fd_tpu_reasm_new failed" ));

  /* --- fd_quic configuration ------------------------------------ */

  if( FD_UNLIKELY( tile->quic.ack_delay_millis == 0 ) ) {
    FD_LOG_ERR(( "Invalid `ack_delay_millis`: must be greater than zero" ));
  }
  if( FD_UNLIKELY( tile->quic.ack_delay_millis >= tile->quic.idle_timeout_millis ) ) {
    FD_LOG_ERR(( "Invalid `ack_delay_millis`: must be lower than `idle_timeout_millis`" ));
  }

  quic->config.role = FD_QUIC_ROLE_SERVER;
  quic->config.idle_timeout = tile->quic.idle_timeout_millis * (long)1e6;  /* ms -> ns */
  quic->config.ack_delay = tile->quic.ack_delay_millis * (long)1e6;        /* ms -> ns */
  quic->config.initial_rx_max_stream_data = FD_TXN_MTU;
  quic->config.retry = tile->quic.retry;
  fd_memcpy( quic->config.identity_public_key, ctx->tls_pub_key, ED25519_PUB_KEY_SZ );

  quic->config.sign     = quic_tls_cv_sign;
  quic->config.sign_ctx = ctx;

  quic->cb.conn_final = quic_conn_final;
  quic->cb.stream_rx  = quic_stream_rx;
  quic->cb.quic_ctx   = ctx;
  /* Keylog callback only installed if privileged_init opened the file */
  if( ctx->keylog_fd>=0 ) {
    quic->cb.tls_keylog = quic_tls_keylog;
    ctx->keylog_next_flush = fd_log_wallclock() + FD_QUIC_KEYLOG_FLUSH_INTERVAL_NS;
  }

  fd_quic_set_aio_net_tx( quic, quic_tx_aio );
  if( FD_UNLIKELY( !fd_quic_init( quic ) ) ) FD_LOG_ERR(( "fd_quic_init failed" ));

  /* --- Out link dcache bookkeeping ------------------------------ */

  fd_topo_link_t * net_out = &topo->links[ tile->out_link_id[ 1 ] ];

  ctx->net_out_mem    = topo->workspaces[ topo->objs[ net_out->dcache_obj_id ].wksp_id ].wksp;
  ctx->net_out_chunk0 = fd_dcache_compact_chunk0( ctx->net_out_mem, net_out->dcache );
  ctx->net_out_wmark  = fd_dcache_compact_wmark ( ctx->net_out_mem, net_out->dcache, net_out->mtu );
  ctx->net_out_chunk  = ctx->net_out_chunk0;

  fd_topo_link_t * verify_out = &topo->links[ tile->out_link_id[ 0 ] ];

  ctx->verify_out_mem = topo->workspaces[ topo->objs[ verify_out->dcache_obj_id ].wksp_id ].wksp;

  ctx->quic = quic;

  /* Round-robin flow partitioning across identically-named tiles */
  ctx->round_robin_cnt = fd_topo_tile_name_cnt( topo, tile->name );
  ctx->round_robin_id  = tile->kind_id;
  if( FD_UNLIKELY( ctx->round_robin_id >= ctx->round_robin_cnt ) ) {
    FD_LOG_ERR(( "invalid round robin configuration" ));
  }

  /* Sanity check: allocations stayed within the declared footprint */
  ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL );
  if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
    FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) ));

  /* Call new/join here rather than in fd_quic so min/max can differ across uses */
  fd_histf_join( fd_histf_new( ctx->quic->metrics.service_duration, FD_MHIST_SECONDS_MIN( QUIC, SERVICE_DURATION_SECONDS ),
                                                                    FD_MHIST_SECONDS_MAX( QUIC, SERVICE_DURATION_SECONDS ) ) );
  fd_histf_join( fd_histf_new( ctx->quic->metrics.receive_duration, FD_MHIST_SECONDS_MIN( QUIC, RECEIVE_DURATION_SECONDS ),
                                                                    FD_MHIST_SECONDS_MAX( QUIC, RECEIVE_DURATION_SECONDS ) ) );
}
628 :
/* populate_allowed_seccomp installs this tile's seccomp policy,
   parameterized on the logfile fd and the keylog fd (which must remain
   writable inside the sandbox).  Returns the instruction count. */

static ulong
populate_allowed_seccomp( fd_topo_t const *      topo,
                          fd_topo_tile_t const * tile,
                          ulong                  out_cnt,
                          struct sock_filter *   out ) {
  fd_quic_ctx_t const * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
  populate_sock_filter_policy_quic( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->keylog_fd );
  return sock_filter_policy_quic_instr_cnt;
}
638 :
639 : static ulong
640 : populate_allowed_fds( fd_topo_t const * topo,
641 : fd_topo_tile_t const * tile,
642 : ulong out_fds_cnt,
643 0 : int * out_fds ) {
644 0 : fd_quic_ctx_t * ctx = fd_topo_obj_laddr( topo, tile->tile_obj_id );
645 :
646 0 : if( FD_UNLIKELY( out_fds_cnt<3UL ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
647 :
648 0 : ulong out_cnt = 0UL;
649 0 : out_fds[ out_cnt++ ] = 2; /* stderr */
650 0 : if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
651 0 : out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
652 0 : if( ctx->keylog_fd!=-1 )
653 0 : out_fds[ out_cnt++ ] = ctx->keylog_fd;
654 0 : return out_cnt;
655 0 : }
656 :
/* Stem configuration: at most one frag is published per credit
   (BURST), and housekeeping runs roughly every 10ms (LAZY). */
#define STEM_BURST (1UL)
#define STEM_LAZY  ((long)10e6) /* 10ms */

#define STEM_CALLBACK_CONTEXT_TYPE  fd_quic_ctx_t
#define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_quic_ctx_t)

#define STEM_CALLBACK_METRICS_WRITE       metrics_write
#define STEM_CALLBACK_BEFORE_CREDIT       before_credit
#define STEM_CALLBACK_BEFORE_FRAG         before_frag
#define STEM_CALLBACK_DURING_FRAG         during_frag
#define STEM_CALLBACK_AFTER_FRAG          after_frag
#define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping

#include "../stem/fd_stem.c"

/* Tile descriptor exported to the topology runtime */
fd_topo_run_tile_t fd_tile_quic = {
  .name                     = "quic",
  .populate_allowed_seccomp = populate_allowed_seccomp,
  .populate_allowed_fds     = populate_allowed_fds,
  .scratch_align            = scratch_align,
  .scratch_footprint        = scratch_footprint,
  .privileged_init          = privileged_init,
  .unprivileged_init        = unprivileged_init,
  .run                      = stem_run,
};
|