Line data Source code
1 : /* The xdp tile translates between AF_XDP and fd_tango
2 : traffic. It is responsible for setting up the XDP and
3 : XSK socket configuration. */
4 :
5 : #include <errno.h>
6 : #include <fcntl.h>
7 : #include <net/if.h>
8 : #include <netinet/in.h>
9 : #include <sys/socket.h> /* MSG_DONTWAIT needed before importing the net seccomp filter */
10 : #include <linux/if_xdp.h>
11 :
12 : #include "../../metrics/fd_metrics.h"
13 : #include "../../netlink/fd_netlink_tile.h" /* neigh4_solicit */
14 : #include "../../topo/fd_topo.h"
15 :
16 : #include "../../../waltz/ip/fd_fib4.h"
17 : #include "../../../waltz/neigh/fd_neigh4_map.h"
18 : #include "../../../waltz/xdp/fd_xdp_redirect_user.h" /* fd_xsk_activate */
19 : #include "../../../waltz/xdp/fd_xsk.h"
20 : #include "../../../util/log/fd_dtrace.h"
21 : #include "../../../util/net/fd_eth.h"
22 : #include "../../../util/net/fd_ip4.h"
23 :
24 : #include <unistd.h>
25 : #include <linux/if.h> /* struct ifreq */
26 : #include <sys/ioctl.h>
27 : #include <linux/unistd.h>
28 :
29 : #include "generated/xdp_seccomp.h"
30 :
31 : /* MAX_NET_INS controls the max number of TX links that a net tile can
32 : serve. */
33 :
34 : #define MAX_NET_INS (32UL)
35 :
36 : /* FD_XDP_STATS_INTERVAL_NS controls the XDP stats refresh interval.
37 : This should be lower than the interval at which the metrics tile
38 : collects metrics. */
39 :
40 0 : #define FD_XDP_STATS_INTERVAL_NS (11e6) /* 11ms */
41 :
42 : /* REPAIR_PING_SZ is the sz of a ping packet for the repair protocol.
43 : Because pings are routed to the same port as shreds without any
44 : discriminant encoding, we have to use the packet sz to interpret the
45 : payload. Note that any valid shred must be either FD_SHRED_MAX_SZ
46 : or FD_SHRED_MIN_SZ bytes in size, i.e. will never be REPAIR_PING_SZ. */
47 :
48 : #define REPAIR_PING_SZ (174UL)
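/* For example, in net_rx_packet below a datagram of exactly
   REPAIR_PING_SZ bytes arriving on repair_intake_listen_port is routed
   to the repair tile (ping-pong traffic), while any other size on that
   port is treated as a shred and routed to the shred tile. */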
49 :
50 : /* fd_net_in_ctx_t contains consumer information for an incoming tango
51 : link. It is used as part of the TX path. */
52 :
53 : typedef struct {
54 : fd_wksp_t * mem;
55 : ulong chunk0;
56 : ulong wmark;
57 : } fd_net_in_ctx_t;
58 :
59 : /* fd_net_out_ctx_t contains publisher information for a link to a
60 : downstream app tile. It is used as part of the RX path. */
61 :
62 : typedef struct {
63 : fd_frag_meta_t * mcache;
64 : ulong * sync;
65 : ulong depth;
66 : ulong seq;
67 : } fd_net_out_ctx_t;
68 :
69 : /* fd_net_flusher_t controls the pacing of XDP sendto calls for flushing
70 : TX batches. In the 'wakeup' XDP mode, no TX occurs unless the net
71 : tile wakes up the kernel periodically using the sendto() syscall.
72 : If sendto() is called too frequently, time is wasted on context
73 : switches. If sendto() is not called often enough, packets are
74 : delayed or dropped. sendto() calls make almost no guarantees about
75 : how many packets are sent out, nor do they indicate when the kernel
76 : finishes a wakeup call (the wakeup is dispatched asynchronously).
77 : The net tile thus uses a combination of flush triggers that were
78 : tuned for best performance. */
79 :
80 : struct fd_net_flusher {
81 :
82 : /* Packets that were enqueued after the last sendto() wakeup are
83 : considered "pending". If there are more than pending_wmark packets
84 : pending, a wakeup is dispatched. Thus, this dispatch trigger is
85 : proportional to packet rate, but does not trigger when I/O is infrequent. */
86 : ulong pending_cnt;
87 : ulong pending_wmark;
88 :
89 : /* Sometimes, packets are not flushed out even after a sendto()
90 : wakeup. This can result in the tail of a burst getting delayed or
91 : overrun. If more than tail_flush_backoff ticks pass since the last
92 : sendto() wakeup and there are still unacknowledged packets in the
93 : TX ring, another wakeup is issued. */
94 : long next_tail_flush_ticks;
95 : long tail_flush_backoff;
96 :
97 : };
98 :
99 : typedef struct fd_net_flusher fd_net_flusher_t;
100 :
101 : FD_PROTOTYPES_BEGIN
102 :
103 : /* fd_net_flusher_inc marks a new packet as enqueued. */
104 :
105 : static inline void
106 : fd_net_flusher_inc( fd_net_flusher_t * flusher,
107 0 : long now ) {
108 0 : flusher->pending_cnt++;
109 0 : long next_flush = now + flusher->tail_flush_backoff;
110 0 : flusher->next_tail_flush_ticks = fd_long_min( flusher->next_tail_flush_ticks, next_flush );
111 0 : }
112 :
113 : /* fd_net_flusher_check returns 1 if a sendto() wakeup should be issued
114 : immediately. now is a recent fd_tickcount() value.
115 : If tx_ring_empty!=0 then the kernel is caught up with the net tile
116 : on the XDP TX ring. (Otherwise, the kernel is behind the net tile.) */
117 :
118 : static inline int
119 : fd_net_flusher_check( fd_net_flusher_t * flusher,
120 : long now,
121 0 : int tx_ring_empty ) {
122 0 : int flush_level = flusher->pending_cnt >= flusher->pending_wmark;
123 0 : int flush_timeout = now >= flusher->next_tail_flush_ticks;
124 0 : int flush = flush_level || flush_timeout;
125 0 : if( !flush ) return 0;
126 0 : if( FD_UNLIKELY( tx_ring_empty ) ) {
127 : /* Flush requested but caught up */
128 0 : flusher->pending_cnt = 0UL;
129 0 : flusher->next_tail_flush_ticks = LONG_MAX;
130 0 : return 0;
131 0 : }
132 0 : return 1;
133 0 : }
134 :
135 : /* fd_net_flusher_wakeup signals a sendto() wakeup was done. now is a
136 : recent fd_tickcount() value. */
137 :
138 : static inline void
139 : fd_net_flusher_wakeup( fd_net_flusher_t * flusher,
140 0 : long now ) {
141 0 : flusher->pending_cnt = 0UL;
142 0 : flusher->next_tail_flush_ticks = now + flusher->tail_flush_backoff;
143 0 : }
144 :
145 : FD_PROTOTYPES_END
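/* Illustrative usage of the flusher API above (a sketch of how the
   routines are wired together later in this file, see after_frag and
   net_tx_periodic_wakeup):

     fd_net_flusher_inc( flusher, fd_tickcount() );       (on TX enqueue)
     ...
     if( fd_net_flusher_check( flusher, now, tx_ring_empty ) ) {
       net_tx_wakeup( ctx, xsk, charge_busy );            (sendto() wakeup)
       fd_net_flusher_wakeup( flusher, now );
     } */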
146 :
147 : /* fd_net_free_ring is a FIFO queue that stores pointers to free XDP TX
148 : frames. */
149 :
150 : struct fd_net_free_ring {
151 : ulong prod;
152 : ulong cons;
153 : ulong depth;
154 : ulong * queue;
155 : };
156 : typedef struct fd_net_free_ring fd_net_free_ring_t;
157 :
158 : typedef struct {
159 : /* An "XSK" is an AF_XDP socket */
160 : uint xsk_cnt;
161 : fd_xsk_t xsk[ 2 ];
162 : int prog_link_fds[ 2 ];
163 :
164 : /* UMEM frame region within dcache */
165 : void * umem_frame0; /* First UMEM frame */
166 : ulong umem_sz; /* Usable UMEM size starting at frame0 */
167 :
168 : /* UMEM chunk region within workspace */
169 : uint umem_chunk0; /* Lowest allowed chunk number */
170 : uint umem_wmark; /* Highest allowed chunk number */
171 :
172 : /* All net tiles are subscribed to the same TX links. (These are
173 : incoming links from app tiles asking the net tile to send out packets.)
174 : The net tiles "take turns" doing TX jobs based on the L3+L4 dst hash.
175 : net_tile_id is the index of the current net tile, net_tile_cnt is the
176 : total number of net tiles. */
177 : uint net_tile_id;
178 : uint net_tile_cnt;
179 :
180 : /* Details pertaining to an inflight send op */
181 : struct {
182 : uint if_idx; /* 0: main interface, 1: loopback */
183 : void * frame;
184 : uchar mac_addrs[12]; /* First 12 bytes of Ethernet header: dst MAC (6 bytes) then src MAC (6 bytes) */
185 : uint src_ip;
186 : } tx_op;
187 :
188 : /* Round-robin index used to cycle through XSKs when servicing I/O */
189 : uint rr_idx;
190 :
191 : /* Ring tracking free packet buffers */
192 : fd_net_free_ring_t free_tx;
193 :
194 : uchar src_mac_addr[6];
195 :
196 : uint default_address;
197 : uint bind_address;
198 : ushort shred_listen_port;
199 : ushort quic_transaction_listen_port;
200 : ushort legacy_transaction_listen_port;
201 : ushort gossip_listen_port;
202 : ushort repair_intake_listen_port;
203 : ushort repair_serve_listen_port;
204 : ushort send_src_port;
205 :
206 : ulong in_cnt;
207 : fd_net_in_ctx_t in[ MAX_NET_INS ];
208 :
209 : fd_net_out_ctx_t quic_out[1];
210 : fd_net_out_ctx_t shred_out[1];
211 : fd_net_out_ctx_t gossip_out[1];
212 : fd_net_out_ctx_t repair_out[1];
213 : fd_net_out_ctx_t send_out[1];
214 :
215 : /* XDP stats refresh timer */
216 : long xdp_stats_interval_ticks;
217 : long next_xdp_stats_refresh;
218 :
219 : /* TX flush timers */
220 : fd_net_flusher_t tx_flusher[2]; /* one per XSK */
221 :
222 : /* Route and neighbor tables */
223 : fd_fib4_t const * fib_local;
224 : fd_fib4_t const * fib_main;
225 : fd_neigh4_hmap_t neigh4[1];
226 : fd_netlink_neigh4_solicit_link_t neigh4_solicit[1];
227 :
228 : struct {
229 : ulong rx_pkt_cnt;
230 : ulong rx_bytes_total;
231 : ulong rx_undersz_cnt;
232 : ulong rx_fill_blocked_cnt;
233 : ulong rx_backp_cnt;
234 : long rx_busy_cnt;
235 : long rx_idle_cnt;
236 :
237 : ulong tx_submit_cnt;
238 : ulong tx_complete_cnt;
239 : ulong tx_bytes_total;
240 : ulong tx_route_fail_cnt;
241 : ulong tx_no_xdp_cnt;
242 : ulong tx_neigh_fail_cnt;
243 : ulong tx_full_fail_cnt;
244 : long tx_busy_cnt;
245 : long tx_idle_cnt;
246 :
247 : ulong xsk_tx_wakeup_cnt;
248 : ulong xsk_rx_wakeup_cnt;
249 : } metrics;
250 : } fd_net_ctx_t;
251 :
252 : FD_FN_CONST static inline ulong
253 0 : scratch_align( void ) {
254 0 : return 4096UL;
255 0 : }
256 :
257 : FD_FN_PURE static inline ulong
258 0 : scratch_footprint( fd_topo_tile_t const * tile ) {
259 0 : ulong l = FD_LAYOUT_INIT;
260 0 : l = FD_LAYOUT_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
261 0 : l = FD_LAYOUT_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );
262 0 : return FD_LAYOUT_FINI( l, scratch_align() );
263 0 : }
264 :
265 : static void
266 0 : metrics_write( fd_net_ctx_t * ctx ) {
267 0 : FD_MCNT_SET( NET, RX_PKT_CNT, ctx->metrics.rx_pkt_cnt );
268 0 : FD_MCNT_SET( NET, RX_BYTES_TOTAL, ctx->metrics.rx_bytes_total );
269 0 : FD_MCNT_SET( NET, RX_UNDERSZ_CNT, ctx->metrics.rx_undersz_cnt );
270 0 : FD_MCNT_SET( NET, RX_FILL_BLOCKED_CNT, ctx->metrics.rx_fill_blocked_cnt );
271 0 : FD_MCNT_SET( NET, RX_BACKPRESSURE_CNT, ctx->metrics.rx_backp_cnt );
272 0 : FD_MGAUGE_SET( NET, RX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.rx_busy_cnt, 0L ) );
273 0 : FD_MGAUGE_SET( NET, RX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.rx_idle_cnt, 0L ) );
274 0 : FD_MGAUGE_SET( NET, TX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.tx_busy_cnt, 0L ) );
275 0 : FD_MGAUGE_SET( NET, TX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.tx_idle_cnt, 0L ) );
276 :
277 0 : FD_MCNT_SET( NET, TX_SUBMIT_CNT, ctx->metrics.tx_submit_cnt );
278 0 : FD_MCNT_SET( NET, TX_COMPLETE_CNT, ctx->metrics.tx_complete_cnt );
279 0 : FD_MCNT_SET( NET, TX_BYTES_TOTAL, ctx->metrics.tx_bytes_total );
280 0 : FD_MCNT_SET( NET, TX_ROUTE_FAIL_CNT, ctx->metrics.tx_route_fail_cnt );
281 0 : FD_MCNT_SET( NET, TX_NEIGHBOR_FAIL_CNT, ctx->metrics.tx_neigh_fail_cnt );
282 0 : FD_MCNT_SET( NET, TX_FULL_FAIL_CNT, ctx->metrics.tx_full_fail_cnt );
283 :
284 0 : FD_MCNT_SET( NET, XSK_TX_WAKEUP_CNT, ctx->metrics.xsk_tx_wakeup_cnt );
285 0 : FD_MCNT_SET( NET, XSK_RX_WAKEUP_CNT, ctx->metrics.xsk_rx_wakeup_cnt );
286 0 : }
287 :
288 : struct xdp_statistics_v0 {
289 : __u64 rx_dropped; /* Dropped for other reasons */
290 : __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
291 : __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
292 : };
293 :
294 : struct xdp_statistics_v1 {
295 : __u64 rx_dropped; /* Dropped for other reasons */
296 : __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
297 : __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
298 : __u64 rx_ring_full; /* Dropped due to rx ring being full */
299 : __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */
300 : __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */
301 : };
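/* The two structs above mirror the kernel's struct xdp_statistics at
   two ABI revisions (the v1 layout appends three fields to v0).
   poll_xdp_statistics below accepts either optlen so that it keeps
   working on older kernels that only report the v0 fields. */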
302 :
303 : static void
304 0 : poll_xdp_statistics( fd_net_ctx_t * ctx ) {
305 0 : struct xdp_statistics_v1 stats = {0};
306 0 : ulong xsk_cnt = ctx->xsk_cnt;
307 0 : for( ulong j=0UL; j<xsk_cnt; j++ ) {
308 0 : struct xdp_statistics_v1 sub_stats;
309 0 : uint optlen = (uint)sizeof(struct xdp_statistics_v1);
310 0 : if( FD_UNLIKELY( -1==getsockopt( ctx->xsk[ j ].xsk_fd, SOL_XDP, XDP_STATISTICS, &sub_stats, &optlen ) ) )
311 0 : FD_LOG_ERR(( "getsockopt(SOL_XDP, XDP_STATISTICS) failed: %s", strerror( errno ) ));
312 0 : if( FD_UNLIKELY( optlen!=sizeof(struct xdp_statistics_v0) &&
313 0 : optlen!=sizeof(struct xdp_statistics_v1) ) ) {
314 0 : FD_LOG_ERR(( "getsockopt(SOL_XDP, XDP_STATISTICS) returned unexpected size %u", optlen ));
315 0 : }
316 0 : stats.rx_dropped += sub_stats.rx_dropped;
317 0 : stats.rx_invalid_descs += sub_stats.rx_invalid_descs;
318 0 : stats.tx_invalid_descs += sub_stats.tx_invalid_descs;
319 0 : stats.rx_ring_full += sub_stats.rx_ring_full;
320 0 : stats.rx_fill_ring_empty_descs += sub_stats.rx_fill_ring_empty_descs;
321 0 : stats.tx_ring_empty_descs += sub_stats.tx_ring_empty_descs;
322 0 : }
323 :
324 0 : FD_MCNT_SET( NET, XDP_RX_DROPPED_OTHER, stats.rx_dropped );
325 0 : FD_MCNT_SET( NET, XDP_RX_INVALID_DESCS, stats.rx_invalid_descs );
326 0 : FD_MCNT_SET( NET, XDP_TX_INVALID_DESCS, stats.tx_invalid_descs );
327 0 : FD_MCNT_SET( NET, XDP_RX_RING_FULL, stats.rx_ring_full );
328 0 : FD_MCNT_SET( NET, XDP_RX_FILL_RING_EMPTY_DESCS, stats.rx_fill_ring_empty_descs );
329 0 : FD_MCNT_SET( NET, XDP_TX_RING_EMPTY_DESCS, stats.tx_ring_empty_descs );
330 0 : }
331 :
332 : /* net_is_fatal_xdp_error returns 1 if the given errno returned by an
333 : XDP API indicates a non-recoverable error code. The net tile should
334 : crash if it sees such an error so the problem does not go undetected.
335 : Otherwise, returns 0. */
336 :
337 : static int
338 0 : net_is_fatal_xdp_error( int err ) {
339 0 : return err==ESOCKTNOSUPPORT || err==EOPNOTSUPP || err==EINVAL ||
340 0 : err==EPERM;
341 0 : }
342 :
343 : /* net_tx_ready returns 1 if the current XSK is ready to submit a TX send
344 : job. If the XSK is blocked for sends, returns 0. Reasons for block
345 : include:
346 : - No XSK TX buffer is available
347 : - XSK TX ring is full */
348 :
349 : static int
350 : net_tx_ready( fd_net_ctx_t * ctx,
351 0 : uint if_idx ) {
352 0 : fd_xsk_t * xsk = &ctx->xsk[ if_idx ];
353 0 : fd_xdp_ring_t * tx_ring = &xsk->ring_tx;
354 0 : fd_net_free_ring_t * free = &ctx->free_tx;
355 0 : if( free->prod == free->cons ) return 0; /* drop */
356 0 : if( tx_ring->prod - tx_ring->cons >= tx_ring->depth ) return 0; /* drop */
357 0 : return 1;
358 0 : }
359 :
360 : /* net_rx_wakeup triggers xsk_recvmsg to run in the kernel. Needs to be
361 : called periodically in order to receive packets. */
362 :
363 : static void
364 : net_rx_wakeup( fd_net_ctx_t * ctx,
365 : fd_xsk_t * xsk,
366 0 : int * charge_busy ) {
367 0 : if( !fd_xsk_rx_need_wakeup( xsk ) ) return;
368 0 : *charge_busy = 1;
369 0 : struct msghdr _ignored[ 1 ] = { 0 };
370 0 : if( FD_UNLIKELY( -1==recvmsg( xsk->xsk_fd, _ignored, MSG_DONTWAIT ) ) ) {
371 0 : if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) {
372 0 : FD_LOG_ERR(( "xsk recvmsg failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
373 0 : }
374 0 : if( FD_UNLIKELY( errno!=EAGAIN ) ) {
375 0 : long ts = fd_log_wallclock();
376 0 : if( ts > xsk->log_suppress_until_ns ) {
377 0 : FD_LOG_WARNING(( "xsk recvmsg failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
378 0 : xsk->log_suppress_until_ns = ts + (long)1e9;
379 0 : }
380 0 : }
381 0 : }
382 0 : ctx->metrics.xsk_rx_wakeup_cnt++;
383 0 : }
384 :
385 : /* net_tx_wakeup triggers xsk_sendmsg to run in the kernel. Needs to be
386 : called periodically in order to transmit packets. */
387 :
388 : static void
389 : net_tx_wakeup( fd_net_ctx_t * ctx,
390 : fd_xsk_t * xsk,
391 0 : int * charge_busy ) {
392 0 : if( !fd_xsk_tx_need_wakeup( xsk ) ) return;
393 0 : if( FD_VOLATILE_CONST( *xsk->ring_tx.prod )==FD_VOLATILE_CONST( *xsk->ring_tx.cons ) ) return;
394 0 : *charge_busy = 1;
395 0 : if( FD_UNLIKELY( -1==sendto( xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) {
396 0 : if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) {
397 0 : FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
398 0 : }
399 0 : if( FD_UNLIKELY( errno!=EAGAIN ) ) {
400 0 : long ts = fd_log_wallclock();
401 0 : if( ts > xsk->log_suppress_until_ns ) {
402 0 : FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
403 0 : xsk->log_suppress_until_ns = ts + (long)1e9;
404 0 : }
405 0 : }
406 0 : }
407 0 : ctx->metrics.xsk_tx_wakeup_cnt++;
408 0 : }
409 :
410 : /* net_tx_periodic_wakeup does a timer based xsk_sendmsg wakeup. */
411 :
412 : static inline int
413 : net_tx_periodic_wakeup( fd_net_ctx_t * ctx,
414 : uint if_idx,
415 : long now,
416 0 : int * charge_busy ) {
417 0 : uint tx_prod = FD_VOLATILE_CONST( *ctx->xsk[ if_idx ].ring_tx.prod );
418 0 : uint tx_cons = FD_VOLATILE_CONST( *ctx->xsk[ if_idx ].ring_tx.cons );
419 0 : int tx_ring_empty = tx_prod==tx_cons;
420 0 : if( fd_net_flusher_check( ctx->tx_flusher+if_idx, now, tx_ring_empty ) ) {
421 0 : net_tx_wakeup( ctx, &ctx->xsk[ if_idx ], charge_busy );
422 0 : fd_net_flusher_wakeup( ctx->tx_flusher+if_idx, now );
423 0 : }
424 0 : return 0;
425 0 : }
426 :
427 : static void
428 0 : during_housekeeping( fd_net_ctx_t * ctx ) {
429 0 : long now = fd_tickcount();
430 :
431 0 : ctx->metrics.rx_busy_cnt = 0UL;
432 0 : ctx->metrics.rx_idle_cnt = 0UL;
433 0 : ctx->metrics.tx_busy_cnt = 0UL;
434 0 : ctx->metrics.tx_idle_cnt = fd_seq_diff( ctx->free_tx.prod, ctx->free_tx.cons );
435 0 : for( uint j=0U; j<ctx->xsk_cnt; j++ ) {
436 0 : fd_xsk_t * xsk = &ctx->xsk[ j ];
437 : /* Refresh all sequence numbers (consumer first, then producer) */
438 0 : FD_COMPILER_MFENCE();
439 0 : xsk->ring_fr.cached_cons = FD_VOLATILE_CONST( *xsk->ring_fr.cons );
440 0 : xsk->ring_fr.cached_prod = FD_VOLATILE_CONST( *xsk->ring_fr.prod );
441 0 : xsk->ring_rx.cached_cons = FD_VOLATILE_CONST( *xsk->ring_rx.cons );
442 0 : xsk->ring_rx.cached_prod = FD_VOLATILE_CONST( *xsk->ring_rx.prod );
443 0 : xsk->ring_tx.cached_cons = FD_VOLATILE_CONST( *xsk->ring_tx.cons );
444 0 : xsk->ring_tx.cached_prod = FD_VOLATILE_CONST( *xsk->ring_tx.prod );
445 0 : xsk->ring_cr.cached_cons = FD_VOLATILE_CONST( *xsk->ring_cr.cons );
446 0 : xsk->ring_cr.cached_prod = FD_VOLATILE_CONST( *xsk->ring_cr.prod );
447 0 : FD_COMPILER_MFENCE();
448 0 : ctx->metrics.rx_busy_cnt += (long)(int)( xsk->ring_rx.cached_prod - xsk->ring_rx.cached_cons );
449 0 : ctx->metrics.rx_idle_cnt += (long)(int)( xsk->ring_fr.cached_prod - xsk->ring_fr.cached_cons );
450 0 : ctx->metrics.tx_busy_cnt += (long)(int)( xsk->ring_tx.cached_prod - xsk->ring_tx.cached_cons );
451 0 : ctx->metrics.tx_busy_cnt += (long)(int)( xsk->ring_cr.cached_prod - xsk->ring_cr.cached_cons );
452 0 : }
453 :
454 0 : if( now > ctx->next_xdp_stats_refresh ) {
455 0 : ctx->next_xdp_stats_refresh = now + ctx->xdp_stats_interval_ticks;
456 0 : poll_xdp_statistics( ctx );
457 0 : }
458 0 : }
459 :
460 : /* net_tx_route resolves the destination interface index, src MAC address,
461 : and dst MAC address. Returns 1 on success, 0 on failure. On success,
462 : tx_op->{if_idx,mac_addrs,src_ip} is set. */
463 :
464 : static int
465 : net_tx_route( fd_net_ctx_t * ctx,
466 0 : uint dst_ip ) {
467 :
468 : /* Route lookup */
469 :
470 0 : fd_fib4_hop_t hop[2] = {0};
471 0 : fd_fib4_lookup( ctx->fib_local, hop+0, dst_ip, 0UL );
472 0 : fd_fib4_lookup( ctx->fib_main, hop+1, dst_ip, 0UL );
473 0 : fd_fib4_hop_t const * next_hop = fd_fib4_hop_or( hop+0, hop+1 );
474 :
475 0 : uint rtype = next_hop->rtype;
476 0 : uint if_idx = next_hop->if_idx;
477 0 : uint ip4_src = next_hop->ip4_src;
478 :
479 0 : if( FD_UNLIKELY( rtype==FD_FIB4_RTYPE_LOCAL ) ) {
480 0 : rtype = FD_FIB4_RTYPE_UNICAST;
481 0 : if_idx = 1;
482 0 : }
483 :
484 0 : if( FD_UNLIKELY( rtype!=FD_FIB4_RTYPE_UNICAST ) ) {
485 0 : ctx->metrics.tx_route_fail_cnt++;
486 0 : return 0;
487 0 : }
488 :
489 0 : ip4_src = fd_uint_if( !!ctx->bind_address, ctx->bind_address, ip4_src );
490 :
491 0 : if( if_idx==1 ) {
492 : /* Set Ethernet src and dst address to 00:00:00:00:00:00 */
493 0 : memset( ctx->tx_op.mac_addrs, 0, 12UL );
494 0 : ctx->tx_op.if_idx = 1;
495 : /* Set preferred src address to 127.0.0.1 if no bind address is set */
496 0 : if( !ip4_src ) ip4_src = FD_IP4_ADDR( 127,0,0,1 );
497 0 : ctx->tx_op.src_ip = ip4_src;
498 0 : return 1;
499 0 : }
500 :
501 0 : if( FD_UNLIKELY( if_idx!=ctx->xsk[ 0 ].if_idx ) ) {
502 0 : ctx->metrics.tx_no_xdp_cnt++;
503 0 : return 0;
504 0 : }
505 0 : ctx->tx_op.if_idx = 0;
506 :
507 : /* Neighbor resolve */
508 :
509 0 : uint neigh_ip = next_hop->ip4_gw;
510 0 : if( !neigh_ip ) neigh_ip = dst_ip;
511 :
512 0 : fd_neigh4_hmap_query_t neigh_query[1];
513 0 : int neigh_res = fd_neigh4_hmap_query_try( ctx->neigh4, &neigh_ip, NULL, neigh_query, 0 );
514 0 : if( FD_UNLIKELY( neigh_res!=FD_MAP_SUCCESS ) ) {
515 : /* Neighbor not found */
516 0 : fd_netlink_neigh4_solicit( ctx->neigh4_solicit, neigh_ip, if_idx, fd_frag_meta_ts_comp( fd_tickcount() ) );
517 0 : ctx->metrics.tx_neigh_fail_cnt++;
518 0 : return 0;
519 0 : }
520 0 : fd_neigh4_entry_t const * neigh = fd_neigh4_hmap_query_ele_const( neigh_query );
521 0 : if( FD_UNLIKELY( neigh->state != FD_NEIGH4_STATE_ACTIVE ) ) {
522 0 : ctx->metrics.tx_neigh_fail_cnt++;
523 0 : return 0;
524 0 : }
525 0 : ip4_src = fd_uint_if( !ip4_src, ctx->default_address, ip4_src );
526 0 : ctx->tx_op.src_ip = ip4_src;
527 0 : memcpy( ctx->tx_op.mac_addrs+0, neigh->mac_addr, 6 );
528 0 : memcpy( ctx->tx_op.mac_addrs+6, ctx->src_mac_addr, 6 );
529 :
530 0 : if( FD_UNLIKELY( fd_neigh4_hmap_query_test( neigh_query ) ) ) {
531 0 : ctx->metrics.tx_neigh_fail_cnt++;
532 0 : return 0;
533 0 : }
534 :
535 0 : return 1;
536 0 : }
537 :
538 : /* before_frag is called when a new metadata descriptor for a TX job is
539 : found. This callback determines whether this net tile is responsible
540 : for the TX job. If so, it prepares the TX op for the during_frag and
541 : after_frag callbacks. */
542 :
543 : static inline int
544 : before_frag( fd_net_ctx_t * ctx,
545 : ulong in_idx,
546 : ulong seq,
547 0 : ulong sig ) {
548 0 : (void)in_idx; (void)seq;
549 :
550 : /* Find interface index of next packet */
551 :
552 0 : ulong proto = fd_disco_netmux_sig_proto( sig );
553 0 : if( FD_UNLIKELY( proto!=DST_PROTO_OUTGOING ) ) return 1;
554 :
555 0 : uint dst_ip = fd_disco_netmux_sig_dst_ip( sig );
556 0 : if( FD_UNLIKELY( !net_tx_route( ctx, dst_ip ) ) ) return 1;
557 :
558 0 : uint net_tile_id = ctx->net_tile_id;
559 0 : uint net_tile_cnt = ctx->net_tile_cnt;
560 0 : uint if_idx = ctx->tx_op.if_idx;
561 0 : if( FD_UNLIKELY( if_idx>=ctx->xsk_cnt ) ) return 1; /* ignore */
562 :
563 : /* Load balance TX */
564 :
565 0 : uint hash = (uint)fd_disco_netmux_sig_hash( sig );
566 0 : uint target_idx = hash % net_tile_cnt;
567 0 : if( if_idx==1 ) target_idx = 0; /* loopback always targets tile 0 */
568 :
569 : /* Skip if another net tile is responsible for this packet */
570 :
571 0 : if( net_tile_id!=target_idx ) return 1; /* ignore */
572 :
573 : /* Skip if TX is blocked */
574 :
575 0 : if( FD_UNLIKELY( !net_tx_ready( ctx, if_idx ) ) ) {
576 0 : ctx->metrics.tx_full_fail_cnt++;
577 0 : return 1;
578 0 : }
579 :
580 : /* Allocate a free TX frame for the outgoing packet */
581 :
582 0 : fd_net_free_ring_t * free = &ctx->free_tx;
583 0 : ulong alloc_seq = free->cons;
584 0 : void * frame = (void *)free->queue[ alloc_seq % free->depth ];
585 0 : free->cons = fd_seq_inc( alloc_seq, 1UL );
586 :
587 0 : ctx->tx_op.if_idx = if_idx;
588 0 : ctx->tx_op.frame = frame;
589 :
590 0 : return 0; /* continue */
591 0 : }
592 :
593 : /* during_frag is called when before_frag has committed to transmit an
594 : outgoing packet. */
595 :
596 : static inline void
597 : during_frag( fd_net_ctx_t * ctx,
598 : ulong in_idx,
599 : ulong seq FD_PARAM_UNUSED,
600 : ulong sig FD_PARAM_UNUSED,
601 : ulong chunk,
602 : ulong sz,
603 0 : ulong ctl FD_PARAM_UNUSED ) {
604 0 : if( FD_UNLIKELY( chunk<ctx->in[ in_idx ].chunk0 || chunk>ctx->in[ in_idx ].wmark || sz>FD_NET_MTU ) )
605 0 : FD_LOG_ERR(( "chunk %lu %lu corrupt, not in range [%lu,%lu]", chunk, sz, ctx->in[ in_idx ].chunk0, ctx->in[ in_idx ].wmark ));
606 :
607 0 : if( FD_UNLIKELY( sz<34UL ) )
608 0 : FD_LOG_ERR(( "packet too small %lu (in_idx=%lu)", sz, in_idx ));
609 :
610 0 : void * frame = ctx->tx_op.frame;
611 0 : if( FD_UNLIKELY( (ulong)frame < (ulong)ctx->umem_frame0 ) )
612 0 : FD_LOG_ERR(( "frame %p out of bounds (below %p)", frame, (void *)ctx->umem_frame0 ));
613 0 : ulong umem_off = (ulong)frame - (ulong)ctx->umem_frame0;
614 0 : if( FD_UNLIKELY( (ulong)umem_off > (ulong)ctx->umem_sz ) )
615 0 : FD_LOG_ERR(( "frame %p out of bounds (beyond %p)", frame, (void *)ctx->umem_sz ));
616 :
617 : /* Speculatively copy frame into XDP buffer */
618 0 : uchar const * src = fd_chunk_to_laddr_const( ctx->in[ in_idx ].mem, chunk );
619 0 : fd_memcpy( ctx->tx_op.frame, src, sz );
620 0 : }
621 :
622 : /* after_frag is called when the during_frag memcpy was _not_ overrun. */
623 :
624 : static void
625 : after_frag( fd_net_ctx_t * ctx,
626 : ulong in_idx,
627 : ulong seq,
628 : ulong sig,
629 : ulong sz,
630 : ulong tsorig,
631 : ulong tspub,
632 0 : fd_stem_context_t * stem ) {
633 0 : (void)in_idx; (void)seq; (void)sig; (void)tsorig; (void)tspub; (void)stem;
634 :
635 : /* Current send operation */
636 :
637 0 : uint if_idx = ctx->tx_op.if_idx;
638 0 : uchar * frame = ctx->tx_op.frame;
639 0 : fd_xsk_t * xsk = &ctx->xsk[ if_idx ];
640 :
641 : /* Select Ethernet addresses */
642 0 : memcpy( frame, ctx->tx_op.mac_addrs, 12 );
643 :
644 : /* Select IPv4 source address */
645 0 : uint ihl = frame[ 14 ] & 0x0f;
646 0 : ushort ethertype = FD_LOAD( ushort, frame+12 );
647 0 : uint ip4_saddr = FD_LOAD( uint, frame+26 );
648 0 : if( ethertype==fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) && ip4_saddr==0 ) {
649 0 : if( FD_UNLIKELY( ctx->tx_op.src_ip==0 ||
650 0 : ihl<5 || (14+(ihl<<2))>sz ) ) {
651 : /* Outgoing IPv4 packet with unknown src IP or invalid IHL */
652 : /* FIXME should select first IPv4 address of device table here */
653 0 : ctx->metrics.tx_route_fail_cnt++;
654 0 : return;
655 0 : }
656 :
657 : /* Recompute checksum after changing header */
658 0 : FD_STORE( uint, frame+26, ctx->tx_op.src_ip );
659 0 : FD_STORE( ushort, frame+24, 0 );
660 0 : FD_STORE( ushort, frame+24, fd_ip4_hdr_check( frame+14 ) );
661 0 : }
662 :
663 : /* Submit packet TX job
664 :
665 : Invariant for ring_tx: prod-cons<depth
666 : (This invariant breaks if any other packet is sent over this ring
667 : between before_frag and this point, e.g. send_arp_probe.) */
668 :
669 0 : fd_xdp_ring_t * tx_ring = &xsk->ring_tx;
670 0 : uint tx_seq = FD_VOLATILE_CONST( *tx_ring->prod );
671 0 : uint tx_mask = tx_ring->depth - 1U;
672 0 : xsk->ring_tx.packet_ring[ tx_seq&tx_mask ] = (struct xdp_desc) {
673 0 : .addr = (ulong)frame - (ulong)ctx->umem_frame0,
674 0 : .len = (uint)sz,
675 0 : .options = 0
676 0 : };
677 :
678 : /* Frame is now owned by kernel. Clear tx_op. */
679 0 : ctx->tx_op.frame = NULL;
680 :
681 : /* Register newly enqueued packet */
682 0 : FD_VOLATILE( *xsk->ring_tx.prod ) = tx_ring->cached_prod = tx_seq+1U;
683 0 : ctx->metrics.tx_submit_cnt++;
684 0 : ctx->metrics.tx_bytes_total += sz;
685 0 : fd_net_flusher_inc( ctx->tx_flusher+if_idx, fd_tickcount() );
686 :
687 0 : }
688 :
689 : /* net_rx_packet is called when a new Ethernet frame is available.
690 : Attempts to copy out the frame to a downstream tile. */
691 :
692 : static void
693 : net_rx_packet( fd_net_ctx_t * ctx,
694 : ulong umem_off,
695 : ulong sz,
696 0 : uint * freed_chunk ) {
697 :
698 0 : ulong umem_lowbits = umem_off & 0x3fUL;
699 :
700 0 : uchar const * packet = (uchar const *)ctx->umem_frame0 + umem_off;
701 0 : uchar const * packet_end = packet + sz;
702 0 : uchar const * iphdr = packet + 14U;
703 :
704 : /* Translate packet to UMEM frame index */
705 0 : ulong chunk = ctx->umem_chunk0 + (umem_off>>FD_CHUNK_LG_SZ);
706 :
707 : /* Filter for UDP/IPv4 packets. Test for ethtype and ipproto in 1
708 : branch */
709 0 : uint test_ethip = ( (uint)packet[12] << 16u ) | ( (uint)packet[13] << 8u ) | (uint)packet[23];
710 0 : if( FD_UNLIKELY( test_ethip!=0x080011 ) ) {
711 0 : FD_LOG_ERR(( "Firedancer received a packet from the XDP program that was either "
712 0 : "not an IPv4 packet, or not a UDP packet. It is likely your XDP program "
713 0 : "is not configured correctly." ));
714 0 : }
715 :
716 : /* IPv4 is variable-length, so lookup IHL to find start of UDP */
717 0 : uint iplen = ( ( (uint)iphdr[0] ) & 0x0FU ) * 4U;
718 0 : uchar const * udp = iphdr + iplen;
719 :
720 : /* Ignore if UDP header is too short */
721 0 : if( FD_UNLIKELY( udp+8U > packet_end ) ) {
722 0 : FD_DTRACE_PROBE( net_tile_err_rx_undersz );
723 0 : ctx->metrics.rx_undersz_cnt++;
724 0 : return;
725 0 : }
726 :
727 : /* Extract IP dest addr and UDP src/dest port */
728 0 : uint ip_srcaddr = *(uint *)( iphdr+12UL );
729 0 : ushort udp_srcport = fd_ushort_bswap( *(ushort *)( udp+0UL ) );
730 0 : ushort udp_dstport = fd_ushort_bswap( *(ushort *)( udp+2UL ) );
731 :
732 0 : FD_DTRACE_PROBE_4( net_tile_pkt_rx, ip_srcaddr, udp_srcport, udp_dstport, sz );
733 :
734 : /* Route packet to downstream tile */
735 0 : ushort proto;
736 0 : fd_net_out_ctx_t * out;
737 0 : if( FD_UNLIKELY( udp_dstport==ctx->shred_listen_port ) ) {
738 0 : proto = DST_PROTO_SHRED;
739 0 : out = ctx->shred_out;
740 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->quic_transaction_listen_port ) ) {
741 0 : proto = DST_PROTO_TPU_QUIC;
742 0 : out = ctx->quic_out;
743 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->legacy_transaction_listen_port ) ) {
744 0 : proto = DST_PROTO_TPU_UDP;
745 0 : out = ctx->quic_out;
746 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->gossip_listen_port ) ) {
747 0 : proto = DST_PROTO_GOSSIP;
748 0 : out = ctx->gossip_out;
749 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->repair_intake_listen_port ) ) {
750 0 : proto = DST_PROTO_REPAIR;
751 0 : if( FD_UNLIKELY( sz == REPAIR_PING_SZ ) ) out = ctx->repair_out; /* ping-pong */
752 0 : else out = ctx->shred_out;
753 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->repair_serve_listen_port ) ) {
754 0 : proto = DST_PROTO_REPAIR;
755 0 : out = ctx->repair_out;
756 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->send_src_port ) ) {
757 0 : proto = DST_PROTO_SEND;
758 0 : out = ctx->send_out;
759 0 : } else {
760 :
761 0 : FD_LOG_ERR(( "Firedancer received a UDP packet on port %hu which was not expected. "
762 0 : "Only the following ports should be configured to forward packets: "
763 0 : "%hu, %hu, %hu, %hu, %hu, %hu (excluding any 0 ports, which can be ignored)."
764 0 : "Please report this error to Firedancer maintainers.",
765 0 : udp_dstport,
766 0 : ctx->shred_listen_port,
767 0 : ctx->quic_transaction_listen_port,
768 0 : ctx->legacy_transaction_listen_port,
769 0 : ctx->gossip_listen_port,
770 0 : ctx->repair_intake_listen_port,
771 0 : ctx->repair_serve_listen_port ));
772 0 : }
773 :
774 : /* tile can decide how to partition based on src ip addr and src port */
775 0 : ulong sig = fd_disco_netmux_sig( ip_srcaddr, udp_srcport, 0U, proto, 14UL+8UL+iplen );
776 :
777 : /* Peek the mline for an old frame */
778 0 : fd_frag_meta_t * mline = out->mcache + fd_mcache_line_idx( out->seq, out->depth );
779 0 : *freed_chunk = mline->chunk;
780 :
781 : /* Overwrite the mline with the new frame */
782 0 : ulong tspub = (ulong)fd_frag_meta_ts_comp( fd_tickcount() );
783 0 : fd_mcache_publish( out->mcache, out->depth, out->seq, sig, chunk, sz, umem_lowbits, 0, tspub );
784 :
785 : /* Wind up for the next iteration */
786 0 : out->seq = fd_seq_inc( out->seq, 1UL );
787 :
788 0 : ctx->metrics.rx_pkt_cnt++;
789 0 : ctx->metrics.rx_bytes_total += sz;
790 :
791 0 : }
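/* Note on RX frame ownership: the packet is handed to the downstream
   tile zero-copy.  The UMEM frame stays in the shared dcache and is
   referenced by the chunk number published to the mcache; it is only
   returned to the kernel (via the fill ring, see net_rx_event) once its
   mcache line is overwritten by a later publish. */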
792 :
793 : /* net_comp_event is called when an XDP TX frame is free again. */
794 :
795 : static void
796 : net_comp_event( fd_net_ctx_t * ctx,
797 : fd_xsk_t * xsk,
798 0 : uint comp_seq ) {
799 :
800 : /* Locate the incoming frame */
801 :
802 0 : fd_xdp_ring_t * comp_ring = &xsk->ring_cr;
803 0 : uint comp_mask = comp_ring->depth - 1U;
804 0 : ulong frame = FD_VOLATILE_CONST( comp_ring->frame_ring[ comp_seq&comp_mask ] );
805 0 : ulong const frame_mask = FD_NET_MTU - 1UL;
806 0 : if( FD_UNLIKELY( frame+FD_NET_MTU > ctx->umem_sz ) ) {
807 0 : FD_LOG_ERR(( "Bounds check failed: frame=0x%lx umem_sz=0x%lx",
808 0 : frame, (ulong)ctx->umem_sz ));
809 0 : }
810 :
811 : /* Check if we have space to return the freed frame */
812 :
813 0 : fd_net_free_ring_t * free = &ctx->free_tx;
814 0 : ulong free_prod = free->prod;
815 0 : ulong free_mask = free->depth - 1UL;
816 0 : long free_cnt = fd_seq_diff( free_prod, free->cons );
817 0 : if( FD_UNLIKELY( free_cnt>=(long)free->depth ) ) return; /* blocked */
818 :
819 0 : free->queue[ free_prod&free_mask ] = (ulong)ctx->umem_frame0 + (frame & (~frame_mask));
820 0 : free->prod = fd_seq_inc( free_prod, 1UL );
821 :
822 : /* Wind up for next iteration */
823 :
824 0 : FD_VOLATILE( *comp_ring->cons ) = comp_ring->cached_cons = comp_seq+1U;
825 :
826 0 : ctx->metrics.tx_complete_cnt++;
827 :
828 0 : }
829 :
830 : /* net_rx_event is called when a new XDP RX frame is available. Calls
831 : net_rx_packet, then returns the packet back to the kernel via the fill
832 : ring. */
833 :
834 : static void
835 : net_rx_event( fd_net_ctx_t * ctx,
836 : fd_xsk_t * xsk,
837 0 : uint rx_seq ) {
838 : /* Locate the incoming frame */
839 :
840 0 : fd_xdp_ring_t * rx_ring = &xsk->ring_rx;
841 0 : uint rx_mask = rx_ring->depth - 1U;
842 0 : struct xdp_desc frame = FD_VOLATILE_CONST( rx_ring->packet_ring[ rx_seq&rx_mask ] );
843 :
844 0 : if( FD_UNLIKELY( frame.len>FD_NET_MTU ) )
845 0 : FD_LOG_ERR(( "received a UDP packet with a too large payload (%u)", frame.len ));
846 :
847 : /* Check if we have space in the fill ring to free the frame */
848 :
849 0 : fd_xdp_ring_t * fill_ring = &xsk->ring_fr;
850 0 : uint fill_depth = fill_ring->depth;
851 0 : uint fill_mask = fill_depth-1U;
852 0 : ulong frame_mask = FD_NET_MTU - 1UL;
853 0 : uint fill_prod = FD_VOLATILE_CONST( *fill_ring->prod );
854 0 : uint fill_cons = FD_VOLATILE_CONST( *fill_ring->cons );
855 :
856 0 : if( FD_UNLIKELY( (int)(fill_prod-fill_cons) >= (int)fill_depth ) ) {
857 0 : ctx->metrics.rx_fill_blocked_cnt++;
858 0 : return; /* blocked */
859 0 : }
860 :
861 : /* Pass it to the receive handler */
862 :
863 0 : uint freed_chunk = UINT_MAX;
864 0 : net_rx_packet( ctx, frame.addr, frame.len, &freed_chunk );
865 :
866 0 : FD_COMPILER_MFENCE();
867 0 : FD_VOLATILE( *rx_ring->cons ) = rx_ring->cached_cons = rx_seq+1U;
868 :
869 : /* If this mcache publish shadowed a previous publish, mark the old
870 : frame as free. */
871 :
872 0 : if( FD_LIKELY( freed_chunk!=UINT_MAX ) ) {
873 0 : if( FD_UNLIKELY( ( freed_chunk < ctx->umem_chunk0 ) |
874 0 : ( freed_chunk > ctx->umem_wmark ) ) ) {
875 0 : FD_LOG_ERR(( "mcache corruption detected: chunk=%u chunk0=%u wmark=%u",
876 0 : freed_chunk, ctx->umem_chunk0, ctx->umem_wmark ));
877 0 : }
878 0 : ulong freed_off = (freed_chunk - ctx->umem_chunk0)<<FD_CHUNK_LG_SZ;
879 0 : fill_ring->frame_ring[ fill_prod&fill_mask ] = freed_off & (~frame_mask);
880 0 : FD_VOLATILE( *fill_ring->prod ) = fill_ring->cached_prod = fill_prod+1U;
881 0 : }
882 :
883 0 : }
884 :
885 : /* before_credit is called every loop iteration. */
886 :
887 : static void
888 : before_credit( fd_net_ctx_t * ctx,
889 : fd_stem_context_t * stem,
890 0 : int * charge_busy ) {
891 0 : (void)stem;
892 : /* A previous send attempt was overrun. A corrupt copy of the packet was
893 : placed into an XDP frame, but the frame was not yet submitted to the
894 : TX ring. Return the tx buffer to the free list. */
895 :
896 0 : if( ctx->tx_op.frame ) {
897 0 : *charge_busy = 1;
898 0 : fd_net_free_ring_t * free = &ctx->free_tx;
899 0 : ulong alloc_seq = free->prod;
900 0 : free->queue[ alloc_seq % free->depth ] = (ulong)ctx->tx_op.frame;
901 0 : free->prod = fd_seq_inc( alloc_seq, 1UL );
902 0 : ctx->tx_op.frame = NULL;
903 0 : }
904 :
905 : /* Check if new packets are available or if TX frames are free again
906 : (Round-robin through sockets) */
907 :
908 0 : uint rr_idx = ctx->rr_idx;
909 0 : fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ];
910 :
911 0 : net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy );
912 :
913 0 : uint rx_cons = rr_xsk->ring_rx.cached_cons;
914 0 : uint rx_prod = FD_VOLATILE_CONST( *rr_xsk->ring_rx.prod );
915 0 : if( rx_cons!=rx_prod ) {
916 0 : *charge_busy = 1;
917 0 : rr_xsk->ring_rx.cached_prod = rx_prod;
918 0 : net_rx_event( ctx, rr_xsk, rx_cons );
919 0 : } else {
920 0 : net_rx_wakeup( ctx, rr_xsk, charge_busy );
921 0 : ctx->rr_idx++;
922 0 : ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx );
923 0 : }
924 :
925 0 : uint comp_cons = FD_VOLATILE_CONST( *rr_xsk->ring_cr.cons );
926 0 : uint comp_prod = FD_VOLATILE_CONST( *rr_xsk->ring_cr.prod );
927 0 : if( comp_cons!=comp_prod ) {
928 0 : *charge_busy = 1;
929 0 : rr_xsk->ring_cr.cached_prod = comp_prod;
930 0 : net_comp_event( ctx, rr_xsk, comp_cons );
931 0 : }
932 :
933 0 : }
934 :
935 : /* net_xsk_bootstrap assigns UMEM frames to the FILL ring. */
936 :
937 : static ulong
938 : net_xsk_bootstrap( fd_net_ctx_t * ctx,
939 : uint xsk_idx,
940 0 : ulong frame_off ) {
941 0 : fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ];
942 :
943 0 : ulong const frame_sz = FD_NET_MTU;
944 0 : ulong const fr_depth = ctx->xsk[ xsk_idx ].ring_fr.depth/2UL;
945 :
946 0 : fd_xdp_ring_t * fill = &xsk->ring_fr;
947 0 : uint fill_prod = fill->cached_prod;
948 0 : for( ulong j=0UL; j<fr_depth; j++ ) {
949 0 : fill->frame_ring[ j ] = frame_off;
950 0 : frame_off += frame_sz;
951 0 : }
952 0 : FD_VOLATILE( *fill->prod ) = fill->cached_prod = fill_prod + (uint)fr_depth;
953 :
954 0 : return frame_off;
955 0 : }
956 :
957 : /* FIXME source MAC address from netlnk tile instead */
958 :
959 : static void
960 : interface_addrs( const char * interface,
961 : uchar * mac,
962 0 : uint * ip4_addr ) {
963 0 : int fd = socket( AF_INET, SOCK_DGRAM, 0 );
964 0 : struct ifreq ifr;
965 0 : ifr.ifr_addr.sa_family = AF_INET;
966 :
967 0 : strncpy( ifr.ifr_name, interface, IFNAMSIZ );
968 0 : if( FD_UNLIKELY( ioctl( fd, SIOCGIFHWADDR, &ifr ) ) )
969 0 : FD_LOG_ERR(( "could not get MAC address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) ));
970 0 : fd_memcpy( mac, ifr.ifr_hwaddr.sa_data, 6 );
971 :
972 0 : if( FD_UNLIKELY( ioctl( fd, SIOCGIFADDR, &ifr ) ) )
973 0 : FD_LOG_ERR(( "could not get IP address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) ));
974 0 : *ip4_addr = ((struct sockaddr_in *)fd_type_pun( &ifr.ifr_addr ))->sin_addr.s_addr;
975 :
976 0 : if( FD_UNLIKELY( close(fd) ) )
977 0 : FD_LOG_ERR(( "could not close socket (%i-%s)", errno, fd_io_strerror( errno ) ));
978 0 : }
979 :
980 : /* privileged_init does the following initialization steps:
981 :
982 : - Create an AF_XDP socket
983 : - Map XDP metadata rings
984 : - Register UMEM data region with socket
985 : - Insert AF_XDP socket into xsk_map
986 :
987 : Net tile 0 also runs fd_xdp_install and repeats the above step for
988 : the loopback device. (Unless the main interface is already loopback)
989 :
990 : Kernel object references:
991 :
992 : BPF_LINK file descriptor
993 : |
994 : +-> XDP program installation on NIC
995 : | |
996 : | +-> XDP program <-- BPF_PROG file descriptor (prog_fd)
997 : |
998 : +-> XSKMAP object <-- BPF_MAP file descriptor (xsk_map) */
999 :
1000 : static void
1001 : privileged_init( fd_topo_t * topo,
1002 0 : fd_topo_tile_t * tile ) {
1003 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1004 :
1005 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1006 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
1007 0 : ulong * free_tx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );
1008 :
1009 0 : fd_memset( ctx, 0, sizeof(fd_net_ctx_t) );
1010 :
1011 0 : uint if_idx = if_nametoindex( tile->xdp.interface );
1012 0 : if( FD_UNLIKELY( !if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed", tile->xdp.interface ));
1013 :
1014 0 : interface_addrs( tile->xdp.interface, ctx->src_mac_addr, &ctx->default_address );
1015 :
1016 : /* Load up dcache containing UMEM */
1017 :
1018 0 : void * const dcache_mem = fd_topo_obj_laddr( topo, tile->net.umem_dcache_obj_id );
1019 0 : void * const umem_dcache = fd_dcache_join( dcache_mem );
1020 0 : ulong const umem_dcache_data_sz = fd_dcache_data_sz( umem_dcache );
1021 0 : ulong const umem_frame_sz = 2048UL;
1022 :
1023 : /* Shrink the UMEM region from the left so that it starts 4096 byte aligned */
1024 :
1025 0 : void * const umem_frame0 = (void *)fd_ulong_align_up( (ulong)umem_dcache, 4096UL );
1026 0 : ulong umem_sz = umem_dcache_data_sz - ((ulong)umem_frame0 - (ulong)umem_dcache);
1027 0 : umem_sz = fd_ulong_align_dn( umem_sz, umem_frame_sz );
1028 :
1029 : /* Derive chunk bounds */
1030 :
1031 0 : void * const umem_base = fd_wksp_containing( dcache_mem );
1032 0 : ulong const umem_chunk0 = ( (ulong)umem_frame0 - (ulong)umem_base )>>FD_CHUNK_LG_SZ;
1033 0 : ulong const umem_wmark = umem_chunk0 + ( ( umem_sz-umem_frame_sz )>>FD_CHUNK_LG_SZ );
1034 0 : if( FD_UNLIKELY( umem_chunk0>UINT_MAX || umem_wmark>UINT_MAX || umem_chunk0>umem_wmark ) ) {
1035 0 : FD_LOG_ERR(( "Calculated invalid UMEM bounds [%lu,%lu]", umem_chunk0, umem_wmark ));
1036 0 : }
1037 :
1038 0 : if( FD_UNLIKELY( !umem_base ) ) FD_LOG_ERR(( "UMEM dcache is not in a workspace" ));
1039 0 : if( FD_UNLIKELY( !umem_dcache ) ) FD_LOG_ERR(( "Failed to join UMEM dcache" ));
1040 :
1041 0 : ctx->umem_frame0 = umem_frame0;
1042 0 : ctx->umem_sz = umem_sz;
1043 0 : ctx->umem_chunk0 = (uint)umem_chunk0;
1044 0 : ctx->umem_wmark = (uint)umem_wmark;
1045 :
1046 0 : ctx->free_tx.queue = free_tx;
1047 0 : ctx->free_tx.depth = tile->xdp.xdp_tx_queue_size;
1048 :
1049 : /* Create and install XSKs */
1050 :
1051 0 : fd_xsk_params_t params0 = {
1052 0 : .if_idx = if_idx,
1053 0 : .if_queue_id = (uint)tile->kind_id,
1054 :
1055 : /* Some kernels produce EOPNOTSUPP errors on sendto calls when
1056 : starting up without either XDP_ZEROCOPY or XDP_COPY
1057 : (e.g. 5.14.0-503.23.1.el9_5 with i40e) */
1058 0 : .bind_flags = tile->xdp.zero_copy ? XDP_ZEROCOPY : XDP_COPY,
1059 :
1060 0 : .fr_depth = tile->xdp.xdp_rx_queue_size*2,
1061 0 : .rx_depth = tile->xdp.xdp_rx_queue_size,
1062 0 : .cr_depth = tile->xdp.xdp_tx_queue_size,
1063 0 : .tx_depth = tile->xdp.xdp_tx_queue_size,
1064 :
1065 0 : .umem_addr = umem_frame0,
1066 0 : .frame_sz = umem_frame_sz,
1067 0 : .umem_sz = umem_sz
1068 0 : };
1069 :
1070 0 : int xsk_map_fd = 123462;
1071 0 : ctx->prog_link_fds[ 0 ] = 123463;
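  /* Note: these fixed descriptor numbers are assumed to have been set
     up by the boot code that installed the XDP program for the main
     interface before this tile started (cf. the FIXME below about
     moving the loopback install into fd_topo_run). */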
1072 : /* Init XSK */
1073 0 : if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 0 ], ¶ms0 ) ) ) FD_LOG_ERR(( "failed to bind xsk for net tile %lu", tile->kind_id ));
1074 0 : if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 0 ], xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate xsk for net tile %lu", tile->kind_id ));
1075 0 : ctx->xsk_cnt = 1;
1076 :
1077 0 : if( FD_UNLIKELY( fd_sandbox_gettid()==fd_sandbox_getpid() ) ) {
1078 : /* Kind of gross.. in single threaded mode we don't want to close the xsk_map_fd
1079 : since it's shared with other net tiles. Just check for that by seeing if we
1080 : are the only thread in the process. */
1081 0 : if( FD_UNLIKELY( -1==close( xsk_map_fd ) ) ) FD_LOG_ERR(( "close(%d) failed (%d-%s)", xsk_map_fd, errno, fd_io_strerror( errno ) ));
1082 0 : }
1083 :
1084 : /* Networking tile at index 0 also binds to loopback (only queue 0 available on lo) */
1085 :
1086 0 : if( FD_UNLIKELY( strcmp( tile->xdp.interface, "lo" ) && !tile->kind_id ) ) {
1087 0 : ctx->xsk_cnt = 2;
1088 :
1089 0 : ushort udp_port_candidates[] = {
1090 0 : (ushort)tile->xdp.net.legacy_transaction_listen_port,
1091 0 : (ushort)tile->xdp.net.quic_transaction_listen_port,
1092 0 : (ushort)tile->xdp.net.shred_listen_port,
1093 0 : (ushort)tile->xdp.net.gossip_listen_port,
1094 0 : (ushort)tile->xdp.net.repair_intake_listen_port,
1095 0 : (ushort)tile->xdp.net.repair_serve_listen_port,
1096 0 : (ushort)tile->xdp.net.send_src_port
1097 0 : };
1098 :
1099 0 : uint lo_idx = if_nametoindex( "lo" );
1100 0 : if( FD_UNLIKELY( !lo_idx ) ) FD_LOG_ERR(( "if_nametoindex(lo) failed" ));
1101 :
1102 : /* FIXME move this to fd_topo_run */
1103 0 : fd_xdp_fds_t lo_fds = fd_xdp_install( lo_idx,
1104 0 : tile->net.bind_address,
1105 0 : sizeof(udp_port_candidates)/sizeof(udp_port_candidates[0]),
1106 0 : udp_port_candidates,
1107 0 : "skb" );
1108 :
1109 0 : ctx->prog_link_fds[ 1 ] = lo_fds.prog_link_fd;
1110 : /* init xsk 1 */
1111 0 : fd_xsk_params_t params1 = params0;
1112 0 : params1.if_idx = lo_idx; /* probably always 1 */
1113 0 : params1.if_queue_id = 0;
1114 0 : params1.bind_flags = 0;
1115 0 : if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 1 ], ¶ms1 ) ) ) FD_LOG_ERR(( "failed to bind lo_xsk" ));
1116 0 : if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 1 ], lo_fds.xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate lo_xsk" ));
1117 0 : if( FD_UNLIKELY( -1==close( lo_fds.xsk_map_fd ) ) ) FD_LOG_ERR(( "close(%d) failed (%d-%s)", lo_fds.xsk_map_fd, errno, fd_io_strerror( errno ) ));
1118 0 : }
1119 :
1120 0 : double tick_per_ns = fd_tempo_tick_per_ns( NULL );
1121 0 : ctx->xdp_stats_interval_ticks = (long)( FD_XDP_STATS_INTERVAL_NS * tick_per_ns );
1122 :
1123 0 : ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL );
1124 0 : if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
1125 0 : FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) ));
1126 0 : }
1127 :
1128 : static void
1129 : unprivileged_init( fd_topo_t * topo,
1130 0 : fd_topo_tile_t * tile ) {
1131 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1132 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1133 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
1134 0 : FD_TEST( ctx->xsk_cnt!=0 );
1135 :
1136 0 : ctx->net_tile_id = (uint)tile->kind_id;
1137 0 : ctx->net_tile_cnt = (uint)fd_topo_tile_name_cnt( topo, tile->name );
1138 :
1139 0 : ctx->bind_address = tile->net.bind_address;
1140 0 : ctx->shred_listen_port = tile->net.shred_listen_port;
1141 0 : ctx->quic_transaction_listen_port = tile->net.quic_transaction_listen_port;
1142 0 : ctx->legacy_transaction_listen_port = tile->net.legacy_transaction_listen_port;
1143 0 : ctx->gossip_listen_port = tile->net.gossip_listen_port;
1144 0 : ctx->repair_intake_listen_port = tile->net.repair_intake_listen_port;
1145 0 : ctx->repair_serve_listen_port = tile->net.repair_serve_listen_port;
1146 0 : ctx->send_src_port = tile->net.send_src_port;
1147 :
1148 : /* Put a bound on chunks we read from the input, to make sure they
1149 : are within the data region of the workspace. */
1150 :
1151 0 : if( FD_UNLIKELY( !tile->in_cnt ) ) FD_LOG_ERR(( "net tile in link cnt is zero" ));
1152 0 : if( FD_UNLIKELY( tile->in_cnt>MAX_NET_INS ) ) FD_LOG_ERR(( "net tile in link cnt %lu exceeds MAX_NET_INS %lu", tile->in_cnt, MAX_NET_INS ));
1153 0 : FD_TEST( tile->in_cnt<=32 );
1154 0 : for( ulong i=0UL; i<tile->in_cnt; i++ ) {
1155 0 : fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ];
1156 0 : if( FD_UNLIKELY( link->mtu!=FD_NET_MTU ) ) FD_LOG_ERR(( "net tile in link does not have a normal MTU" ));
1157 :
1158 0 : ctx->in[ i ].mem = topo->workspaces[ topo->objs[ link->dcache_obj_id ].wksp_id ].wksp;
1159 0 : ctx->in[ i ].chunk0 = fd_dcache_compact_chunk0( ctx->in[ i ].mem, link->dcache );
1160 0 : ctx->in[ i ].wmark = fd_dcache_compact_wmark( ctx->in[ i ].mem, link->dcache, link->mtu );
1161 0 : }
1162 :
1163 0 : for( ulong i = 0; i < tile->out_cnt; i++ ) {
1164 0 : fd_topo_link_t * out_link = &topo->links[ tile->out_link_id[ i ] ];
1165 0 : if( strcmp( out_link->name, "net_quic" ) == 0 ) {
1166 0 : fd_topo_link_t * quic_out = out_link;
1167 0 : ctx->quic_out->mcache = quic_out->mcache;
1168 0 : ctx->quic_out->sync = fd_mcache_seq_laddr( ctx->quic_out->mcache );
1169 0 : ctx->quic_out->depth = fd_mcache_depth( ctx->quic_out->mcache );
1170 0 : ctx->quic_out->seq = fd_mcache_seq_query( ctx->quic_out->sync );
1171 0 : } else if( strcmp( out_link->name, "net_shred" ) == 0 ) {
1172 0 : fd_topo_link_t * shred_out = out_link;
1173 0 : ctx->shred_out->mcache = shred_out->mcache;
1174 0 : ctx->shred_out->sync = fd_mcache_seq_laddr( ctx->shred_out->mcache );
1175 0 : ctx->shred_out->depth = fd_mcache_depth( ctx->shred_out->mcache );
1176 0 : ctx->shred_out->seq = fd_mcache_seq_query( ctx->shred_out->sync );
1177 0 : } else if( strcmp( out_link->name, "net_gossip" ) == 0 ) {
1178 0 : fd_topo_link_t * gossip_out = out_link;
1179 0 : ctx->gossip_out->mcache = gossip_out->mcache;
1180 0 : ctx->gossip_out->sync = fd_mcache_seq_laddr( ctx->gossip_out->mcache );
1181 0 : ctx->gossip_out->depth = fd_mcache_depth( ctx->gossip_out->mcache );
1182 0 : ctx->gossip_out->seq = fd_mcache_seq_query( ctx->gossip_out->sync );
1183 0 : } else if( strcmp( out_link->name, "net_repair" ) == 0 ) {
1184 0 : fd_topo_link_t * repair_out = out_link;
1185 0 : ctx->repair_out->mcache = repair_out->mcache;
1186 0 : ctx->repair_out->sync = fd_mcache_seq_laddr( ctx->repair_out->mcache );
1187 0 : ctx->repair_out->depth = fd_mcache_depth( ctx->repair_out->mcache );
1188 0 : ctx->repair_out->seq = fd_mcache_seq_query( ctx->repair_out->sync );
1189 0 : } else if( strcmp( out_link->name, "net_netlnk" ) == 0 ) {
1190 0 : fd_topo_link_t * netlink_out = out_link;
1191 0 : ctx->neigh4_solicit->mcache = netlink_out->mcache;
1192 0 : ctx->neigh4_solicit->depth = fd_mcache_depth( ctx->neigh4_solicit->mcache );
1193 0 : ctx->neigh4_solicit->seq = fd_mcache_seq_query( fd_mcache_seq_laddr( ctx->neigh4_solicit->mcache ) );
1194 0 : } else if( strcmp( out_link->name, "net_send" ) == 0 ) {
1195 0 : fd_topo_link_t * send_out = out_link;
1196 0 : ctx->send_out->mcache = send_out->mcache;
1197 0 : ctx->send_out->sync = fd_mcache_seq_laddr( ctx->send_out->mcache );
1198 0 : ctx->send_out->depth = fd_mcache_depth( ctx->send_out->mcache );
1199 0 : ctx->send_out->seq = fd_mcache_seq_query( ctx->send_out->sync );
1200 0 : } else {
1201 0 : FD_LOG_ERR(( "unrecognized out link `%s`", out_link->name ));
1202 0 : }
1203 0 : }
1204 :
1205 : /* Check that every listen port we configured has a corresponding out link. */
1206 0 : if( FD_UNLIKELY( ctx->shred_listen_port!=0 && ctx->shred_out->mcache==NULL ) ) {
1207 0 : FD_LOG_ERR(( "shred listen port set but no out link was found" ));
1208 0 : } else if( FD_UNLIKELY( ctx->quic_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) {
1209 0 : FD_LOG_ERR(( "quic transaction listen port set but no out link was found" ));
1210 0 : } else if( FD_UNLIKELY( ctx->legacy_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) {
1211 0 : FD_LOG_ERR(( "legacy transaction listen port set but no out link was found" ));
1212 0 : } else if( FD_UNLIKELY( ctx->gossip_listen_port!=0 && ctx->gossip_out->mcache==NULL ) ) {
1213 0 : FD_LOG_ERR(( "gossip listen port set but no out link was found" ));
1214 0 : } else if( FD_UNLIKELY( ctx->repair_intake_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) {
1215 0 : FD_LOG_ERR(( "repair intake port set but no out link was found" ));
1216 0 : } else if( FD_UNLIKELY( ctx->repair_serve_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) {
1217 0 : FD_LOG_ERR(( "repair serve listen port set but no out link was found" ));
1218 0 : } else if( FD_UNLIKELY( ctx->neigh4_solicit->mcache==NULL ) ) {
1219 0 : FD_LOG_ERR(( "netlink request link not found" ));
1220 0 : } else if( FD_UNLIKELY( ctx->send_src_port!=0 && ctx->send_out->mcache==NULL ) ) {
1221 0 : FD_LOG_ERR(( "send listen port set but no out link was found" ));
1222 0 : }
1223 :
1224 0 : for( uint j=0U; j<2U; j++ ) {
1225 0 : ctx->tx_flusher[ j ].pending_wmark = (ulong)( (double)tile->xdp.xdp_tx_queue_size * 0.7 );
1226 0 : ctx->tx_flusher[ j ].tail_flush_backoff = (long)( (double)tile->xdp.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) );
1227 0 : ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX;
1228 0 : }
1229 :
1230 : /* Join netbase objects */
1231 0 : ctx->fib_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->xdp.fib4_local_obj_id ) );
1232 0 : ctx->fib_main = fd_fib4_join( fd_topo_obj_laddr( topo, tile->xdp.fib4_main_obj_id ) );
1233 0 : if( FD_UNLIKELY( !ctx->fib_local || !ctx->fib_main ) ) FD_LOG_ERR(( "fd_fib4_join failed" ));
1234 0 : if( FD_UNLIKELY( !fd_neigh4_hmap_join(
1235 0 : ctx->neigh4,
1236 0 : fd_topo_obj_laddr( topo, tile->xdp.neigh4_obj_id ),
1237 0 : fd_topo_obj_laddr( topo, tile->xdp.neigh4_ele_obj_id ) ) ) ) {
1238 0 : FD_LOG_ERR(( "fd_neigh4_hmap_join failed" ));
1239 0 : }
1240 :
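  /* The UMEM frame region set up in privileged_init is carved into
     consecutive slices below: the first tx_depth frames back the TX
     free ring, then one frame per mcache line of every RX out link,
     then a further slice per XSK is seeded into its FILL ring
     (net_xsk_bootstrap).  frame_off tracks the running offset and is
     bounds checked against umem_sz at the end. */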
1241 : /* Initialize TX free ring */
1242 :
1243 0 : ulong const frame_sz = 2048UL;
1244 0 : ulong frame_off = 0UL;
1245 0 : ulong const tx_depth = ctx->free_tx.depth;
1246 0 : for( ulong j=0; j<tx_depth; j++ ) {
1247 0 : ctx->free_tx.queue[ j ] = (ulong)ctx->umem_frame0 + frame_off;
1248 0 : frame_off += frame_sz;
1249 0 : }
1250 0 : ctx->free_tx.prod = tx_depth;
1251 :
1252 : /* Initialize RX mcache chunks */
1253 :
1254 0 : for( ulong i=0UL; i<(tile->out_cnt); i++ ) {
1255 0 : fd_topo_link_t * out_link = &topo->links[ tile->out_link_id[ i ] ];
1256 0 : fd_frag_meta_t * mcache = out_link->mcache;
1257 0 : for( ulong j=0UL; j<fd_mcache_depth( mcache ); j++ ) {
1258 0 : mcache[ j ].chunk = (uint)( ctx->umem_chunk0 + (frame_off>>FD_CHUNK_LG_SZ) );
1259 0 : frame_off += frame_sz;
1260 0 : }
1261 0 : }
1262 :
1263 : /* Initialize FILL ring */
1264 :
1265 0 : int _charge_busy = 0;
1266 0 : for( uint j=0U; j<ctx->xsk_cnt; j++ ) {
1267 0 : frame_off = net_xsk_bootstrap( ctx, j, frame_off );
1268 0 : net_rx_wakeup( ctx, &ctx->xsk[ j ], &_charge_busy );
1269 0 : net_tx_wakeup( ctx, &ctx->xsk[ j ], &_charge_busy );
1270 0 : }
1271 :
1272 0 : if( FD_UNLIKELY( frame_off > ctx->umem_sz ) ) {
1273 0 : FD_LOG_ERR(( "UMEM is too small" ));
1274 0 : }
1275 0 : }
1276 :
1277 : static ulong
1278 : populate_allowed_seccomp( fd_topo_t const * topo,
1279 : fd_topo_tile_t const * tile,
1280 : ulong out_cnt,
1281 0 : struct sock_filter * out ) {
1282 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1283 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1284 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) );
1285 :
1286 : /* A bit of a hack: if there is no loopback XSK for this tile, we still need to pass
1287 : two "allow" FD arguments to the net policy, so we just make them both the same. */
1288 0 : int allow_fd2 = ctx->xsk_cnt>1UL ? ctx->xsk[ 1 ].xsk_fd : ctx->xsk[ 0 ].xsk_fd;
1289 0 : FD_TEST( ctx->xsk[ 0 ].xsk_fd >= 0 && allow_fd2 >= 0 );
1290 0 : populate_sock_filter_policy_xdp( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->xsk[ 0 ].xsk_fd, (uint)allow_fd2 );
1291 0 : return sock_filter_policy_xdp_instr_cnt;
1292 0 : }
1293 :
1294 : static ulong
1295 : populate_allowed_fds( fd_topo_t const * topo,
1296 : fd_topo_tile_t const * tile,
1297 : ulong out_fds_cnt,
1298 0 : int * out_fds ) {
1299 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1300 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1301 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) );
1302 :
1303 0 : if( FD_UNLIKELY( out_fds_cnt<6UL ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
1304 :
1305 0 : ulong out_cnt = 0UL;
1306 :
1307 0 : out_fds[ out_cnt++ ] = 2; /* stderr */
1308 0 : if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
1309 0 : out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
1310 :
1311 0 : out_fds[ out_cnt++ ] = ctx->xsk[ 0 ].xsk_fd;
1312 0 : out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 0 ];
1313 0 : if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->xsk[ 1 ].xsk_fd;
1314 0 : if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 1 ];
1315 0 : return out_cnt;
1316 0 : }
1317 :
1318 0 : #define STEM_BURST (1UL)
1319 0 : #define STEM_LAZY ((ulong)30e3) /* 30 us */
1320 :
1321 0 : #define STEM_CALLBACK_CONTEXT_TYPE fd_net_ctx_t
1322 0 : #define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_net_ctx_t)
1323 :
1324 0 : #define STEM_CALLBACK_METRICS_WRITE metrics_write
1325 0 : #define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
1326 0 : #define STEM_CALLBACK_BEFORE_CREDIT before_credit
1327 0 : #define STEM_CALLBACK_BEFORE_FRAG before_frag
1328 0 : #define STEM_CALLBACK_DURING_FRAG during_frag
1329 0 : #define STEM_CALLBACK_AFTER_FRAG after_frag
1330 :
1331 : #include "../../stem/fd_stem.c"
1332 :
1333 : #ifndef FD_TILE_TEST
1334 : fd_topo_run_tile_t fd_tile_net = {
1335 : .name = "net",
1336 : .populate_allowed_seccomp = populate_allowed_seccomp,
1337 : .populate_allowed_fds = populate_allowed_fds,
1338 : .scratch_align = scratch_align,
1339 : .scratch_footprint = scratch_footprint,
1340 : .privileged_init = privileged_init,
1341 : .unprivileged_init = unprivileged_init,
1342 : .run = stem_run,
1343 : };
1344 : #endif
|