Line data Source code
1 : /* The xdp tile translates between AF_XDP and fd_tango
2 : traffic. It is responsible for setting up the XDP and
3 : XSK socket configuration. */
4 :
5 : #include <errno.h>
6 : #include <fcntl.h>
7 : #include <net/if.h>
8 : #include <netinet/in.h>
9 : #include <sys/socket.h> /* MSG_DONTWAIT needed before importing the net seccomp filter */
10 : #include <linux/if_xdp.h>
11 :
12 : #include "../fd_net_common.h"
13 : #include "../../metrics/fd_metrics.h"
14 : #include "../../netlink/fd_netlink_tile.h" /* neigh4_solicit */
15 : #include "../../topo/fd_topo.h"
16 :
17 : #include "../../../waltz/ip/fd_fib4.h"
18 : #include "../../../waltz/neigh/fd_neigh4_map.h"
19 : #include "../../../waltz/mib/fd_netdev_tbl.h"
20 : #include "../../../waltz/mib/fd_dbl_buf.h"
21 : #include "../../../waltz/xdp/fd_xdp_redirect_user.h" /* fd_xsk_activate */
22 : #include "../../../waltz/xdp/fd_xsk.h"
23 : #include "../../../util/log/fd_dtrace.h"
24 : #include "../../../util/net/fd_eth.h"
25 : #include "../../../util/net/fd_ip4.h"
26 : #include "../../../util/net/fd_gre.h"
27 :
28 : #include <unistd.h>
29 : #include <linux/if.h> /* struct ifreq */
30 : #include <sys/ioctl.h>
31 : #include <linux/unistd.h>
32 : #include <linux/if_arp.h>
33 :
34 : #include "generated/fd_xdp_tile_seccomp.h"
35 :
36 : /* MAX_NET_INS controls the max number of TX links that a net tile can
37 : serve. It sizes the per-link consumer array fd_net_ctx_t::in. */
38 :
39 : #define MAX_NET_INS (32UL)
40 :
41 : /* FD_XDP_STATS_INTERVAL_NS controls the XDP stats refresh interval.
42 : This should be lower than the interval at which the metrics tile
43 : collects metrics. Note that the value is a floating-point literal
     (nanoseconds). */
44 :
45 0 : #define FD_XDP_STATS_INTERVAL_NS (11e6) /* 11ms */
46 :
47 : /* XSK_IDX_{MAIN,LO} are the hardcoded XSK indices in ctx->xsk[ ... ].
48 : Only net tile 0 has XSK_IDX_LO, all net tiles have XSK_IDX_MAIN.
     The same index also selects the matching ctx->tx_flusher[ ... ]
     entry (see net_tx_periodic_wakeup). */
49 :
50 420 : #define XSK_IDX_MAIN 0
51 276 : #define XSK_IDX_LO 1
52 :
53 : /* fd_net_in_ctx_t contains consumer information for an incoming tango
54 : link. It is used as part of the TX path. during_frag validates the
     chunk index of each fragment against [chunk0,wmark] and translates
     it to a local address relative to mem. */
55 :
56 : typedef struct {
57 : fd_wksp_t * mem; /* workspace passed to fd_chunk_to_laddr_const */
58 : ulong chunk0; /* lowest chunk index accepted by during_frag */
59 : ulong wmark; /* highest chunk index accepted by during_frag */
60 : } fd_net_in_ctx_t;
61 :
62 : /* fd_net_out_ctx_t contains publisher information for a link to a
63 : downstream app tile. It is used as part of the RX path.
     NOTE(review): the members are not used by the code visible in this
     part of the file; the per-member notes below are assumptions to be
     confirmed against the RX path. */
64 :
65 : typedef struct {
66 : fd_frag_meta_t * mcache; /* presumably the metadata cache published to */
67 : ulong * sync; /* presumably the mcache sequence sync location */
68 : ulong depth; /* presumably the mcache depth */
69 : ulong seq; /* presumably the next sequence number to publish */
70 : } fd_net_out_ctx_t;
71 :
72 : /* fd_net_flusher_t controls the pacing of XDP sendto calls for flushing
73 : TX batches. In the 'wakeup' XDP mode, no TX occurs unless the net
74 : tile wakes up the kernel periodically using the sendto() syscall.
75 : If sendto() is called too frequently, time is wasted on context
76 : switches. If sendto() is called not often enough, packets are
77 : delayed or dropped. sendto() calls make almost no guarantees how
78 : many packets are sent out, nor do they indicate when the kernel
79 : finishes a wakeup call (asynchronously dispatched). The net tile
80 : thus uses a myriad of flush triggers that were tested for best
81 : performance. */
82 :
83 : struct fd_net_flusher {
84 :
85 : /* Packets that were enqueued after the last sendto() wakeup are
86 : considered "pending". If there are at least pending_wmark packets
87 : pending, a wakeup is dispatched. Thus, this dispatch trigger is
88 : proportional to packet rate, but does not trigger if I/O is seldom. */
89 : ulong pending_cnt;
90 : ulong pending_wmark;
91 :
92 : /* Sometimes, packets are not flushed out even after a sendto()
93 : wakeup. This can result in the tail of a burst getting delayed or
94 : overrun. If more than tail_flush_backoff ticks pass since the last
95 : sendto() wakeup and there are still unacknowledged packets in the
96 : TX ring, issue another wakeup. next_tail_flush_ticks is the
     tick count at which that timeout fires; it is set to LONG_MAX
     while the kernel is caught up (see fd_net_flusher_check). */
97 : long next_tail_flush_ticks;
98 : long tail_flush_backoff;
99 :
100 : };
101 :
102 : typedef struct fd_net_flusher fd_net_flusher_t;
103 :
104 : FD_PROTOTYPES_BEGIN
105 :
106 : /* fd_net_flusher_inc marks a new packet as enqueued. */
107 :
108 : static inline void
109 : fd_net_flusher_inc( fd_net_flusher_t * flusher,
110 276 : long now ) {
111 276 : flusher->pending_cnt++;
112 276 : long next_flush = now + flusher->tail_flush_backoff;
113 276 : flusher->next_tail_flush_ticks = fd_long_min( flusher->next_tail_flush_ticks, next_flush );
114 276 : }
115 :
116 : /* fd_net_flusher_check returns 1 if a sendto() wakeup should be issued
117 : immediately. now is a recent fd_tickcount() value.
118 : If tx_ring_empty==0 then the kernel is caught up with the net tile
119 : on the XDP TX ring. (Otherwise, the kernel is behind the net tile) */
120 :
121 : static inline int
122 : fd_net_flusher_check( fd_net_flusher_t * flusher,
123 : long now,
124 402 : int tx_ring_empty ) {
125 402 : int flush_level = flusher->pending_cnt >= flusher->pending_wmark;
126 402 : int flush_timeout = now >= flusher->next_tail_flush_ticks;
127 402 : int flush = flush_level || flush_timeout;
128 402 : if( !flush ) return 0;
129 24 : if( FD_UNLIKELY( tx_ring_empty ) ) {
130 : /* Flush requested but caught up */
131 6 : flusher->pending_cnt = 0UL;
132 6 : flusher->next_tail_flush_ticks = LONG_MAX;
133 6 : return 0;
134 6 : }
135 18 : return 1;
136 24 : }
137 :
138 : /* fd_net_flusher_wakeup signals a sendto() wakeup was done. now is a
139 : recent fd_tickcount() value. */
140 :
141 : static inline void
142 : fd_net_flusher_wakeup( fd_net_flusher_t * flusher,
143 18 : long now ) {
144 18 : flusher->pending_cnt = 0UL;
145 18 : flusher->next_tail_flush_ticks = now + flusher->tail_flush_backoff;
146 18 : }
147 :
148 : FD_PROTOTYPES_END
149 :
150 : /* fd_net_free_ring is a FIFO queue that stores pointers to free XDP TX
151 : frames. The ring is empty when prod==cons (see net_tx_ready).
     before_frag allocates a frame by reading queue[ cons % depth ]
     (the frame address stored as a ulong) and then advancing cons. */
152 :
153 : struct fd_net_free_ring {
154 : ulong prod; /* producer sequence number */
155 : ulong cons; /* consumer sequence number (next frame to allocate) */
156 : ulong depth; /* number of slots in queue */
157 : ulong * queue; /* frame addresses, indexed by seq % depth */
158 : };
159 : typedef struct fd_net_free_ring fd_net_free_ring_t;
160 :
/* fd_net_ctx_t is the state of one net tile: its AF_XDP sockets, the
   UMEM frame region, the in-flight TX operation shared between the
   before_frag/during_frag/after_frag callbacks, routing and neighbor
   table handles, and tile-local metric counters. */
161 : typedef struct {
162 : /* An "XSK" is an AF_XDP socket */
163 : uint xsk_cnt;
164 : fd_xsk_t xsk[ 2 ];
165 : int prog_link_fds[ 2 ];
166 :
167 : /* UMEM frame region within dcache */
168 : void * umem_frame0; /* First UMEM frame */
169 : ulong umem_sz; /* Usable UMEM size starting at frame0 */
170 :
171 : /* UMEM chunk region within workspace */
172 : uint umem_chunk0; /* Lowest allowed chunk number */
173 : uint umem_wmark; /* Highest allowed chunk number */
174 :
175 : /* All net tiles are subscribed to the same TX links. (These are
176 : incoming links from app tiles asking the net tile to send out packets)
177 : The net tiles "take turns" doing TX jobs based on the L3+L4 dst hash.
178 : net_tile_id is the index of the current interface, net_tile_cnt is the
179 : total amount of interfaces. before_frag takes a TX job when
     net_tile_id equals hash % net_tile_cnt (loopback jobs always go to
     tile 0). */
180 : uint net_tile_id;
181 : uint net_tile_cnt;
182 :
183 : /* Details pertaining to an inflight send op */
184 : struct {
185 : uint xsk_idx;
186 : void * frame;
187 : uchar mac_addrs[12]; /* First 12 bytes of Ethernet header */
188 : uint src_ip; /* src_ip in net order */
189 :
190 : uint use_gre; /* The tx packet will be GRE-encapsulated */
191 : uint gre_outer_src_ip; /* For GRE: Outer iphdr's src_ip in net order */
192 : uint gre_outer_dst_ip; /* For GRE: Outer iphdr's dst_ip in net order */
193 : } tx_op;
194 :
195 : /* Round-robin cycle service operations */
196 : uint rr_idx;
197 :
198 : /* Ring tracking free packet buffers */
199 : fd_net_free_ring_t free_tx;
200 :
201 : uchar src_mac_addr[6];
202 :
203 : uint default_address;
204 : uint bind_address;
205 : ushort shred_listen_port;
206 : ushort quic_transaction_listen_port;
207 : ushort legacy_transaction_listen_port;
208 : ushort gossip_listen_port;
209 : ushort repair_intake_listen_port;
210 : ushort repair_serve_listen_port;
211 : ushort send_src_port;
212 :
213 : ulong in_cnt;
214 : fd_net_in_ctx_t in[ MAX_NET_INS ];
215 :
216 : fd_net_out_ctx_t quic_out[1];
217 : fd_net_out_ctx_t shred_out[1];
218 : fd_net_out_ctx_t gossvf_out[1];
219 : fd_net_out_ctx_t repair_out[1];
220 : fd_net_out_ctx_t send_out[1];
221 :
222 : /* XDP stats refresh timer */
223 : long xdp_stats_interval_ticks;
224 : long next_xdp_stats_refresh;
225 :
226 : /* TX flush timers */
227 : fd_net_flusher_t tx_flusher[2]; /* one per XSK */
228 :
229 : /* Route and neighbor tables */
230 : fd_fib4_t const * fib_local;
231 : fd_fib4_t const * fib_main;
232 : fd_neigh4_hmap_t neigh4[1];
233 : fd_netlink_neigh4_solicit_link_t neigh4_solicit[1];
234 :
235 : /* Netdev table */
236 : fd_dbl_buf_t * netdev_dbl_buf; /* remote copy of device table */
237 : uchar * netdev_buf; /* local copy of device table */
238 : ulong netdev_buf_sz;
239 : fd_netdev_tbl_join_t netdev_tbl; /* join to local copy of device table */
240 : int has_gre_interface; /* enable GRE support? */
241 :
     /* Tile-local metric counters. Published by metrics_write. The
        {rx,tx}_{busy,idle}_cnt gauges are recomputed from the ring
        sequence numbers in during_housekeeping. */
242 : struct {
243 : ulong rx_pkt_cnt;
244 : ulong rx_bytes_total;
245 : ulong rx_undersz_cnt;
246 : ulong rx_fill_blocked_cnt;
247 : ulong rx_backp_cnt;
248 : long rx_busy_cnt;
249 : long rx_idle_cnt;
250 :
251 : ulong tx_submit_cnt;
252 : ulong tx_complete_cnt;
253 : ulong tx_bytes_total;
254 : ulong tx_route_fail_cnt;
255 : ulong tx_no_xdp_cnt; /* NOTE(review): incremented on the TX path but not published by metrics_write */
256 : ulong tx_neigh_fail_cnt;
257 : ulong tx_full_fail_cnt;
258 : long tx_busy_cnt;
259 : long tx_idle_cnt;
260 :
261 : ulong xsk_tx_wakeup_cnt;
262 : ulong xsk_rx_wakeup_cnt;
263 :
264 : ulong rx_gre_cnt;
265 : ulong rx_gre_ignored_cnt;
266 : ulong rx_gre_inv_pkt_cnt;
267 : ulong tx_gre_cnt;
268 : ulong tx_gre_route_fail_cnt;
269 : } metrics;
270 : } fd_net_ctx_t;
271 :
272 : FD_FN_CONST static inline ulong
273 18 : scratch_align( void ) {
274 18 : return 4096UL;
275 18 : }
276 :
277 : FD_FN_PURE static inline ulong
278 6 : scratch_footprint( fd_topo_tile_t const * tile ) {
279 6 : ulong l = FD_LAYOUT_INIT;
280 6 : l = FD_LAYOUT_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
281 6 : l = FD_LAYOUT_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );
282 6 : l = FD_LAYOUT_APPEND( l, fd_netdev_tbl_align(), fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ) );
283 6 : return FD_LAYOUT_FINI( l, scratch_align() );
284 6 : }
285 :
286 : static void
287 0 : metrics_write( fd_net_ctx_t * ctx ) {
288 0 : FD_MCNT_SET( NET, RX_PKT_CNT, ctx->metrics.rx_pkt_cnt );
289 0 : FD_MCNT_SET( NET, RX_BYTES_TOTAL, ctx->metrics.rx_bytes_total );
290 0 : FD_MCNT_SET( NET, RX_UNDERSZ_CNT, ctx->metrics.rx_undersz_cnt );
291 0 : FD_MCNT_SET( NET, RX_FILL_BLOCKED_CNT, ctx->metrics.rx_fill_blocked_cnt );
292 0 : FD_MCNT_SET( NET, RX_BACKPRESSURE_CNT, ctx->metrics.rx_backp_cnt );
293 0 : FD_MGAUGE_SET( NET, RX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.rx_busy_cnt, 0L ) );
294 0 : FD_MGAUGE_SET( NET, RX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.rx_idle_cnt, 0L ) );
295 0 : FD_MGAUGE_SET( NET, TX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.tx_busy_cnt, 0L ) );
296 0 : FD_MGAUGE_SET( NET, TX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.tx_idle_cnt, 0L ) );
297 :
298 0 : FD_MCNT_SET( NET, TX_SUBMIT_CNT, ctx->metrics.tx_submit_cnt );
299 0 : FD_MCNT_SET( NET, TX_COMPLETE_CNT, ctx->metrics.tx_complete_cnt );
300 0 : FD_MCNT_SET( NET, TX_BYTES_TOTAL, ctx->metrics.tx_bytes_total );
301 0 : FD_MCNT_SET( NET, TX_ROUTE_FAIL_CNT, ctx->metrics.tx_route_fail_cnt );
302 0 : FD_MCNT_SET( NET, TX_NEIGHBOR_FAIL_CNT, ctx->metrics.tx_neigh_fail_cnt );
303 0 : FD_MCNT_SET( NET, TX_FULL_FAIL_CNT, ctx->metrics.tx_full_fail_cnt );
304 :
305 0 : FD_MCNT_SET( NET, XSK_TX_WAKEUP_CNT, ctx->metrics.xsk_tx_wakeup_cnt );
306 0 : FD_MCNT_SET( NET, XSK_RX_WAKEUP_CNT, ctx->metrics.xsk_rx_wakeup_cnt );
307 :
308 0 : FD_MCNT_SET( NET, RX_GRE_CNT, ctx->metrics.rx_gre_cnt );
309 0 : FD_MCNT_SET( NET, RX_GRE_INVALID_CNT, ctx->metrics.rx_gre_inv_pkt_cnt );
310 0 : FD_MCNT_SET( NET, RX_GRE_IGNORED_CNT, ctx->metrics.rx_gre_ignored_cnt );
311 0 : FD_MCNT_SET( NET, TX_GRE_CNT, ctx->metrics.tx_gre_cnt );
312 0 : FD_MCNT_SET( NET, TX_GRE_ROUTE_FAIL_CNT, ctx->metrics.tx_gre_route_fail_cnt );
313 0 : }
314 :
/* xdp_statistics_v0 and xdp_statistics_v1 are the two result layouts
   that poll_xdp_statistics accepts from
   getsockopt(SOL_XDP, XDP_STATISTICS). v1 starts with the same three
   fields as v0 and appends three more; the layout returned is
   recognized by the option length. */
315 : struct xdp_statistics_v0 {
316 : __u64 rx_dropped; /* Dropped for other reasons */
317 : __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
318 : __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
319 : };
320 :
321 : struct xdp_statistics_v1 {
322 : __u64 rx_dropped; /* Dropped for other reasons */
323 : __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
324 : __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
325 : __u64 rx_ring_full; /* Dropped due to rx ring being full */
326 : __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */
327 : __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */
328 : };
329 :
330 : static void
331 0 : poll_xdp_statistics( fd_net_ctx_t * ctx ) {
332 0 : struct xdp_statistics_v1 stats = {0};
333 0 : ulong xsk_cnt = ctx->xsk_cnt;
334 0 : for( ulong j=0UL; j<xsk_cnt; j++ ) {
335 0 : struct xdp_statistics_v1 sub_stats;
336 0 : uint optlen = (uint)sizeof(struct xdp_statistics_v1);
337 0 : if( FD_UNLIKELY( -1==getsockopt( ctx->xsk[ j ].xsk_fd, SOL_XDP, XDP_STATISTICS, &sub_stats, &optlen ) ) )
338 0 : FD_LOG_ERR(( "getsockopt(SOL_XDP, XDP_STATISTICS) failed: %s", strerror( errno ) ));
339 0 : if( FD_UNLIKELY( optlen!=sizeof(struct xdp_statistics_v0) &&
340 0 : optlen!=sizeof(struct xdp_statistics_v1) ) ) {
341 0 : FD_LOG_ERR(( "getsockopt(SOL_XDP, XDP_STATISTICS) returned unexpected size %u", optlen ));
342 0 : }
343 0 : stats.rx_dropped += sub_stats.rx_dropped;
344 0 : stats.rx_invalid_descs += sub_stats.rx_invalid_descs;
345 0 : stats.tx_invalid_descs += sub_stats.tx_invalid_descs;
346 0 : stats.rx_ring_full += sub_stats.rx_ring_full;
347 0 : stats.rx_fill_ring_empty_descs += sub_stats.rx_fill_ring_empty_descs;
348 0 : stats.tx_ring_empty_descs += sub_stats.tx_ring_empty_descs;
349 0 : }
350 :
351 0 : FD_MCNT_SET( NET, XDP_RX_DROPPED_OTHER, stats.rx_dropped );
352 0 : FD_MCNT_SET( NET, XDP_RX_INVALID_DESCS, stats.rx_invalid_descs );
353 0 : FD_MCNT_SET( NET, XDP_TX_INVALID_DESCS, stats.tx_invalid_descs );
354 0 : FD_MCNT_SET( NET, XDP_RX_RING_FULL, stats.rx_ring_full );
355 0 : FD_MCNT_SET( NET, XDP_RX_FILL_RING_EMPTY_DESCS, stats.rx_fill_ring_empty_descs );
356 0 : FD_MCNT_SET( NET, XDP_TX_RING_EMPTY_DESCS, stats.tx_ring_empty_descs );
357 0 : }
358 :
/* net_is_fatal_xdp_error classifies an errno value returned by an XDP
   API call.  Returns 1 if err is one of the error codes treated as
   non-recoverable (ESOCKTNOSUPPORT, EOPNOTSUPP, EINVAL, EPERM), in
   which case the net tile should crash so the problem does not go
   undetected.  Returns 0 for every other value. */

static int
net_is_fatal_xdp_error( int err ) {
  switch( err ) {
  case ESOCKTNOSUPPORT:
  case EOPNOTSUPP:
  case EINVAL:
  case EPERM:
    return 1;
  default:
    return 0;
  }
}
369 :
370 : /* Load the netdev table to ctx->netdev_buf. Create a join in ctx->netdev_tbl_handle */
371 :
372 : static void
373 372 : net_load_netdev_tbl( fd_net_ctx_t * ctx ) {
374 : /* Copy netdev table from netlink tile. This could fail briefly
375 : during startup if the netlink tile is late to start up. */
376 372 : if( FD_UNLIKELY( !fd_dbl_buf_read( ctx->netdev_dbl_buf, ctx->netdev_buf_sz, ctx->netdev_buf, NULL ) ) ) return;
377 :
378 : /* Join local copy */
379 0 : if( FD_UNLIKELY( !fd_netdev_tbl_join( &ctx->netdev_tbl, ctx->netdev_buf ) ) ) FD_LOG_ERR(("netdev table join failed"));
380 0 : }
381 :
382 : /* Query the netdev table. Return a fd_netdev_t pointer to the net device of the
383 : interface specified by if_idx. Null if the if_idx is invalid */
384 :
385 : static fd_netdev_t *
386 : net_query_netdev_tbl( fd_net_ctx_t * ctx,
387 423 : uint if_idx ) {
388 : /* dev_tbl is one-indexed */
389 423 : if( if_idx>ctx->netdev_tbl.hdr->dev_cnt ) return NULL;
390 420 : return &ctx->netdev_tbl.dev_tbl[ if_idx ];
391 423 : }
392 :
393 : /* Iterates the netdev table and returns 1 if a GRE interface exists, 0 otherwise.
394 : Only called in privileged_init and during_housekeeping */
395 :
396 : static int
397 384 : net_check_gre_interface_exists( fd_net_ctx_t * ctx ) {
398 384 : fd_netdev_t * dev_tbl = ctx->netdev_tbl.dev_tbl;
399 384 : ushort dev_cnt = ctx->netdev_tbl.hdr->dev_cnt;
400 :
401 12858 : for( ushort if_idx = 0; if_idx<dev_cnt; if_idx++ ) {
402 12852 : if( dev_tbl[if_idx].dev_type==ARPHRD_IPGRE ) return 1;
403 12852 : }
404 6 : return 0;
405 384 : }
406 :
407 :
408 : /* net_tx_ready returns 1 if the current XSK is ready to submit a TX send
409 : job. If the XSK is blocked for sends, returns 0. Reasons for block
410 : include:
411 : - No XSK TX buffer is available
412 : - XSK TX ring is full */
413 :
414 : static int
415 : net_tx_ready( fd_net_ctx_t * ctx,
416 276 : uint xsk_idx ) {
417 276 : fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ];
418 276 : fd_xdp_ring_t * tx_ring = &xsk->ring_tx;
419 276 : fd_net_free_ring_t * free = &ctx->free_tx;
420 276 : if( free->prod == free->cons ) return 0; /* drop */
421 276 : if( tx_ring->prod - tx_ring->cons >= tx_ring->depth ) return 0; /* drop */
422 276 : return 1;
423 276 : }
424 :
425 : /* net_rx_wakeup triggers xsk_recvmsg to run in the kernel. Needs to be
426 : called periodically in order to receive packets. */
427 :
428 : static void
429 : net_rx_wakeup( fd_net_ctx_t * ctx,
430 : fd_xsk_t * xsk,
431 3 : int * charge_busy ) {
432 3 : if( !fd_xsk_rx_need_wakeup( xsk ) ) return;
433 0 : *charge_busy = 1;
434 0 : struct msghdr _ignored[ 1 ] = { 0 };
435 0 : if( FD_UNLIKELY( -1==recvmsg( xsk->xsk_fd, _ignored, MSG_DONTWAIT ) ) ) {
436 0 : if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) {
437 0 : FD_LOG_ERR(( "xsk recvmsg failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
438 0 : }
439 0 : if( FD_UNLIKELY( errno!=EAGAIN ) ) {
440 0 : long ts = fd_log_wallclock();
441 0 : if( ts > xsk->log_suppress_until_ns ) {
442 0 : FD_LOG_WARNING(( "xsk recvmsg failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
443 0 : xsk->log_suppress_until_ns = ts + (long)1e9;
444 0 : }
445 0 : }
446 0 : }
447 0 : ctx->metrics.xsk_rx_wakeup_cnt++;
448 0 : }
449 :
450 : /* net_tx_wakeup triggers xsk_sendmsg to run in the kernel. Needs to be
451 : called periodically in order to transmit packets. */
452 :
453 : static void
454 : net_tx_wakeup( fd_net_ctx_t * ctx,
455 : fd_xsk_t * xsk,
456 21 : int * charge_busy ) {
457 21 : if( !fd_xsk_tx_need_wakeup( xsk ) ) return;
458 0 : if( FD_VOLATILE_CONST( *xsk->ring_tx.prod )==FD_VOLATILE_CONST( *xsk->ring_tx.cons ) ) return;
459 0 : *charge_busy = 1;
460 0 : if( FD_UNLIKELY( -1==sendto( xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) {
461 0 : if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) {
462 0 : FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
463 0 : }
464 0 : if( FD_UNLIKELY( errno!=EAGAIN ) ) {
465 0 : long ts = fd_log_wallclock();
466 0 : if( ts > xsk->log_suppress_until_ns ) {
467 0 : FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
468 0 : xsk->log_suppress_until_ns = ts + (long)1e9;
469 0 : }
470 0 : }
471 0 : }
472 0 : ctx->metrics.xsk_tx_wakeup_cnt++;
473 0 : }
474 :
475 : /* net_tx_periodic_wakeup does a timer based xsk_sendmsg wakeup. */
476 :
477 : static inline int
478 : net_tx_periodic_wakeup( fd_net_ctx_t * ctx,
479 : uint xsk_idx,
480 : long now,
481 402 : int * charge_busy ) {
482 402 : uint tx_prod = FD_VOLATILE_CONST( *ctx->xsk[ xsk_idx ].ring_tx.prod );
483 402 : uint tx_cons = FD_VOLATILE_CONST( *ctx->xsk[ xsk_idx ].ring_tx.cons );
484 402 : int tx_ring_empty = tx_prod==tx_cons;
485 402 : if( fd_net_flusher_check( ctx->tx_flusher+xsk_idx, now, tx_ring_empty ) ) {
486 18 : net_tx_wakeup( ctx, &ctx->xsk[ xsk_idx ], charge_busy );
487 18 : fd_net_flusher_wakeup( ctx->tx_flusher+xsk_idx, now );
488 18 : }
489 402 : return 0;
490 402 : }
491 :
492 : static void
493 372 : during_housekeeping( fd_net_ctx_t * ctx ) {
494 372 : long now = fd_tickcount();
495 372 : net_load_netdev_tbl( ctx );
496 372 : ctx->has_gre_interface = net_check_gre_interface_exists( ctx );
497 :
498 372 : ctx->metrics.rx_busy_cnt = 0UL;
499 372 : ctx->metrics.rx_idle_cnt = 0UL;
500 372 : ctx->metrics.tx_busy_cnt = 0UL;
501 372 : ctx->metrics.tx_idle_cnt = fd_seq_diff( ctx->free_tx.prod, ctx->free_tx.cons );
502 744 : for( uint j=0U; j<ctx->xsk_cnt; j++ ) {
503 372 : fd_xsk_t * xsk = &ctx->xsk[ j ];
504 : /* Refresh all sequence numbers (consumer first, then producer) */
505 372 : FD_COMPILER_MFENCE();
506 372 : xsk->ring_fr.cached_cons = FD_VOLATILE_CONST( *xsk->ring_fr.cons );
507 372 : xsk->ring_fr.cached_prod = FD_VOLATILE_CONST( *xsk->ring_fr.prod );
508 372 : xsk->ring_rx.cached_cons = FD_VOLATILE_CONST( *xsk->ring_rx.cons );
509 372 : xsk->ring_rx.cached_prod = FD_VOLATILE_CONST( *xsk->ring_rx.prod );
510 372 : xsk->ring_tx.cached_cons = FD_VOLATILE_CONST( *xsk->ring_tx.cons );
511 372 : xsk->ring_tx.cached_prod = FD_VOLATILE_CONST( *xsk->ring_tx.prod );
512 372 : xsk->ring_cr.cached_cons = FD_VOLATILE_CONST( *xsk->ring_cr.cons );
513 372 : xsk->ring_cr.cached_prod = FD_VOLATILE_CONST( *xsk->ring_cr.prod );
514 372 : FD_COMPILER_MFENCE();
515 372 : ctx->metrics.rx_busy_cnt += (long)(int)( xsk->ring_rx.cached_prod - xsk->ring_rx.cached_cons );
516 372 : ctx->metrics.rx_idle_cnt += (long)(int)( xsk->ring_fr.cached_prod - xsk->ring_fr.cached_cons );
517 372 : ctx->metrics.tx_busy_cnt += (long)(int)( xsk->ring_tx.cached_prod - xsk->ring_tx.cached_cons );
518 372 : ctx->metrics.tx_busy_cnt += (long)(int)( xsk->ring_cr.cached_prod - xsk->ring_cr.cached_cons );
519 372 : }
520 :
521 372 : if( now > ctx->next_xdp_stats_refresh ) {
522 0 : ctx->next_xdp_stats_refresh = now + ctx->xdp_stats_interval_ticks;
523 0 : poll_xdp_statistics( ctx );
524 0 : }
525 372 : }
526 :
527 :
528 : /* net_tx_route resolves the xsk index, src ip address, src MAC address, and
529 : dst MAC address for a packet to dst_ip. Returns 1 on success, 0 on failure.
530 : On success, tx_op->{xsk_idx,src_ip,mac_addrs} is set, and if the dst_ip
531 : belongs to a GRE interface, *is_gre_inf is set to 1 and
532 : tx_op->{gre_outer_src_ip, gre_outer_dst_ip} will be loaded from the netdev
533 : table. *is_gre_inf is set to 0 if dst_ip doesn't belong to a GRE interface.
     In the GRE case, tx_op->xsk_idx is left at UINT_MAX and mac_addrs is
     not written; the caller (before_frag) routes the outer destination
     with a second call. is_gre_inf must not be NULL.
     On failure, one of the tx_route_fail_cnt, tx_no_xdp_cnt or
     tx_neigh_fail_cnt metrics is incremented, except where noted below. */
534 :
535 : static int
536 : net_tx_route( fd_net_ctx_t * ctx,
537 : uint dst_ip,
538 552 : uint * is_gre_inf ) {
539 :
540 : /* Route lookup: query both the local and the main table and let
     fd_fib4_hop_or select between the two results */
541 :
542 552 : fd_fib4_hop_t hop[2] = {0};
543 552 : fd_fib4_lookup( ctx->fib_local, hop+0, dst_ip, 0UL );
544 552 : fd_fib4_lookup( ctx->fib_main, hop+1, dst_ip, 0UL );
545 552 : fd_fib4_hop_t const * next_hop = fd_fib4_hop_or( hop+0, hop+1 );
546 :
547 552 : uint rtype = next_hop->rtype;
548 552 : uint if_idx = next_hop->if_idx;
549 552 : uint ip4_src = next_hop->ip4_src;
550 :
     /* Local routes are handled like unicast routes via interface index 1
        (presumably the loopback device -- TODO confirm) */
551 552 : if( FD_UNLIKELY( rtype==FD_FIB4_RTYPE_LOCAL ) ) {
552 0 : rtype = FD_FIB4_RTYPE_UNICAST;
553 0 : if_idx = 1;
554 0 : }
555 :
556 552 : if( FD_UNLIKELY( rtype!=FD_FIB4_RTYPE_UNICAST ) ) {
557 129 : ctx->metrics.tx_route_fail_cnt++;
558 129 : return 0;
559 129 : }
560 :
561 423 : fd_netdev_t * netdev = net_query_netdev_tbl( ctx, if_idx );
562 423 : if( !netdev ) {
563 3 : ctx->metrics.tx_route_fail_cnt++;
564 3 : return 0;
565 3 : }
566 :
     /* A configured bind address overrides the route's preferred source */
567 420 : ip4_src = fd_uint_if( !!ctx->bind_address, ctx->bind_address, ip4_src );
568 420 : ctx->tx_op.src_ip = ip4_src;
569 420 : ctx->tx_op.xsk_idx = UINT_MAX;
570 :
571 420 : FD_TEST( is_gre_inf );
572 420 : *is_gre_inf = 0;
573 420 : if( netdev->dev_type==ARPHRD_LOOPBACK ) {
574 : /* Set Ethernet src and dst address to 00:00:00:00:00:00 */
575 0 : memset( ctx->tx_op.mac_addrs, 0, 12UL );
576 0 : ctx->tx_op.xsk_idx = XSK_IDX_LO;
577 : /* Set preferred src address to 127.0.0.1 if no bind address is set */
578 0 : if( !ctx->tx_op.src_ip ) ctx->tx_op.src_ip = FD_IP4_ADDR( 127,0,0,1 );
579 0 : return 1;
580 420 : } else if( netdev->dev_type==ARPHRD_IPGRE ) {
581 : /* skip MAC addrs lookup for GRE inner dst ip. The outer src ip is
     only taken from the netdev table if it is set there */
582 144 : if( netdev->gre_src_ip ) ctx->tx_op.gre_outer_src_ip = netdev->gre_src_ip;
583 144 : ctx->tx_op.gre_outer_dst_ip = netdev->gre_dst_ip;
584 144 : *is_gre_inf = 1;
585 144 : return 1;
586 144 : }
587 :
588 276 : if( FD_UNLIKELY( netdev->dev_type!=ARPHRD_ETHER ) ) return 0; // drop (NOTE(review): no metric is incremented on this path)
589 :
     /* Only the interface that the main XSK is bound to can transmit */
590 276 : if( FD_UNLIKELY( if_idx!=ctx->xsk[ XSK_IDX_MAIN ].if_idx ) ) {
591 0 : ctx->metrics.tx_no_xdp_cnt++;
592 0 : return 0;
593 0 : }
594 276 : ctx->tx_op.xsk_idx = XSK_IDX_MAIN;
595 :
596 : /* Neighbor resolve: look up the gateway if the route has one,
     otherwise the destination itself */
597 276 : uint neigh_ip = next_hop->ip4_gw;
598 276 : if( !neigh_ip ) neigh_ip = dst_ip;
599 :
600 276 : fd_neigh4_hmap_query_t neigh_query[1];
601 276 : int neigh_res = fd_neigh4_hmap_query_try( ctx->neigh4, &neigh_ip, NULL, neigh_query, 0 );
602 276 : if( FD_UNLIKELY( neigh_res!=FD_MAP_SUCCESS ) ) {
603 : /* Neighbor not found: ask the netlink tile to solicit it */
604 0 : fd_netlink_neigh4_solicit( ctx->neigh4_solicit, neigh_ip, if_idx, fd_frag_meta_ts_comp( fd_tickcount() ) );
605 0 : ctx->metrics.tx_neigh_fail_cnt++;
606 0 : return 0;
607 0 : }
608 276 : fd_neigh4_entry_t const * neigh = fd_neigh4_hmap_query_ele_const( neigh_query );
609 276 : if( FD_UNLIKELY( neigh->state != FD_NEIGH4_STATE_ACTIVE ) ) {
610 0 : ctx->metrics.tx_neigh_fail_cnt++;
611 0 : return 0;
612 0 : }
     /* Fall back to the default address if neither the bind address nor
        the route provided a source address */
613 276 : ip4_src = fd_uint_if( !ip4_src, ctx->default_address, ip4_src );
614 276 : ctx->tx_op.src_ip = ip4_src;
     /* mac_addrs layout: dst MAC (neighbor) first, then src MAC (device) */
615 276 : memcpy( ctx->tx_op.mac_addrs+0, neigh->mac_addr, 6 );
616 276 : memcpy( ctx->tx_op.mac_addrs+6, netdev->mac_addr, 6 );
617 :
     /* The neighbor entry was read before this test; a nonzero result
        fails the route (presumably the entry changed concurrently --
        TODO confirm against the neighbor map API) */
618 276 : if( FD_UNLIKELY( fd_neigh4_hmap_query_test( neigh_query ) ) ) {
619 0 : ctx->metrics.tx_neigh_fail_cnt++;
620 0 : return 0;
621 0 : }
622 :
623 276 : return 1;
624 276 : }
625 :
626 : /* before_frag is called when a new metadata descriptor for a TX job is
627 : found. This callback determines whether this net tile is responsible
628 : for the TX job. If so, it prepares the TX op for the during_frag and
629 : after_frag callbacks.
     Returns 0 to continue with this fragment and 1 to ignore it (not an
     outgoing packet, owned by another net tile, routing failed, or TX
     is blocked). When 0 is returned, a free TX frame has been taken
     from ctx->free_tx and stored in ctx->tx_op.frame. */
630 :
631 : static inline int
632 : before_frag( fd_net_ctx_t * ctx,
633 : ulong in_idx,
634 : ulong seq,
635 402 : ulong sig ) {
636 402 : (void)in_idx; (void)seq;
637 :
638 : /* Find interface index of next packet */
639 402 : ulong proto = fd_disco_netmux_sig_proto( sig );
640 402 : if( FD_UNLIKELY( proto!=DST_PROTO_OUTGOING ) ) return 1;
641 :
642 : /* Load balance TX */
643 402 : uint net_tile_cnt = ctx->net_tile_cnt;
644 402 : uint hash = (uint)fd_disco_netmux_sig_hash( sig );
645 402 : uint target_idx = hash % net_tile_cnt;
646 402 : uint net_tile_id = ctx->net_tile_id;
647 402 : uint dst_ip = fd_disco_netmux_sig_ip( sig );
648 :
649 : /* Skip if another net tile is responsible for this packet.
650 : Fast path for net tiles other than net_tile 0. (Tile 0 has to
     route first, because loopback packets always target tile 0.) */
651 :
652 402 : if( net_tile_id!=0 && net_tile_id!=target_idx ) return 1; /* ignore */
653 :
654 :
     /* Reset the GRE state of the TX op before routing */
655 402 : ctx->tx_op.use_gre = 0;
656 402 : ctx->tx_op.gre_outer_dst_ip = 0;
657 402 : ctx->tx_op.gre_outer_src_ip = 0;
658 402 : uint is_gre_inf = 0;
659 :
660 402 : if( FD_UNLIKELY( !net_tx_route( ctx, dst_ip, &is_gre_inf ) ) ) {
661 126 : return 1; /* metrics incremented by net_tx_route */
662 126 : }
663 :
664 276 : uint xsk_idx = ctx->tx_op.xsk_idx;
665 :
666 276 : if( is_gre_inf ) {
     /* The first route call resolved the inner src ip. Save it, then
        route the outer (tunnel) destination to get the MAC addresses. */
667 144 : uint inner_src_ip = ctx->tx_op.src_ip;
668 144 : if( FD_UNLIKELY( !inner_src_ip ) ) {
669 0 : ctx->metrics.tx_gre_route_fail_cnt++;
670 0 : return 1;
671 0 : }
672 : /* Find the MAC addrs for the eth hdr, and src ip for outer ip4 hdr if not found in netdev tbl */
673 144 : ctx->tx_op.src_ip = 0;
674 144 : is_gre_inf = 0;
675 144 : if( FD_UNLIKELY( !net_tx_route( ctx, ctx->tx_op.gre_outer_dst_ip, &is_gre_inf ) ) ) {
676 0 : ctx->metrics.tx_gre_route_fail_cnt++;
677 0 : return 1;
678 0 : }
679 144 : if( is_gre_inf ) {
680 : /* Only one layer of tunnelling supported */
681 0 : ctx->metrics.tx_gre_route_fail_cnt++;
682 0 : return 1;
683 0 : }
684 144 : if( !ctx->tx_op.gre_outer_src_ip ) {
685 72 : ctx->tx_op.gre_outer_src_ip = ctx->tx_op.src_ip;
686 72 : }
687 144 : ctx->tx_op.use_gre = 1; /* indicate to during_frag to use GRE header */
688 144 : ctx->tx_op.src_ip = inner_src_ip;
689 144 : xsk_idx = XSK_IDX_MAIN;
690 144 : }
691 :
692 276 : if( FD_UNLIKELY( xsk_idx>=ctx->xsk_cnt ) ) {
693 : /* Packet does not route to an XDP interface */
694 0 : ctx->metrics.tx_no_xdp_cnt++;
695 0 : return 1;
696 0 : }
697 :
698 276 : if( xsk_idx==XSK_IDX_LO ) target_idx = 0; /* loopback always targets tile 0 */
699 :
700 : /* Skip if another net tile is responsible for this packet */
701 :
702 276 : if( net_tile_id!=target_idx ) return 1; /* ignore */
703 :
704 : /* Skip if TX is blocked */
705 :
706 276 : if( FD_UNLIKELY( !net_tx_ready( ctx, xsk_idx ) ) ) {
707 0 : ctx->metrics.tx_full_fail_cnt++;
708 0 : return 1;
709 0 : }
710 :
711 : /* Allocate a TX frame: pop the next entry off the free frame ring */
712 :
713 276 : fd_net_free_ring_t * free = &ctx->free_tx;
714 276 : ulong alloc_seq = free->cons;
715 276 : void * frame = (void *)free->queue[ alloc_seq % free->depth ];
716 276 : free->cons = fd_seq_inc( alloc_seq, 1UL );
717 :
718 276 : ctx->tx_op.frame = frame;
719 :
720 276 : return 0; /* continue */
721 276 : }
722 :
723 : /* during_frag is called when before_frag has committed to transmit an
724 : outgoing packet. */
725 :
726 : static inline void
727 : during_frag( fd_net_ctx_t * ctx,
728 : ulong in_idx,
729 : ulong seq FD_PARAM_UNUSED,
730 : ulong sig FD_PARAM_UNUSED,
731 : ulong chunk,
732 : ulong sz,
733 276 : ulong ctl FD_PARAM_UNUSED ) {
734 276 : if( FD_UNLIKELY( chunk<ctx->in[ in_idx ].chunk0 || chunk>ctx->in[ in_idx ].wmark || sz>FD_NET_MTU ) )
735 0 : FD_LOG_ERR(( "chunk %lu %lu corrupt, not in range [%lu,%lu]", chunk, sz, ctx->in[ in_idx ].chunk0, ctx->in[ in_idx ].wmark ));
736 :
737 276 : if( FD_UNLIKELY( sz<( sizeof(fd_eth_hdr_t)+sizeof(fd_ip4_hdr_t) ) ) )
738 0 : FD_LOG_ERR(( "packet too small %lu (in_idx=%lu)", sz, in_idx ));
739 :
740 276 : if( FD_UNLIKELY( sz>FD_ETH_PAYLOAD_MAX ) )
741 0 : FD_LOG_ERR(( "packet too big %lu (in_idx=%lu)", sz, in_idx ));
742 :
743 276 : void * frame = ctx->tx_op.frame;
744 276 : if( FD_UNLIKELY( (ulong)frame < (ulong)ctx->umem_frame0 ) )
745 0 : FD_LOG_ERR(( "frame %p out of bounds (below %p)", frame, (void *)ctx->umem_frame0 ));
746 276 : ulong umem_off = (ulong)frame - (ulong)ctx->umem_frame0;
747 276 : if( FD_UNLIKELY( (ulong)umem_off > (ulong)ctx->umem_sz ) )
748 0 : FD_LOG_ERR(( "frame %p out of bounds (beyond %p)", frame, (void *)ctx->umem_sz ));
749 :
750 : /* Speculatively copy frame into XDP buffer */
751 276 : uchar const * src = fd_chunk_to_laddr_const( ctx->in[ in_idx ].mem, chunk );
752 :
753 276 : if( ctx->tx_op.use_gre ) {
754 : /* Discard the ethernet hdr from src. Copy the rest to where the inner ip4_hdr is.
755 : Safe from overflow: FD_ETH_PAYLOAD_MAX + header overhead < frame size (2048UL) */
756 144 : ulong overhead = sizeof(fd_eth_hdr_t) + sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t);
757 144 : fd_memcpy( (void *)( (ulong)ctx->tx_op.frame + overhead ), src + sizeof(fd_eth_hdr_t), sz - sizeof(fd_eth_hdr_t) );
758 144 : } else {
759 132 : fd_memcpy( ctx->tx_op.frame, src, sz );
760 132 : }
761 276 : }
762 :
763 : /* after_frag is called when the during_frag memcpy was _not_ overrun. */
764 :
765 : static void
766 : after_frag( fd_net_ctx_t * ctx,
767 : ulong in_idx,
768 : ulong seq,
769 : ulong sig,
770 : ulong sz,
771 : ulong tsorig,
772 : ulong tspub,
773 276 : fd_stem_context_t * stem ) {
774 276 : (void)in_idx; (void)seq; (void)sig; (void)tsorig; (void)tspub; (void)stem;
775 :
776 : /* Current send operation */
777 :
778 276 : uchar * frame = ctx->tx_op.frame;
779 276 : uint xsk_idx = ctx->tx_op.xsk_idx;
780 :
781 : /* Select Ethernet addresses */
782 276 : memcpy( frame, ctx->tx_op.mac_addrs, 12 );
783 :
784 276 : uchar * iphdr = frame + sizeof(fd_eth_hdr_t);
785 :
786 276 : if( ctx->tx_op.use_gre ) {
787 :
788 : /* For GRE packets, the ethertype will always be FD_ETH_HDR_TYPE_IP. outer source ip can't be 0 */
789 144 : if( FD_UNLIKELY( ctx->tx_op.gre_outer_src_ip==0 ) ) {
790 0 : ctx->metrics.tx_gre_route_fail_cnt++;
791 0 : return;
792 0 : }
793 :
794 : /* Write the last two bytes for eth_hdr */
795 144 : FD_STORE( ushort, frame+12, fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) );
796 :
797 144 : uchar * outer_iphdr = frame + sizeof(fd_eth_hdr_t);
798 144 : uchar * gre_hdr = outer_iphdr + sizeof(fd_ip4_hdr_t);
799 144 : uchar * inner_iphdr = gre_hdr + sizeof(fd_gre_hdr_t);
800 :
801 : /* outer hdr + gre hdr + inner net_tot_len */
802 144 : ushort outer_net_tot_len = (ushort)( sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t) + fd_ushort_bswap( ( (fd_ip4_hdr_t *)inner_iphdr )->net_tot_len ) );
803 :
804 : /* Construct outer ip header */
805 144 : fd_ip4_hdr_t ip4_outer = (fd_ip4_hdr_t) {
806 144 : .verihl = FD_IP4_VERIHL( 4,5 ),
807 144 : .tos = 0,
808 144 : .net_tot_len = fd_ushort_bswap( outer_net_tot_len ),
809 144 : .net_id = 0,
810 144 : .net_frag_off = fd_ushort_bswap( FD_IP4_HDR_FRAG_OFF_DF ),
811 144 : .ttl = 64,
812 144 : .protocol = FD_IP4_HDR_PROTOCOL_GRE,
813 144 : .check = 0,
814 144 : .saddr = ctx->tx_op.gre_outer_src_ip,
815 144 : .daddr = ctx->tx_op.gre_outer_dst_ip,
816 144 : };
817 144 : ip4_outer.check = fd_ip4_hdr_check_fast( &ip4_outer );
818 144 : FD_STORE( fd_ip4_hdr_t, outer_iphdr, ip4_outer );
819 :
820 : /* Construct gre header */
821 144 : fd_gre_hdr_t gre_hdr_ = {
822 144 : .flags_version = FD_GRE_HDR_FLG_VER_BASIC,
823 144 : .protocol = fd_ushort_bswap( FD_ETH_HDR_TYPE_IP )
824 144 : };
825 144 : FD_STORE( fd_gre_hdr_t, gre_hdr, gre_hdr_ );
826 :
827 144 : iphdr = inner_iphdr;
828 144 : sz = sizeof(fd_eth_hdr_t) + outer_net_tot_len;
829 144 : xsk_idx = 0;
830 144 : }
831 :
832 : /* Construct (inner) ip header */
833 276 : uint ihl = FD_IP4_GET_LEN( *(fd_ip4_hdr_t *)iphdr );
834 276 : uint ver = FD_IP4_GET_VERSION( *(fd_ip4_hdr_t *)iphdr );
835 276 : uint ip4_saddr = FD_LOAD( uint, iphdr+12 );
836 276 : ushort ethertype = FD_LOAD( ushort, frame+12 );
837 276 : if( ethertype==fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) && ver!=0x4 ) {
838 0 : ctx->metrics.tx_route_fail_cnt++; // Not an IPv4 packet. drop
839 0 : return;
840 0 : }
841 :
842 276 : if( ethertype==fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) && ip4_saddr==0 ) {
843 276 : if( FD_UNLIKELY( ctx->tx_op.src_ip==0 ||
844 276 : ihl<sizeof(fd_ip4_hdr_t) ||
845 276 : (sizeof(fd_eth_hdr_t)+ihl)>sz ) ) {
846 : /* Outgoing IPv4 packet with unknown src IP or invalid IHL */
847 : /* FIXME should select first IPv4 address of device table here */
848 0 : ctx->metrics.tx_route_fail_cnt++;
849 0 : return;
850 0 : }
851 : /* Recompute checksum after changing header */
852 276 : FD_STORE( uint, iphdr+12, ctx->tx_op.src_ip );
853 276 : FD_STORE( ushort, iphdr+10, 0 );
854 276 : FD_STORE( ushort, iphdr+10, fd_ip4_hdr_check( iphdr ) );
855 276 : }
856 :
857 : /* Submit packet TX job
858 :
859 : Invariant for ring_tx: prod-cons<length
860 : (This invariant breaks if any other packet is sent over this ring
861 : between before_frag and this point, e.g. send_arp_probe.) */
862 :
863 :
864 276 : fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ];
865 276 : fd_xdp_ring_t * tx_ring = &xsk->ring_tx;
866 276 : uint tx_seq = FD_VOLATILE_CONST( *tx_ring->prod );
867 276 : uint tx_mask = tx_ring->depth - 1U;
868 276 : xsk->ring_tx.packet_ring[ tx_seq&tx_mask ] = (struct xdp_desc) {
869 276 : .addr = (ulong)frame - (ulong)ctx->umem_frame0,
870 276 : .len = (uint)sz,
871 276 : .options = 0
872 276 : };
873 :
874 : /* Frame is now owned by kernel. Clear tx_op. */
875 276 : ctx->tx_op.frame = NULL;
876 :
877 : /* Register newly enqueued packet */
878 276 : FD_VOLATILE( *xsk->ring_tx.prod ) = tx_ring->cached_prod = tx_seq+1U;
879 276 : ctx->metrics.tx_submit_cnt++;
880 276 : ctx->metrics.tx_bytes_total += sz;
881 276 : if( ctx->tx_op.use_gre ) ctx->metrics.tx_gre_cnt++;
882 276 : fd_net_flusher_inc( ctx->tx_flusher+xsk_idx, fd_tickcount() );
883 :
884 276 : }
885 :
886 : /* net_rx_packet is called when a new Ethernet frame is available.
887 : Attempts to copy out the frame to a downstream tile. */
888 :
889 : static void
890 : net_rx_packet( fd_net_ctx_t * ctx,
891 : ulong umem_off,
892 : ulong sz,
893 402 : uint * freed_chunk ) {
894 :
895 402 : if( FD_UNLIKELY( sz<sizeof(fd_eth_hdr_t)+sizeof(fd_ip4_hdr_t)+sizeof(fd_udp_hdr_t) ) ) {
896 0 : FD_DTRACE_PROBE( net_tile_err_rx_undersz );
897 0 : ctx->metrics.rx_undersz_cnt++;
898 0 : return;
899 0 : }
900 :
901 402 : uchar * packet = (uchar *)ctx->umem_frame0 + umem_off;
902 402 : uchar const * packet_end = packet + sz;
903 402 : fd_ip4_hdr_t * iphdr = (fd_ip4_hdr_t *)(packet + sizeof(fd_eth_hdr_t));
904 :
905 402 : if( FD_UNLIKELY( ((fd_eth_hdr_t *)packet)->net_type!=fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) ) ) return;
906 :
907 402 : int is_packet_gre = 0;
908 : /* Discard the GRE overhead (outer iphdr and gre hdr) */
909 402 : if( iphdr->protocol == FD_IP4_HDR_PROTOCOL_GRE ) {
910 204 : if( FD_UNLIKELY( ctx->has_gre_interface==0 ) ) {
911 0 : ctx->metrics.rx_gre_ignored_cnt++; // drop. No gre interface in netdev table
912 0 : return;
913 0 : }
914 204 : if( FD_UNLIKELY( FD_IP4_GET_VERSION( *iphdr )!=0x4 ) ) {
915 0 : ctx->metrics.rx_gre_inv_pkt_cnt++; // drop. IP version!=IPv4
916 0 : return;
917 0 : }
918 :
919 204 : ulong overhead = FD_IP4_GET_LEN( *iphdr ) + sizeof(fd_gre_hdr_t);
920 :
921 204 : if( FD_UNLIKELY( (uchar *)iphdr+overhead+sizeof(fd_ip4_hdr_t)>packet_end ) ) {
922 0 : FD_DTRACE_PROBE( net_tile_err_rx_undersz );
923 0 : ctx->metrics.rx_undersz_cnt++; // inner ip4 header invalid
924 0 : return;
925 0 : }
926 :
927 : /* The new iphdr is where the inner iphdr was. Copy over the eth_hdr */
928 204 : iphdr = (fd_ip4_hdr_t *)((uchar *)iphdr + overhead);
929 204 : uchar * new_packet = (uchar *)iphdr - sizeof(fd_eth_hdr_t);
930 204 : fd_memcpy( new_packet, packet, sizeof(fd_eth_hdr_t) );
931 204 : sz -= overhead;
932 204 : packet = new_packet;
933 204 : umem_off = (ulong)( packet - (uchar *)ctx->umem_frame0 );
934 204 : is_packet_gre = 1;
935 204 : }
936 :
937 : /* Translate packet to UMEM frame index */
938 402 : ulong chunk = ctx->umem_chunk0 + (umem_off>>FD_CHUNK_LG_SZ);
939 402 : ulong ctl = umem_off & 0x3fUL;
940 :
941 : /* Filter for UDP/IPv4 packets. */
942 402 : if( FD_UNLIKELY( ( FD_IP4_GET_VERSION( *iphdr )!=0x4 ) ||
943 402 : ( iphdr->protocol!=FD_IP4_HDR_PROTOCOL_UDP ) ) ) return;
944 :
945 : /* IPv4 is variable-length, so lookup IHL to find start of UDP */
946 402 : uint iplen = FD_IP4_GET_LEN( *iphdr );
947 402 : uchar const * udp = (uchar *)iphdr + iplen;
948 :
949 402 : if( FD_UNLIKELY( udp+sizeof(fd_udp_hdr_t) > packet_end ) ) {
950 0 : FD_DTRACE_PROBE( net_tile_err_rx_undersz );
951 0 : ctx->metrics.rx_undersz_cnt++;
952 0 : return;
953 0 : }
954 :
955 : /* Extract IP dest addr and UDP src/dest port */
956 402 : fd_udp_hdr_t * udp_hdr = (fd_udp_hdr_t *)udp;
957 402 : uint ip_srcaddr = iphdr->saddr;
958 402 : ushort udp_srcport = fd_ushort_bswap( udp_hdr->net_sport );
959 402 : ushort udp_dstport = fd_ushort_bswap( udp_hdr->net_dport );
960 :
961 402 : FD_DTRACE_PROBE_4( net_tile_pkt_rx, ip_srcaddr, udp_srcport, udp_dstport, sz );
962 :
963 : /* Route packet to downstream tile */
964 402 : ushort proto;
965 402 : fd_net_out_ctx_t * out;
966 402 : if( FD_UNLIKELY( udp_dstport==ctx->shred_listen_port ) ) {
967 18 : proto = DST_PROTO_SHRED;
968 18 : out = ctx->shred_out;
969 384 : } else if( FD_UNLIKELY( udp_dstport==ctx->quic_transaction_listen_port ) ) {
970 384 : proto = DST_PROTO_TPU_QUIC;
971 384 : out = ctx->quic_out;
972 384 : } else if( FD_UNLIKELY( udp_dstport==ctx->legacy_transaction_listen_port ) ) {
973 0 : proto = DST_PROTO_TPU_UDP;
974 0 : out = ctx->quic_out;
975 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->gossip_listen_port ) ) {
976 0 : proto = DST_PROTO_GOSSIP;
977 0 : out = ctx->gossvf_out;
978 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->repair_intake_listen_port ) ) {
979 0 : proto = DST_PROTO_REPAIR;
980 0 : if( FD_UNLIKELY( sz == REPAIR_PING_SZ ) ) out = ctx->repair_out; /* ping-pong */
981 0 : else out = ctx->shred_out;
982 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->repair_serve_listen_port ) ) {
983 0 : proto = DST_PROTO_REPAIR;
984 0 : out = ctx->repair_out;
985 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->send_src_port ) ) {
986 0 : proto = DST_PROTO_SEND;
987 0 : out = ctx->send_out;
988 0 : } else {
989 :
990 0 : FD_LOG_ERR(( "Firedancer received a UDP packet on port %hu which was not expected. "
991 0 : "Only the following ports should be configured to forward packets: "
992 0 : "%hu, %hu, %hu, %hu, %hu, %hu (excluding any 0 ports, which can be ignored)."
993 0 : "Please report this error to Firedancer maintainers.",
994 0 : udp_dstport,
995 0 : ctx->shred_listen_port,
996 0 : ctx->quic_transaction_listen_port,
997 0 : ctx->legacy_transaction_listen_port,
998 0 : ctx->gossip_listen_port,
999 0 : ctx->repair_intake_listen_port,
1000 0 : ctx->repair_serve_listen_port ));
1001 0 : }
1002 :
1003 : /* tile can decide how to partition based on src ip addr and src port */
1004 402 : ulong sig = fd_disco_netmux_sig( ip_srcaddr, udp_srcport, ip_srcaddr, proto, 14UL+8UL+iplen );
1005 :
1006 : /* Peek the mline for an old frame */
1007 402 : fd_frag_meta_t * mline = out->mcache + fd_mcache_line_idx( out->seq, out->depth );
1008 402 : *freed_chunk = mline->chunk;
1009 :
1010 : /* Overwrite the mline with the new frame */
1011 402 : ulong tspub = (ulong)fd_frag_meta_ts_comp( fd_tickcount() );
1012 402 : fd_mcache_publish( out->mcache, out->depth, out->seq, sig, chunk, sz, ctl, 0, tspub );
1013 :
1014 : /* Wind up for the next iteration */
1015 402 : out->seq = fd_seq_inc( out->seq, 1UL );
1016 :
1017 402 : if( is_packet_gre ) ctx->metrics.rx_gre_cnt++;
1018 402 : ctx->metrics.rx_pkt_cnt++;
1019 402 : ctx->metrics.rx_bytes_total += sz;
1020 402 : }
1021 :
1022 : /* net_comp_event is called when an XDP TX frame is free again. */
1023 :
1024 : static void
1025 : net_comp_event( fd_net_ctx_t * ctx,
1026 : fd_xsk_t * xsk,
1027 0 : uint comp_seq ) {
1028 :
1029 : /* Locate the incoming frame */
1030 :
1031 0 : fd_xdp_ring_t * comp_ring = &xsk->ring_cr;
1032 0 : uint comp_mask = comp_ring->depth - 1U;
1033 0 : ulong frame = FD_VOLATILE_CONST( comp_ring->frame_ring[ comp_seq&comp_mask ] );
1034 0 : ulong const frame_mask = FD_NET_MTU - 1UL;
1035 0 : if( FD_UNLIKELY( frame+FD_NET_MTU > ctx->umem_sz ) ) {
1036 0 : FD_LOG_ERR(( "Bounds check failed: frame=0x%lx umem_sz=0x%lx",
1037 0 : frame, (ulong)ctx->umem_sz ));
1038 0 : }
1039 :
1040 : /* Check if we have space to return the freed frame */
1041 :
1042 0 : fd_net_free_ring_t * free = &ctx->free_tx;
1043 0 : ulong free_prod = free->prod;
1044 0 : ulong free_mask = free->depth - 1UL;
1045 0 : long free_cnt = fd_seq_diff( free_prod, free->cons );
1046 0 : if( FD_UNLIKELY( free_cnt>=(long)free->depth ) ) return; /* blocked */
1047 :
1048 0 : free->queue[ free_prod&free_mask ] = (ulong)ctx->umem_frame0 + (frame & (~frame_mask));
1049 0 : free->prod = fd_seq_inc( free_prod, 1UL );
1050 :
1051 : /* Wind up for next iteration */
1052 :
1053 0 : FD_VOLATILE( *comp_ring->cons ) = comp_ring->cached_cons = comp_seq+1U;
1054 :
1055 0 : ctx->metrics.tx_complete_cnt++;
1056 :
1057 0 : }
1058 :
1059 : /* net_rx_event is called when a new XDP RX frame is available. Calls
1060 : net_rx_packet, then returns the packet back to the kernel via the fill
1061 : ring. */
1062 :
1063 : static void
1064 : net_rx_event( fd_net_ctx_t * ctx,
1065 : fd_xsk_t * xsk,
1066 402 : uint rx_seq ) {
1067 : /* Locate the incoming frame */
1068 :
1069 402 : fd_xdp_ring_t * rx_ring = &xsk->ring_rx;
1070 402 : uint rx_mask = rx_ring->depth - 1U;
1071 402 : struct xdp_desc frame = FD_VOLATILE_CONST( rx_ring->packet_ring[ rx_seq&rx_mask ] );
1072 :
1073 402 : if( FD_UNLIKELY( frame.len>FD_NET_MTU ) )
1074 0 : FD_LOG_ERR(( "received a UDP packet with a too large payload (%u)", frame.len ));
1075 :
1076 : /* Check if we have space in the fill ring to free the frame */
1077 :
1078 402 : fd_xdp_ring_t * fill_ring = &xsk->ring_fr;
1079 402 : uint fill_depth = fill_ring->depth;
1080 402 : uint fill_mask = fill_depth-1U;
1081 402 : ulong frame_mask = FD_NET_MTU - 1UL;
1082 402 : uint fill_prod = FD_VOLATILE_CONST( *fill_ring->prod );
1083 402 : uint fill_cons = FD_VOLATILE_CONST( *fill_ring->cons );
1084 :
1085 402 : if( FD_UNLIKELY( (int)(fill_prod-fill_cons) >= (int)fill_depth ) ) {
1086 0 : ctx->metrics.rx_fill_blocked_cnt++;
1087 0 : return; /* blocked */
1088 0 : }
1089 :
1090 : /* Pass it to the receive handler */
1091 :
1092 402 : uint freed_chunk = (uint)( ctx->umem_chunk0 + (frame.addr>>FD_CHUNK_LG_SZ) );
1093 402 : net_rx_packet( ctx, frame.addr, frame.len, &freed_chunk );
1094 :
1095 402 : FD_COMPILER_MFENCE();
1096 402 : FD_VOLATILE( *rx_ring->cons ) = rx_ring->cached_cons = rx_seq+1U;
1097 :
1098 : /* Every RX operation returns one frame to the FILL ring. If the
1099 : packet was forwarded to a downstream ring, the newly shadowed frame
1100 : is returned. Otherwise, the frame just received is returned. */
1101 :
1102 402 : if( FD_UNLIKELY( ( freed_chunk < ctx->umem_chunk0 ) |
1103 402 : ( freed_chunk > ctx->umem_wmark ) ) ) {
1104 0 : FD_LOG_CRIT(( "mcache corruption detected: chunk=%u chunk0=%u wmark=%u",
1105 0 : freed_chunk, ctx->umem_chunk0, ctx->umem_wmark ));
1106 0 : }
1107 402 : ulong freed_off = (freed_chunk - ctx->umem_chunk0)<<FD_CHUNK_LG_SZ;
1108 402 : fill_ring->frame_ring[ fill_prod&fill_mask ] = freed_off & (~frame_mask);
1109 402 : FD_VOLATILE( *fill_ring->prod ) = fill_ring->cached_prod = fill_prod+1U;
1110 :
1111 402 : }
1112 :
1113 : /* before_credit is called every loop iteration. */
1114 :
1115 : static void
1116 : before_credit( fd_net_ctx_t * ctx,
1117 : fd_stem_context_t * stem,
1118 402 : int * charge_busy ) {
1119 402 : (void)stem;
1120 : /* A previous send attempt was overrun. A corrupt copy of the packet was
1121 : placed into an XDP frame, but the frame was not yet submitted to the
1122 : TX ring. Return the tx buffer to the free list. */
1123 :
1124 402 : if( ctx->tx_op.frame ) {
1125 0 : *charge_busy = 1;
1126 0 : fd_net_free_ring_t * free = &ctx->free_tx;
1127 0 : ulong alloc_seq = free->prod;
1128 0 : free->queue[ alloc_seq % free->depth ] = (ulong)ctx->tx_op.frame;
1129 0 : free->prod = fd_seq_inc( alloc_seq, 1UL );
1130 0 : ctx->tx_op.frame = NULL;
1131 0 : }
1132 :
1133 : /* Check if new packets are available or if TX frames are free again
1134 : (Round-robin through sockets) */
1135 :
1136 402 : uint rr_idx = ctx->rr_idx;
1137 402 : fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ];
1138 :
1139 402 : net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy );
1140 :
1141 402 : uint rx_cons = rr_xsk->ring_rx.cached_cons;
1142 402 : uint rx_prod = FD_VOLATILE_CONST( *rr_xsk->ring_rx.prod );
1143 402 : if( rx_cons!=rx_prod ) {
1144 402 : *charge_busy = 1;
1145 402 : rr_xsk->ring_rx.cached_prod = rx_prod;
1146 402 : net_rx_event( ctx, rr_xsk, rx_cons );
1147 402 : } else {
1148 0 : net_rx_wakeup( ctx, rr_xsk, charge_busy );
1149 0 : ctx->rr_idx++;
1150 0 : ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx );
1151 0 : }
1152 :
1153 402 : uint comp_cons = FD_VOLATILE_CONST( *rr_xsk->ring_cr.cons );
1154 402 : uint comp_prod = FD_VOLATILE_CONST( *rr_xsk->ring_cr.prod );
1155 402 : if( comp_cons!=comp_prod ) {
1156 0 : *charge_busy = 1;
1157 0 : rr_xsk->ring_cr.cached_prod = comp_prod;
1158 0 : net_comp_event( ctx, rr_xsk, comp_cons );
1159 0 : }
1160 :
1161 402 : }
1162 :
1163 : /* net_xsk_bootstrap assigns UMEM frames to the FILL ring. */
1164 :
1165 : static ulong
1166 : net_xsk_bootstrap( fd_net_ctx_t * ctx,
1167 : uint xsk_idx,
1168 3 : ulong frame_off ) {
1169 3 : fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ];
1170 :
1171 3 : ulong const frame_sz = FD_NET_MTU;
1172 3 : ulong const fr_depth = ctx->xsk[ xsk_idx ].ring_fr.depth/2UL;
1173 :
1174 3 : fd_xdp_ring_t * fill = &xsk->ring_fr;
1175 3 : uint fill_prod = fill->cached_prod;
1176 98307 : for( ulong j=0UL; j<fr_depth; j++ ) {
1177 98304 : fill->frame_ring[ j ] = frame_off;
1178 98304 : frame_off += frame_sz;
1179 98304 : }
1180 3 : FD_VOLATILE( *fill->prod ) = fill->cached_prod = fill_prod + (uint)fr_depth;
1181 :
1182 3 : return frame_off;
1183 3 : }
1184 :
1185 : /* FIXME source MAC address from netlnk tile instead */
1186 :
1187 : static void
1188 : interface_addrs( const char * interface,
1189 : uchar * mac,
1190 0 : uint * ip4_addr ) {
1191 0 : int fd = socket( AF_INET, SOCK_DGRAM, 0 );
1192 0 : struct ifreq ifr;
1193 0 : ifr.ifr_addr.sa_family = AF_INET;
1194 :
1195 0 : strncpy( ifr.ifr_name, interface, IFNAMSIZ );
1196 0 : if( FD_UNLIKELY( ioctl( fd, SIOCGIFHWADDR, &ifr ) ) )
1197 0 : FD_LOG_ERR(( "could not get MAC address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) ));
1198 0 : fd_memcpy( mac, ifr.ifr_hwaddr.sa_data, 6 );
1199 :
1200 0 : if( FD_UNLIKELY( ioctl( fd, SIOCGIFADDR, &ifr ) ) )
1201 0 : FD_LOG_ERR(( "could not get IP address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) ));
1202 0 : *ip4_addr = ((struct sockaddr_in *)fd_type_pun( &ifr.ifr_addr ))->sin_addr.s_addr;
1203 :
1204 0 : if( FD_UNLIKELY( close(fd) ) )
1205 0 : FD_LOG_ERR(( "could not close socket (%i-%s)", errno, fd_io_strerror( errno ) ));
1206 0 : }
1207 :
1208 : /* privileged_init does the following initialization steps:
1209 :
1210 : - Create an AF_XDP socket
1211 : - Map XDP metadata rings
1212 : - Register UMEM data region with socket
1213 : - Insert AF_XDP socket into xsk_map
1214 :
1215 : Net tile 0 also runs fd_xdp_install and repeats the above step for
1216 : the loopback device. (Unless the main interface is already loopback)
1217 :
1218 : Kernel object references:
1219 :
1220 : BPF_LINK file descriptor
1221 : |
1222 : +-> XDP program installation on NIC
1223 : | |
1224 : | +-> XDP program <-- BPF_PROG file descriptor (prog_fd)
1225 : |
1226 : +-> XSKMAP object <-- BPF_MAP file descriptor (xsk_map) */
1227 :
1228 : FD_FN_UNUSED static void
1229 : privileged_init( fd_topo_t * topo,
1230 0 : fd_topo_tile_t * tile ) {
1231 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1232 :
1233 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1234 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
1235 0 : ulong * free_tx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );;
1236 :
1237 0 : fd_memset( ctx, 0, sizeof(fd_net_ctx_t) );
1238 :
1239 0 : uint if_idx = if_nametoindex( tile->xdp.interface );
1240 0 : if( FD_UNLIKELY( !if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed", tile->xdp.interface ));
1241 :
1242 0 : interface_addrs( tile->xdp.interface, ctx->src_mac_addr, &ctx->default_address );
1243 :
1244 : /* Load up dcache containing UMEM */
1245 :
1246 0 : void * const dcache_mem = fd_topo_obj_laddr( topo, tile->net.umem_dcache_obj_id );
1247 0 : void * const umem_dcache = fd_dcache_join( dcache_mem );
1248 0 : ulong const umem_dcache_data_sz = fd_dcache_data_sz( umem_dcache );
1249 0 : ulong const umem_frame_sz = 2048UL;
1250 :
1251 : /* Left shrink UMEM region to be 4096 byte aligned */
1252 :
1253 0 : void * const umem_frame0 = (void *)fd_ulong_align_up( (ulong)umem_dcache, 4096UL );
1254 0 : ulong umem_sz = umem_dcache_data_sz - ((ulong)umem_frame0 - (ulong)umem_dcache);
1255 0 : umem_sz = fd_ulong_align_dn( umem_sz, umem_frame_sz );
1256 :
1257 : /* Derive chunk bounds */
1258 :
1259 0 : void * const umem_base = fd_wksp_containing( dcache_mem );
1260 0 : ulong const umem_chunk0 = ( (ulong)umem_frame0 - (ulong)umem_base )>>FD_CHUNK_LG_SZ;
1261 0 : ulong const umem_wmark = umem_chunk0 + ( ( umem_sz-umem_frame_sz )>>FD_CHUNK_LG_SZ );
1262 :
1263 0 : if( FD_UNLIKELY( umem_chunk0>UINT_MAX || umem_wmark>UINT_MAX || umem_chunk0>umem_wmark ) ) {
1264 0 : FD_LOG_ERR(( "Calculated invalid UMEM bounds [%lu,%lu]", umem_chunk0, umem_wmark ));
1265 0 : }
1266 :
1267 0 : if( FD_UNLIKELY( !umem_base ) ) FD_LOG_ERR(( "UMEM dcache is not in a workspace" ));
1268 0 : if( FD_UNLIKELY( !umem_dcache ) ) FD_LOG_ERR(( "Failed to join UMEM dcache" ));
1269 :
1270 0 : ctx->umem_frame0 = umem_frame0;
1271 0 : ctx->umem_sz = umem_sz;
1272 0 : ctx->umem_chunk0 = (uint)umem_chunk0;
1273 0 : ctx->umem_wmark = (uint)umem_wmark;
1274 :
1275 0 : ctx->free_tx.queue = free_tx;
1276 0 : ctx->free_tx.depth = tile->xdp.xdp_tx_queue_size;
1277 :
1278 : /* Create and install XSKs */
1279 :
1280 0 : fd_xsk_params_t params0 = {
1281 0 : .if_idx = if_idx,
1282 0 : .if_queue_id = (uint)tile->kind_id,
1283 :
1284 : /* Some kernels produce EOPNOTSUP errors on sendto calls when
1285 : starting up without either XDP_ZEROCOPY or XDP_COPY
1286 : (e.g. 5.14.0-503.23.1.el9_5 with i40e) */
1287 0 : .bind_flags = tile->xdp.zero_copy ? XDP_ZEROCOPY : XDP_COPY,
1288 :
1289 0 : .fr_depth = tile->xdp.xdp_rx_queue_size*2,
1290 0 : .rx_depth = tile->xdp.xdp_rx_queue_size,
1291 0 : .cr_depth = tile->xdp.xdp_tx_queue_size,
1292 0 : .tx_depth = tile->xdp.xdp_tx_queue_size,
1293 :
1294 0 : .umem_addr = umem_frame0,
1295 0 : .frame_sz = umem_frame_sz,
1296 0 : .umem_sz = umem_sz
1297 0 : };
1298 :
1299 0 : int xsk_map_fd = 123462;
1300 0 : ctx->prog_link_fds[ 0 ] = 123463;
1301 : /* Init XSK */
1302 0 : if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 0 ], ¶ms0 ) ) ) FD_LOG_ERR(( "failed to bind xsk for net tile %lu", tile->kind_id ));
1303 0 : if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 0 ], xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate xsk for net tile %lu", tile->kind_id ));
1304 0 : ctx->xsk_cnt = 1;
1305 :
1306 0 : if( FD_UNLIKELY( fd_sandbox_gettid()==fd_sandbox_getpid() ) ) {
1307 : /* Kind of gross.. in single threaded mode we don't want to close the xsk_map_fd
1308 : since it's shared with other net tiles. Just check for that by seeing if we
1309 : are the only thread in the process. */
1310 0 : if( FD_UNLIKELY( -1==close( xsk_map_fd ) ) ) FD_LOG_ERR(( "close(%d) failed (%d-%s)", xsk_map_fd, errno, fd_io_strerror( errno ) ));
1311 0 : }
1312 :
1313 : /* Networking tile at index 0 also binds to loopback (only queue 0 available on lo) */
1314 :
1315 0 : if( FD_UNLIKELY( strcmp( tile->xdp.interface, "lo" ) && !tile->kind_id ) ) {
1316 0 : ctx->xsk_cnt = 2;
1317 :
1318 0 : ushort udp_port_candidates[] = {
1319 0 : (ushort)tile->xdp.net.legacy_transaction_listen_port,
1320 0 : (ushort)tile->xdp.net.quic_transaction_listen_port,
1321 0 : (ushort)tile->xdp.net.shred_listen_port,
1322 0 : (ushort)tile->xdp.net.gossip_listen_port,
1323 0 : (ushort)tile->xdp.net.repair_intake_listen_port,
1324 0 : (ushort)tile->xdp.net.repair_serve_listen_port,
1325 0 : (ushort)tile->xdp.net.send_src_port
1326 0 : };
1327 :
1328 0 : uint lo_idx = if_nametoindex( "lo" );
1329 0 : if( FD_UNLIKELY( !lo_idx ) ) FD_LOG_ERR(( "if_nametoindex(lo) failed" ));
1330 :
1331 : /* FIXME move this to fd_topo_run */
1332 0 : fd_xdp_fds_t lo_fds = fd_xdp_install( lo_idx,
1333 0 : tile->net.bind_address,
1334 0 : sizeof(udp_port_candidates)/sizeof(udp_port_candidates[0]),
1335 0 : udp_port_candidates,
1336 0 : "skb" );
1337 :
1338 0 : ctx->prog_link_fds[ 1 ] = lo_fds.prog_link_fd;
1339 : /* init xsk 1 */
1340 0 : fd_xsk_params_t params1 = params0;
1341 0 : params1.if_idx = lo_idx; /* probably always 1 */
1342 0 : params1.if_queue_id = 0;
1343 0 : params1.bind_flags = 0;
1344 0 : if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 1 ], ¶ms1 ) ) ) FD_LOG_ERR(( "failed to bind lo_xsk" ));
1345 0 : if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 1 ], lo_fds.xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate lo_xsk" ));
1346 0 : if( FD_UNLIKELY( -1==close( lo_fds.xsk_map_fd ) ) ) FD_LOG_ERR(( "close(%d) failed (%d-%s)", xsk_map_fd, errno, fd_io_strerror( errno ) ));
1347 0 : }
1348 :
1349 0 : double tick_per_ns = fd_tempo_tick_per_ns( NULL );
1350 0 : ctx->xdp_stats_interval_ticks = (long)( FD_XDP_STATS_INTERVAL_NS * tick_per_ns );
1351 :
1352 0 : ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL );
1353 0 : if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
1354 0 : FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) ));
1355 0 : }
1356 :
1357 : /* init_device_table joins the net tile to the netlink tile's device
1358 : table. The device table is very frequently read, and rarely updated.
1359 : Therefore, the net tile keeps a local copy of the device table in
1360 : scratch memory. This table is periodically copied over from the
1361 : netlink tile via a double buffer (netdev_dbl_buf).
1362 :
1363 : On startup, the netlink tile might not have produced its initial
1364 : device table. Therefore, initialize the local copy to an empty
1365 : table. */
1366 :
1367 : static void
1368 : init_device_table( fd_net_ctx_t * ctx,
1369 6 : void * netdev_dbl_buf ) {
1370 :
1371 : /* Join remote double buffer containing device table updates */
1372 6 : ctx->netdev_dbl_buf = fd_dbl_buf_join( netdev_dbl_buf );
1373 6 : if( FD_UNLIKELY( !ctx->netdev_dbl_buf ) ) FD_LOG_ERR(( "fd_dbl_buf_join failed" ));
1374 6 : ctx->netdev_buf_sz = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX );
1375 :
1376 : /* Create temporary empty device table during startup */
1377 6 : FD_TEST( fd_netdev_tbl_join( &ctx->netdev_tbl, fd_netdev_tbl_new( ctx->netdev_buf, 1, 1 ) ) );
1378 :
1379 6 : }
1380 :
1381 : FD_FN_UNUSED static void
1382 : unprivileged_init( fd_topo_t * topo,
1383 3 : fd_topo_tile_t * tile ) {
1384 3 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1385 :
1386 3 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1387 3 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
1388 3 : FD_TEST( ctx->xsk_cnt!=0 );
1389 3 : FD_TEST( ctx->free_tx.queue!=NULL );
1390 3 : (void)FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );
1391 3 : ctx->netdev_buf = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_buf_sz );
1392 :
1393 0 : ctx->net_tile_id = (uint)tile->kind_id;
1394 3 : ctx->net_tile_cnt = (uint)fd_topo_tile_name_cnt( topo, tile->name );
1395 :
1396 3 : ctx->bind_address = tile->net.bind_address;
1397 3 : ctx->shred_listen_port = tile->net.shred_listen_port;
1398 3 : ctx->quic_transaction_listen_port = tile->net.quic_transaction_listen_port;
1399 3 : ctx->legacy_transaction_listen_port = tile->net.legacy_transaction_listen_port;
1400 3 : ctx->gossip_listen_port = tile->net.gossip_listen_port;
1401 3 : ctx->repair_intake_listen_port = tile->net.repair_intake_listen_port;
1402 3 : ctx->repair_serve_listen_port = tile->net.repair_serve_listen_port;
1403 3 : ctx->send_src_port = tile->net.send_src_port;
1404 :
1405 : /* Put a bound on chunks we read from the input, to make sure they
1406 : are within in the data region of the workspace. */
1407 :
1408 3 : if( FD_UNLIKELY( !tile->in_cnt ) ) FD_LOG_ERR(( "net tile in link cnt is zero" ));
1409 3 : if( FD_UNLIKELY( tile->in_cnt>MAX_NET_INS ) ) FD_LOG_ERR(( "net tile in link cnt %lu exceeds MAX_NET_INS %lu", tile->in_cnt, MAX_NET_INS ));
1410 3 : FD_TEST( tile->in_cnt<=32 );
1411 18 : for( ulong i=0UL; i<tile->in_cnt; i++ ) {
1412 15 : fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ];
1413 15 : if( FD_UNLIKELY( link->mtu!=FD_NET_MTU ) ) FD_LOG_ERR(( "net tile in link %s does not have a normal MTU", link->name ));
1414 :
1415 15 : ctx->in[ i ].mem = topo->workspaces[ topo->objs[ link->dcache_obj_id ].wksp_id ].wksp;
1416 15 : ctx->in[ i ].chunk0 = fd_dcache_compact_chunk0( ctx->in[ i ].mem, link->dcache );
1417 15 : ctx->in[ i ].wmark = fd_dcache_compact_wmark( ctx->in[ i ].mem, link->dcache, link->mtu );
1418 15 : }
1419 :
1420 21 : for( ulong i = 0; i < tile->out_cnt; i++ ) {
1421 18 : fd_topo_link_t * out_link = &topo->links[ tile->out_link_id[ i ] ];
1422 18 : if( strcmp( out_link->name, "net_quic" ) == 0 ) {
1423 3 : fd_topo_link_t * quic_out = out_link;
1424 3 : ctx->quic_out->mcache = quic_out->mcache;
1425 3 : ctx->quic_out->sync = fd_mcache_seq_laddr( ctx->quic_out->mcache );
1426 3 : ctx->quic_out->depth = fd_mcache_depth( ctx->quic_out->mcache );
1427 3 : ctx->quic_out->seq = fd_mcache_seq_query( ctx->quic_out->sync );
1428 15 : } else if( strcmp( out_link->name, "net_shred" ) == 0 ) {
1429 3 : fd_topo_link_t * shred_out = out_link;
1430 3 : ctx->shred_out->mcache = shred_out->mcache;
1431 3 : ctx->shred_out->sync = fd_mcache_seq_laddr( ctx->shred_out->mcache );
1432 3 : ctx->shred_out->depth = fd_mcache_depth( ctx->shred_out->mcache );
1433 3 : ctx->shred_out->seq = fd_mcache_seq_query( ctx->shred_out->sync );
1434 12 : } else if( strcmp( out_link->name, "net_gossvf" ) == 0 ) {
1435 3 : fd_topo_link_t * gossip_out = out_link;
1436 3 : ctx->gossvf_out->mcache = gossip_out->mcache;
1437 3 : ctx->gossvf_out->sync = fd_mcache_seq_laddr( ctx->gossvf_out->mcache );
1438 3 : ctx->gossvf_out->depth = fd_mcache_depth( ctx->gossvf_out->mcache );
1439 3 : ctx->gossvf_out->seq = fd_mcache_seq_query( ctx->gossvf_out->sync );
1440 9 : } else if( strcmp( out_link->name, "net_repair" ) == 0 ) {
1441 3 : fd_topo_link_t * repair_out = out_link;
1442 3 : ctx->repair_out->mcache = repair_out->mcache;
1443 3 : ctx->repair_out->sync = fd_mcache_seq_laddr( ctx->repair_out->mcache );
1444 3 : ctx->repair_out->depth = fd_mcache_depth( ctx->repair_out->mcache );
1445 3 : ctx->repair_out->seq = fd_mcache_seq_query( ctx->repair_out->sync );
1446 6 : } else if( strcmp( out_link->name, "net_netlnk" ) == 0 ) {
1447 3 : fd_topo_link_t * netlink_out = out_link;
1448 3 : ctx->neigh4_solicit->mcache = netlink_out->mcache;
1449 3 : ctx->neigh4_solicit->depth = fd_mcache_depth( ctx->neigh4_solicit->mcache );
1450 3 : ctx->neigh4_solicit->seq = fd_mcache_seq_query( fd_mcache_seq_laddr( ctx->neigh4_solicit->mcache ) );
1451 3 : } else if( strcmp( out_link->name, "net_send" ) == 0 ) {
1452 3 : fd_topo_link_t * send_out = out_link;
1453 3 : ctx->send_out->mcache = send_out->mcache;
1454 3 : ctx->send_out->sync = fd_mcache_seq_laddr( ctx->send_out->mcache );
1455 3 : ctx->send_out->depth = fd_mcache_depth( ctx->send_out->mcache );
1456 3 : ctx->send_out->seq = fd_mcache_seq_query( ctx->send_out->sync );
1457 3 : } else {
1458 0 : FD_LOG_ERR(( "unrecognized out link `%s`", out_link->name ));
1459 0 : }
1460 18 : }
1461 :
1462 : /* Check if any of the tiles we set a listen port for do not have an outlink. */
1463 3 : if( FD_UNLIKELY( ctx->shred_listen_port!=0 && ctx->shred_out->mcache==NULL ) ) {
1464 0 : FD_LOG_ERR(( "shred listen port set but no out link was found" ));
1465 3 : } else if( FD_UNLIKELY( ctx->quic_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) {
1466 0 : FD_LOG_ERR(( "quic transaction listen port set but no out link was found" ));
1467 3 : } else if( FD_UNLIKELY( ctx->legacy_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) {
1468 0 : FD_LOG_ERR(( "legacy transaction listen port set but no out link was found" ));
1469 3 : } else if( FD_UNLIKELY( ctx->gossip_listen_port!=0 && ctx->gossvf_out->mcache==NULL ) ) {
1470 0 : FD_LOG_ERR(( "gossip listen port set but no out link was found" ));
1471 3 : } else if( FD_UNLIKELY( ctx->repair_intake_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) {
1472 0 : FD_LOG_ERR(( "repair intake port set but no out link was found" ));
1473 3 : } else if( FD_UNLIKELY( ctx->repair_serve_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) {
1474 0 : FD_LOG_ERR(( "repair serve listen port set but no out link was found" ));
1475 3 : } else if( FD_UNLIKELY( ctx->neigh4_solicit->mcache==NULL ) ) {
1476 0 : FD_LOG_ERR(( "netlink request link not found" ));
1477 3 : } else if( FD_UNLIKELY( ctx->send_src_port!=0 && ctx->send_out->mcache==NULL ) ) {
1478 0 : FD_LOG_ERR(( "send listen port set but no out link was found" ));
1479 0 : }
1480 :
1481 9 : for( uint j=0U; j<2U; j++ ) {
1482 6 : ctx->tx_flusher[ j ].pending_wmark = (ulong)( (double)tile->xdp.xdp_tx_queue_size * 0.7 );
1483 6 : ctx->tx_flusher[ j ].tail_flush_backoff = (long)( (double)tile->xdp.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) );
1484 6 : ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX;
1485 6 : }
1486 :
1487 : /* Join netbase objects */
1488 3 : ctx->fib_local = fd_fib4_join( fd_topo_obj_laddr( topo, tile->xdp.fib4_local_obj_id ) );
1489 3 : ctx->fib_main = fd_fib4_join( fd_topo_obj_laddr( topo, tile->xdp.fib4_main_obj_id ) );
1490 3 : if( FD_UNLIKELY( !ctx->fib_local || !ctx->fib_main ) ) FD_LOG_ERR(( "fd_fib4_join failed" ));
1491 3 : if( FD_UNLIKELY( !fd_neigh4_hmap_join(
1492 3 : ctx->neigh4,
1493 3 : fd_topo_obj_laddr( topo, tile->xdp.neigh4_obj_id ),
1494 3 : fd_topo_obj_laddr( topo, tile->xdp.neigh4_ele_obj_id ) ) ) ) {
1495 0 : FD_LOG_ERR(( "fd_neigh4_hmap_join failed" ));
1496 0 : }
1497 :
1498 3 : init_device_table( ctx, fd_topo_obj_laddr( topo, tile->xdp.netdev_dbl_buf_obj_id ) );
1499 :
1500 : /* Initialize TX free ring */
1501 :
1502 3 : ulong const frame_sz = 2048UL;
1503 3 : ulong frame_off = 0UL;
1504 3 : ulong const tx_depth = ctx->free_tx.depth;
1505 98307 : for( ulong j=0; j<tx_depth; j++ ) {
1506 98304 : ctx->free_tx.queue[ j ] = (ulong)ctx->umem_frame0 + frame_off;
1507 98304 : frame_off += frame_sz;
1508 98304 : }
1509 3 : ctx->free_tx.prod = tx_depth;
1510 :
1511 : /* Initialize RX mcache chunks */
1512 :
1513 21 : for( ulong i=0UL; i<(tile->out_cnt); i++ ) {
1514 18 : fd_topo_link_t * out_link = &topo->links[ tile->out_link_id[ i ] ];
1515 18 : fd_frag_meta_t * mcache = out_link->mcache;
1516 246162 : for( ulong j=0UL; j<fd_mcache_depth( mcache ); j++ ) {
1517 246144 : mcache[ j ].chunk = (uint)( ctx->umem_chunk0 + (frame_off>>FD_CHUNK_LG_SZ) );
1518 246144 : frame_off += frame_sz;
1519 246144 : }
1520 18 : }
1521 :
1522 : /* Initialize FILL ring */
1523 :
1524 3 : int _charge_busy = 0;
1525 6 : for( uint j=0U; j<ctx->xsk_cnt; j++ ) {
1526 3 : frame_off = net_xsk_bootstrap( ctx, j, frame_off );
1527 3 : net_rx_wakeup( ctx, &ctx->xsk[ j ], &_charge_busy );
1528 3 : net_tx_wakeup( ctx, &ctx->xsk[ j ], &_charge_busy );
1529 3 : }
1530 :
1531 3 : if( FD_UNLIKELY( frame_off > ctx->umem_sz ) ) {
1532 0 : FD_LOG_ERR(( "UMEM is too small" ));
1533 0 : }
1534 3 : }
1535 :
1536 : FD_FN_UNUSED static ulong
1537 : populate_allowed_seccomp( fd_topo_t const * topo,
1538 : fd_topo_tile_t const * tile,
1539 : ulong out_cnt,
1540 0 : struct sock_filter * out ) {
1541 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1542 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1543 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) );
1544 :
1545 : /* A bit of a hack, if there is no loopback XSK for this tile, we still need to pass
1546 : two "allow" FD arguments to the net policy, so we just make them both the same. */
1547 0 : int allow_fd2 = ctx->xsk_cnt>1UL ? ctx->xsk[ 1 ].xsk_fd : ctx->xsk[ 0 ].xsk_fd;
1548 0 : FD_TEST( ctx->xsk[ 0 ].xsk_fd >= 0 && allow_fd2 >= 0 );
1549 :
1550 0 : populate_sock_filter_policy_fd_xdp_tile( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->xsk[ 0 ].xsk_fd, (uint)allow_fd2 );
1551 0 : return sock_filter_policy_fd_xdp_tile_instr_cnt;
1552 0 : }
1553 :
1554 : FD_FN_UNUSED static ulong
1555 : populate_allowed_fds( fd_topo_t const * topo,
1556 : fd_topo_tile_t const * tile,
1557 : ulong out_fds_cnt,
1558 0 : int * out_fds ) {
1559 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1560 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1561 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) );
1562 :
1563 0 : if( FD_UNLIKELY( out_fds_cnt<6UL ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
1564 :
1565 0 : ulong out_cnt = 0UL;
1566 :
1567 0 : out_fds[ out_cnt++ ] = 2; /* stderr */
1568 0 : if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
1569 0 : out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
1570 :
1571 0 : out_fds[ out_cnt++ ] = ctx->xsk[ 0 ].xsk_fd;
1572 0 : out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 0 ];
1573 0 : if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->xsk[ 1 ].xsk_fd;
1574 0 : if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 1 ];
1575 0 : return out_cnt;
1576 0 : }
1577 :
1578 0 : #define STEM_BURST (1UL)
1579 0 : #define STEM_LAZY ((ulong)30e3) /* 30 us */
1580 :
1581 0 : #define STEM_CALLBACK_CONTEXT_TYPE fd_net_ctx_t
1582 0 : #define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_net_ctx_t)
1583 :
1584 0 : #define STEM_CALLBACK_METRICS_WRITE metrics_write
1585 0 : #define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
1586 0 : #define STEM_CALLBACK_BEFORE_CREDIT before_credit
1587 0 : #define STEM_CALLBACK_BEFORE_FRAG before_frag
1588 0 : #define STEM_CALLBACK_DURING_FRAG during_frag
1589 0 : #define STEM_CALLBACK_AFTER_FRAG after_frag
1590 :
1591 : #include "../../stem/fd_stem.c"
1592 :
1593 : #ifndef FD_TILE_TEST
1594 : fd_topo_run_tile_t fd_tile_net = {
1595 : .name = "net",
1596 : .populate_allowed_seccomp = populate_allowed_seccomp,
1597 : .populate_allowed_fds = populate_allowed_fds,
1598 : .scratch_align = scratch_align,
1599 : .scratch_footprint = scratch_footprint,
1600 : .privileged_init = privileged_init,
1601 : .unprivileged_init = unprivileged_init,
1602 : .run = stem_run,
1603 : };
1604 : #endif
|