Line data Source code
1 : /* The xdp tile translates between AF_XDP and fd_tango
2 : traffic. It is responsible for setting up the XDP and
3 : XSK socket configuration. */
4 :
5 : #include "../fd_net_tile.h"
6 :
7 : #include <errno.h>
8 : #include <fcntl.h>
9 : #include <net/if.h>
10 : #include <netinet/in.h>
11 : #include <sys/socket.h> /* MSG_DONTWAIT needed before importing the net seccomp filter */
12 : #include <linux/if_xdp.h>
13 :
14 : #include "../fd_net_common.h"
15 : #include "../../metrics/fd_metrics.h"
16 : #include "../../netlink/fd_netlink_tile.h" /* neigh4_solicit */
17 : #include "../../topo/fd_topo.h"
18 :
19 : #include "../../../waltz/ip/fd_fib4.h"
20 : #include "../../../waltz/neigh/fd_neigh4_map.h"
21 : #include "../../../waltz/mib/fd_netdev_tbl.h"
22 : #include "../../../waltz/xdp/fd_xdp_redirect_user.h" /* fd_xsk_activate */
23 : #include "../../../waltz/xdp/fd_xsk.h"
24 : #include "../../../util/log/fd_dtrace.h"
25 : #include "../../../util/net/fd_eth.h"
26 : #include "../../../util/net/fd_ip4.h"
27 : #include "../../../util/net/fd_gre.h"
28 : #include "../../../util/pod/fd_pod_format.h"
29 :
30 : #include <unistd.h>
31 : #include <linux/if.h> /* struct ifreq */
32 : #include <sys/ioctl.h>
33 : #include <linux/if_arp.h>
34 :
35 : #include "generated/fd_xdp_tile_seccomp.h"
36 :
37 : /* MAX_NET_INS controls the max number of TX links that a net tile can
38 : serve. */
39 :
40 : #define MAX_NET_INS (32UL)
41 :
42 : /* FD_XDP_STATS_INTERVAL_NS controls the XDP stats refresh interval.
43 : This should be lower than the interval at which the metrics tile
44 : collects metrics. */
45 :
46 0 : #define FD_XDP_STATS_INTERVAL_NS (11e6) /* 11ms */
47 :
48 : /* XSK_IDX_{MAIN,LO} are the hardcoded XSK indices in ctx->xsk[ ... ].
49 : Only net tile 0 has XSK_IDX_LO, all net tiles have XSK_IDX_MAIN. */
50 :
51 30 : #define XSK_IDX_MAIN 0
52 18 : #define XSK_IDX_LO 1
53 :
54 : /* fd_net_in_ctx_t contains consumer information for an incoming tango
55 : link. It is used as part of the TX path. */
56 :
57 : typedef struct {
58 : fd_wksp_t * mem;
59 : ulong chunk0;
60 : ulong wmark;
61 : } fd_net_in_ctx_t;
62 :
63 : /* fd_net_out_ctx_t contains publisher information for a link to a
64 : downstream app tile. It is used as part of the RX path. */
65 :
66 : typedef struct {
67 : fd_frag_meta_t * mcache;
68 : ulong * sync;
69 : ulong depth;
70 : ulong seq;
71 : } fd_net_out_ctx_t;
72 :
73 : /* fd_net_flusher_t controls the pacing of XDP sendto calls for flushing
74 : TX batches. In the 'wakeup' XDP mode, no TX occurs unless the net
75 : tile wakes up the kernel periodically using the sendto() syscall.
76 : If sendto() is called too frequently, time is wasted on context
77 : switches. If sendto() is called not often enough, packets are
78 : delayed or dropped. sendto() calls make almost no guarantees how
79 : much packets are sent out, nor do they indicate when the kernel
80 : finishes a wakeup call (asynchronously dispatched). The net tile
81 : thus uses a myraid of flush triggers that were tested for best
82 : performance. */
83 :
84 : struct fd_net_flusher {
85 :
86 : /* Packets that were enqueued after the last sendto() wakeup are
87 : considered "pending". If there are more than pending_wmark packets
88 : pending, a wakeup is dispatched. Thus, this dispatch trigger is
89 : proportional to packet rate, but does not trigger if I/O is seldom. */
90 : ulong pending_cnt;
91 : ulong pending_wmark;
92 :
93 : /* Sometimes, packets are not flushed out even after a sendto()
94 : wakeup. This can result in the tail of a burst getting delayed or
95 : overrun. If more than tail_flush_backoff ticks pass since the last
96 : sendto() wakeup and there are still unacknowledged packets in the
97 : TX ring, issues another wakeup. */
98 : long next_tail_flush_ticks;
99 : long tail_flush_backoff;
100 :
101 : };
102 :
103 : typedef struct fd_net_flusher fd_net_flusher_t;
104 :
105 : FD_PROTOTYPES_BEGIN
106 :
107 : /* fd_net_flusher_inc marks a new packet as enqueued. */
108 :
109 : static inline void
110 : fd_net_flusher_inc( fd_net_flusher_t * flusher,
111 18 : long now ) {
112 18 : flusher->pending_cnt++;
113 18 : long next_flush = now + flusher->tail_flush_backoff;
114 18 : flusher->next_tail_flush_ticks = fd_long_min( flusher->next_tail_flush_ticks, next_flush );
115 18 : }
116 :
117 : /* fd_net_flusher_check returns 1 if a sendto() wakeup should be issued
118 : immediately. now is a recent fd_tickcount() value.
119 : If tx_ring_empty==0 then the kernel is caught up with the net tile
120 : on the XDP TX ring. (Otherwise, the kernel is behind the net tile) */
121 :
122 : static inline int
123 : fd_net_flusher_check( fd_net_flusher_t * flusher,
124 : long now,
125 27 : int tx_ring_empty ) {
126 27 : int flush_level = flusher->pending_cnt >= flusher->pending_wmark;
127 27 : int flush_timeout = now >= flusher->next_tail_flush_ticks;
128 27 : int flush = flush_level || flush_timeout;
129 27 : if( !flush ) return 0;
130 27 : if( FD_UNLIKELY( tx_ring_empty ) ) {
131 : /* Flush requested but caught up */
132 3 : flusher->pending_cnt = 0UL;
133 3 : flusher->next_tail_flush_ticks = LONG_MAX;
134 3 : return 0;
135 3 : }
136 24 : return 1;
137 27 : }
138 :
139 : /* fd_net_flusher_wakeup signals a sendto() wakeup was done. now is a
140 : recent fd_tickcount() value. */
141 :
142 : static inline void
143 : fd_net_flusher_wakeup( fd_net_flusher_t * flusher,
144 24 : long now ) {
145 24 : flusher->pending_cnt = 0UL;
146 24 : flusher->next_tail_flush_ticks = now + flusher->tail_flush_backoff;
147 24 : }
148 :
149 : FD_PROTOTYPES_END
150 :
151 : /* fd_net_free_ring is a FIFO queue that stores pointers to free XDP TX
152 : frames. */
153 :
154 : struct fd_net_free_ring {
155 : ulong prod;
156 : ulong cons;
157 : ulong depth;
158 : ulong * queue;
159 : };
160 : typedef struct fd_net_free_ring fd_net_free_ring_t;
161 :
162 : typedef struct {
163 : /* An "XSK" is an AF_XDP socket */
164 : uint xsk_cnt;
165 : fd_xsk_t xsk[ 2 ];
166 : int prog_link_fds[ 2 ];
167 : uint if_virt;
168 :
169 : /* UMEM frame region within dcache */
170 : void * umem; /* Start of UMEM */
171 : ulong umem_sz; /* Size of UMEM */
172 :
173 : /* UMEM chunk region within workspace */
174 : uint umem_chunk0; /* Lowest allowed chunk number */
175 : uint umem_wmark; /* Highest allowed chunk number */
176 :
177 : /* All net tiles are subscribed to the same TX links. (These are
178 : incoming links from app tiles asking the net tile to send out packets)
179 : The net tiles "take turns" doing TX jobs based on the L3+L4 dst hash.
180 : net_tile_id is the index of the current interface, net_tile_cnt is the
181 : total amount of interfaces. */
182 : uint net_tile_id;
183 : uint net_tile_cnt;
184 :
185 : /* Details pertaining to an inflight send op */
186 : struct {
187 : uint xsk_idx;
188 : void * frame;
189 : uchar mac_addrs[12]; /* First 12 bytes of Ethernet header */
190 : uint src_ip; /* src_ip in net order */
191 :
192 : uint use_gre; /* The tx packet will be GRE-encapsulated */
193 : uint gre_outer_src_ip; /* For GRE: Outer iphdr's src_ip in net order */
194 : uint gre_outer_dst_ip; /* For GRE: Outer iphdr's dst_ip in net order */
195 : } tx_op;
196 :
197 : /* Round-robin cycle serivce operations */
198 : uint rr_idx;
199 :
200 : /* Ring tracking free packet buffers */
201 : fd_net_free_ring_t free_tx;
202 :
203 : uchar src_mac_addr[6];
204 : uint default_address;
205 :
206 : uint bind_address;
207 : ushort shred_listen_port;
208 : ushort quic_transaction_listen_port;
209 : ushort legacy_transaction_listen_port;
210 : ushort gossip_listen_port;
211 : ushort repair_intake_listen_port;
212 : ushort repair_serve_listen_port;
213 : ushort txsend_src_port;
214 :
215 : ulong in_cnt;
216 : fd_net_in_ctx_t in[ MAX_NET_INS ];
217 :
218 : fd_net_out_ctx_t quic_out[1];
219 : fd_net_out_ctx_t shred_out[1];
220 : fd_net_out_ctx_t gossvf_out[1];
221 : fd_net_out_ctx_t repair_out[1];
222 : fd_net_out_ctx_t txsend_out[1];
223 :
224 : /* XDP stats refresh timer */
225 : long xdp_stats_interval_ticks;
226 : long next_xdp_stats_refresh;
227 :
228 : /* TX flush timers */
229 : fd_net_flusher_t tx_flusher[2]; /* one per XSK */
230 :
231 : /* Route and neighbor tables */
232 : fd_fib4_t fib_local[1];
233 : fd_fib4_t fib_main[1];
234 : fd_neigh4_hmap_t neigh4[1];
235 : fd_netlink_neigh4_solicit_link_t neigh4_solicit[1];
236 :
237 : /* Netdev table */
238 : fd_netdev_tbl_join_t netdev_tbl; /* local copy in scratch (hot path) */
239 : fd_netdev_tbl_join_t netdev_shared; /* shared table in netbase (seqlock protected) */
240 : uint gre_tunnel_ip; /* 0 means GRE disabled */
241 :
242 : struct {
243 : ulong rx_pkt_cnt;
244 : ulong rx_bytes_total;
245 : ulong rx_src_addr_invalid_cnt;
246 : ulong rx_undersz_cnt;
247 : ulong rx_fill_blocked_cnt;
248 : ulong rx_backp_cnt;
249 : long rx_busy_cnt;
250 : long rx_idle_cnt;
251 :
252 : ulong tx_submit_cnt;
253 : ulong tx_complete_cnt;
254 : ulong tx_bytes_total;
255 : ulong tx_route_fail_cnt;
256 : ulong tx_no_xdp_cnt;
257 : ulong tx_neigh_fail_cnt;
258 : ulong tx_full_fail_cnt;
259 : long tx_busy_cnt;
260 : long tx_idle_cnt;
261 :
262 : ulong xsk_tx_wakeup_cnt;
263 : ulong xsk_rx_wakeup_cnt;
264 :
265 : ulong rx_gre_cnt;
266 : ulong rx_gre_ignored_cnt;
267 : ulong rx_gre_inv_pkt_cnt;
268 : ulong tx_gre_cnt;
269 : ulong tx_gre_route_fail_cnt;
270 : } metrics;
271 : } fd_net_ctx_t;
272 :
273 : FD_FN_CONST static inline ulong
274 9 : scratch_align( void ) {
275 9 : return 4096UL;
276 9 : }
277 :
278 : FD_FN_PURE static inline ulong
279 3 : scratch_footprint( fd_topo_tile_t const * tile ) {
280 3 : ulong l = FD_LAYOUT_INIT;
281 3 : l = FD_LAYOUT_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
282 3 : l = FD_LAYOUT_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );
283 3 : l = FD_LAYOUT_APPEND( l, fd_netdev_tbl_align(), fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ) );
284 3 : return FD_LAYOUT_FINI( l, scratch_align() );
285 3 : }
286 :
287 : static void
288 0 : metrics_write( fd_net_ctx_t * ctx ) {
289 0 : FD_MCNT_SET( NET, RX_PKT_CNT, ctx->metrics.rx_pkt_cnt );
290 0 : FD_MCNT_SET( NET, RX_BYTES_TOTAL, ctx->metrics.rx_bytes_total );
291 0 : FD_MCNT_SET( NET, RX_UNDERSZ_CNT, ctx->metrics.rx_undersz_cnt );
292 0 : FD_MCNT_SET( NET, RX_FILL_BLOCKED_CNT, ctx->metrics.rx_fill_blocked_cnt );
293 0 : FD_MCNT_SET( NET, RX_BACKPRESSURE_CNT, ctx->metrics.rx_backp_cnt );
294 0 : FD_MGAUGE_SET( NET, RX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.rx_busy_cnt, 0L ) );
295 0 : FD_MGAUGE_SET( NET, RX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.rx_idle_cnt, 0L ) );
296 0 : FD_MGAUGE_SET( NET, TX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.tx_busy_cnt, 0L ) );
297 0 : FD_MGAUGE_SET( NET, TX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.tx_idle_cnt, 0L ) );
298 :
299 0 : FD_MCNT_SET( NET, TX_SUBMIT_CNT, ctx->metrics.tx_submit_cnt );
300 0 : FD_MCNT_SET( NET, TX_COMPLETE_CNT, ctx->metrics.tx_complete_cnt );
301 0 : FD_MCNT_SET( NET, TX_BYTES_TOTAL, ctx->metrics.tx_bytes_total );
302 0 : FD_MCNT_SET( NET, TX_ROUTE_FAIL_CNT, ctx->metrics.tx_route_fail_cnt );
303 0 : FD_MCNT_SET( NET, TX_NEIGHBOR_FAIL_CNT, ctx->metrics.tx_neigh_fail_cnt );
304 0 : FD_MCNT_SET( NET, TX_FULL_FAIL_CNT, ctx->metrics.tx_full_fail_cnt );
305 :
306 0 : FD_MCNT_SET( NET, XSK_TX_WAKEUP_CNT, ctx->metrics.xsk_tx_wakeup_cnt );
307 0 : FD_MCNT_SET( NET, XSK_RX_WAKEUP_CNT, ctx->metrics.xsk_rx_wakeup_cnt );
308 :
309 0 : FD_MCNT_SET( NET, RX_GRE_CNT, ctx->metrics.rx_gre_cnt );
310 0 : FD_MCNT_SET( NET, RX_GRE_INVALID_CNT, ctx->metrics.rx_gre_inv_pkt_cnt );
311 0 : FD_MCNT_SET( NET, RX_GRE_IGNORED_CNT, ctx->metrics.rx_gre_ignored_cnt );
312 0 : FD_MCNT_SET( NET, TX_GRE_CNT, ctx->metrics.tx_gre_cnt );
313 0 : FD_MCNT_SET( NET, TX_GRE_ROUTE_FAIL_CNT, ctx->metrics.tx_gre_route_fail_cnt );
314 0 : FD_MCNT_SET( NET, RX_SRC_ADDR_INVALID_CNT, ctx->metrics.rx_src_addr_invalid_cnt );
315 0 : }
316 :
/* Two layouts of the XDP_STATISTICS getsockopt result that
   poll_xdp_statistics accepts.  v1 starts with the same three fields
   as v0 and appends three more. */

struct xdp_statistics_v0 {
  __u64 rx_dropped;       /* Dropped for other reasons */
  __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
  __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
};

struct xdp_statistics_v1 {
  /* Fields shared with v0 */
  __u64 rx_dropped;       /* Dropped for other reasons */
  __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
  __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */

  /* Fields only present in v1 */
  __u64 rx_ring_full;             /* Dropped due to rx ring being full */
  __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */
  __u64 tx_ring_empty_descs;      /* Failed to retrieve item from tx ring */
};
331 :
332 : static void
333 0 : poll_xdp_statistics( fd_net_ctx_t * ctx ) {
334 0 : struct xdp_statistics_v1 stats = {0};
335 0 : ulong xsk_cnt = ctx->xsk_cnt;
336 0 : for( ulong j=0UL; j<xsk_cnt; j++ ) {
337 0 : struct xdp_statistics_v1 sub_stats = {0};
338 0 : uint optlen = (uint)sizeof(struct xdp_statistics_v1);
339 0 : if( FD_UNLIKELY( -1==getsockopt( ctx->xsk[ j ].xsk_fd, SOL_XDP, XDP_STATISTICS, &sub_stats, &optlen ) ) )
340 0 : FD_LOG_ERR(( "getsockopt(SOL_XDP, XDP_STATISTICS) failed: %s", strerror( errno ) ));
341 0 : if( FD_UNLIKELY( optlen!=sizeof(struct xdp_statistics_v0) &&
342 0 : optlen!=sizeof(struct xdp_statistics_v1) ) ) {
343 0 : FD_LOG_ERR(( "getsockopt(SOL_XDP, XDP_STATISTICS) returned unexpected size %u", optlen ));
344 0 : }
345 0 : stats.rx_dropped += sub_stats.rx_dropped;
346 0 : stats.rx_invalid_descs += sub_stats.rx_invalid_descs;
347 0 : stats.tx_invalid_descs += sub_stats.tx_invalid_descs;
348 0 : stats.rx_ring_full += sub_stats.rx_ring_full;
349 0 : stats.rx_fill_ring_empty_descs += sub_stats.rx_fill_ring_empty_descs;
350 0 : stats.tx_ring_empty_descs += sub_stats.tx_ring_empty_descs;
351 0 : }
352 :
353 0 : FD_MCNT_SET( NET, XDP_RX_DROPPED_OTHER, stats.rx_dropped );
354 0 : FD_MCNT_SET( NET, XDP_RX_INVALID_DESCS, stats.rx_invalid_descs );
355 0 : FD_MCNT_SET( NET, XDP_TX_INVALID_DESCS, stats.tx_invalid_descs );
356 0 : FD_MCNT_SET( NET, XDP_RX_RING_FULL, stats.rx_ring_full );
357 0 : FD_MCNT_SET( NET, XDP_RX_FILL_RING_EMPTY_DESCS, stats.rx_fill_ring_empty_descs );
358 0 : FD_MCNT_SET( NET, XDP_TX_RING_EMPTY_DESCS, stats.tx_ring_empty_descs );
359 0 : }
360 :
/* net_is_fatal_xdp_error classifies an errno value returned by an XDP
   API.  Returns 1 for codes treated as non-recoverable
   (ESOCKTNOSUPPORT, EOPNOTSUPP, EINVAL, EPERM) and 0 for anything
   else.  The net tile should crash on a fatal code so the problem does
   not go undetected. */

static int
net_is_fatal_xdp_error( int err ) {
  switch( err ) {
  case ESOCKTNOSUPPORT:
  case EOPNOTSUPP:
  case EINVAL:
  case EPERM:
    return 1;
  default:
    return 0;
  }
}
371 :
372 : /* net_gre_tunnel_ip returns the IP address of the GRE tunnel peer if an
373 : untagged GRE tunnel exists, returns 0 otherwise. */
374 :
375 : static uint
376 6 : net_gre_tunnel_ip( fd_net_ctx_t * ctx ) {
377 6 : fd_netdev_t * dev_tbl = ctx->netdev_tbl.dev_tbl;
378 6 : ushort dev_cnt = ctx->netdev_tbl.hdr->dev_cnt;
379 :
380 6 : for( ushort if_idx = 0; if_idx<dev_cnt; if_idx++ ) {
381 3 : fd_netdev_t const * dev = dev_tbl+if_idx;
382 3 : if( dev->dev_type==ARPHRD_IPGRE && dev->gre_dst_ip ) return dev->gre_dst_ip;
383 3 : }
384 3 : return 0U;
385 6 : }
386 :
387 :
388 : /* net_tx_ready returns 1 if we can submit a job to this TX ring, and 0 otherwise.
389 : Reasons for block include:
390 : - No TX buffer is available (free ring empty)
391 : - TX ring is full
392 :
393 : tx_ring: pointer to the XDP TX ring
394 : free_ring: pointer to the free TX ring */
395 :
396 : static int
397 : net_tx_ready( fd_xdp_ring_t * tx_ring,
398 42 : fd_net_free_ring_t * free_ring ) {
399 42 : if( FD_UNLIKELY( free_ring->prod == free_ring->cons ) ) return 0; /* drop - no free buffers */
400 36 : if( FD_UNLIKELY( fd_xdp_ring_full( tx_ring ) ) ) return 0; /* drop - tx ring full */
401 33 : return 1;
402 36 : }
403 :
404 : /* net_rx_wakeup triggers xsk_recvmsg to run in the kernel. Needs to be
405 : called periodically in order to receive packets. */
406 :
407 : static void
408 : net_rx_wakeup( fd_net_ctx_t * ctx,
409 : fd_xsk_t * xsk,
410 0 : int * charge_busy ) {
411 0 : FD_VOLATILE( *xsk->ring_rx.cons ) = xsk->ring_rx.cached_cons; /* write-back local copies to fseqs */
412 0 : FD_VOLATILE( *xsk->ring_fr.prod ) = xsk->ring_fr.cached_prod;
413 0 : if( !fd_xsk_rx_need_wakeup( xsk ) ) return;
414 0 : *charge_busy = 1;
415 0 : struct msghdr _ignored[ 1 ] = { 0 };
416 0 : if( FD_UNLIKELY( -1==recvmsg( xsk->xsk_fd, _ignored, MSG_DONTWAIT ) ) ) {
417 0 : if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) {
418 0 : FD_LOG_ERR(( "xsk recvmsg failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
419 0 : }
420 0 : if( FD_UNLIKELY( errno!=EAGAIN ) ) {
421 0 : long ts = fd_log_wallclock();
422 0 : if( ts > xsk->log_suppress_until_ns ) {
423 0 : FD_LOG_WARNING(( "xsk recvmsg failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
424 0 : xsk->log_suppress_until_ns = ts + (long)1e9;
425 0 : }
426 0 : }
427 0 : }
428 0 : ctx->metrics.xsk_rx_wakeup_cnt++;
429 0 : }
430 :
431 : /* net_tx_wakeup triggers xsk_sendmsg to run in the kernel. Needs to be
432 : called periodically in order to transmit packets. Should only be called
433 : if there are unconsumed packets in Tx ring. */
434 :
435 : static void
436 : net_tx_wakeup( fd_net_ctx_t * ctx,
437 : fd_xsk_t * xsk,
438 24 : int * charge_busy ) {
439 24 : FD_VOLATILE( *xsk->ring_tx.prod ) = xsk->ring_tx.cached_prod; /* write-back local copies to fseqs */
440 24 : FD_VOLATILE( *xsk->ring_cr.cons ) = xsk->ring_cr.cached_cons;
441 24 : if( !fd_xsk_tx_need_wakeup( xsk ) ) return;
442 0 : *charge_busy = 1;
443 0 : if( FD_UNLIKELY( -1==sendto( xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) {
444 0 : if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) {
445 0 : FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
446 0 : }
447 0 : if( FD_UNLIKELY( errno!=EAGAIN ) ) {
448 0 : long ts = fd_log_wallclock();
449 0 : if( ts > xsk->log_suppress_until_ns ) {
450 0 : FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
451 0 : xsk->log_suppress_until_ns = ts + (long)1e9;
452 0 : }
453 0 : }
454 0 : }
455 0 : ctx->metrics.xsk_tx_wakeup_cnt++;
456 0 : }
457 :
458 : /* net_tx_periodic_wakeup does a timer based xsk_sendmsg wakeup. */
459 :
460 : static inline int
461 : net_tx_periodic_wakeup( fd_net_ctx_t * ctx,
462 : uint xsk_idx,
463 : long now,
464 27 : int * charge_busy ) {
465 27 : fd_xdp_ring_t * tx_ring = &ctx->xsk[ xsk_idx ].ring_tx;
466 27 : int tx_ring_empty = fd_xdp_ring_empty( tx_ring, FD_XDP_RING_ROLE_PROD );
467 27 : if( fd_net_flusher_check( ctx->tx_flusher+xsk_idx, now, tx_ring_empty ) ) {
468 24 : net_tx_wakeup( ctx, &ctx->xsk[ xsk_idx ], charge_busy );
469 24 : fd_net_flusher_wakeup( ctx->tx_flusher+xsk_idx, now );
470 24 : }
471 27 : return 0;
472 27 : }
473 :
474 : static void
475 0 : during_housekeeping( fd_net_ctx_t * ctx ) {
476 0 : long now = fd_tickcount();
477 0 : if( FD_LIKELY( !fd_seqlock_locked_hint( &ctx->netdev_shared.hdr->seqlock ) ) ) {
478 0 : fd_netdev_tbl_copy( &ctx->netdev_tbl, &ctx->netdev_shared );
479 0 : }
480 0 : ctx->gre_tunnel_ip = net_gre_tunnel_ip( ctx );
481 :
482 0 : ctx->metrics.rx_busy_cnt = 0UL;
483 0 : ctx->metrics.rx_idle_cnt = 0UL;
484 0 : ctx->metrics.tx_busy_cnt = 0UL;
485 0 : ctx->metrics.tx_idle_cnt = fd_seq_diff( ctx->free_tx.prod, ctx->free_tx.cons );
486 0 : for( uint j=0U; j<ctx->xsk_cnt; j++ ) {
487 0 : fd_xsk_t * xsk = &ctx->xsk[ j ];
488 0 : FD_COMPILER_MFENCE();
489 : /* Write back local copies to fseqs that we own */
490 0 : FD_VOLATILE( *xsk->ring_fr.prod ) = xsk->ring_fr.cached_prod;
491 0 : FD_VOLATILE( *xsk->ring_rx.cons ) = xsk->ring_rx.cached_cons;
492 0 : FD_VOLATILE( *xsk->ring_tx.prod ) = xsk->ring_tx.cached_prod;
493 0 : FD_VOLATILE( *xsk->ring_cr.cons ) = xsk->ring_cr.cached_cons;
494 :
495 : /* Refresh kernel-owned seq numbers for accurate stats */
496 0 : xsk->ring_fr.cached_cons = FD_VOLATILE_CONST( *xsk->ring_fr.cons );
497 0 : xsk->ring_rx.cached_prod = FD_VOLATILE_CONST( *xsk->ring_rx.prod );
498 0 : xsk->ring_tx.cached_cons = FD_VOLATILE_CONST( *xsk->ring_tx.cons );
499 0 : xsk->ring_cr.cached_prod = FD_VOLATILE_CONST( *xsk->ring_cr.prod );
500 :
501 0 : FD_COMPILER_MFENCE();
502 0 : ctx->metrics.rx_busy_cnt += (long)(int)( xsk->ring_rx.cached_prod - xsk->ring_rx.cached_cons );
503 0 : ctx->metrics.rx_idle_cnt += (long)(int)( xsk->ring_fr.cached_prod - xsk->ring_fr.cached_cons );
504 0 : ctx->metrics.tx_busy_cnt += (long)(int)( xsk->ring_tx.cached_prod - xsk->ring_tx.cached_cons );
505 0 : ctx->metrics.tx_busy_cnt += (long)(int)( xsk->ring_cr.cached_prod - xsk->ring_cr.cached_cons );
506 0 : }
507 :
508 0 : if( now > ctx->next_xdp_stats_refresh ) {
509 0 : ctx->next_xdp_stats_refresh = now + ctx->xdp_stats_interval_ticks;
510 0 : poll_xdp_statistics( ctx );
511 0 : }
512 0 : }
513 :
514 :
515 : /* net_tx_route resolves the xsk index, src ip address, src MAC address, and
516 : dst MAC address. Returns 1 on success, 0 on failure.
517 : On success, tx_op->{xsk_idx,src_ip,mac_addrs} is set, and if the dst_ip
518 : belongs to a GRE interface, is_gre_inf will set to 1 and
519 : tx_op->{gre_outer_src_ip, gre_outer_dst_ip} will be loaded from the netdev
520 : table. is_gre_inf is set to 0 if dst_ip doesn't belong to a GRE interface. */
521 :
522 : static int
523 : net_tx_route( fd_net_ctx_t * ctx,
524 : uint dst_ip,
525 33 : uint * is_gre_inf ) {
526 :
527 : /* Route lookup */
528 :
529 33 : fd_fib4_hop_t hop[2] = {0};
530 33 : hop[0] = fd_fib4_lookup( ctx->fib_local, dst_ip, 0UL );
531 33 : hop[1] = fd_fib4_lookup( ctx->fib_main, dst_ip, 0UL );
532 33 : fd_fib4_hop_t const * next_hop = fd_fib4_hop_or( hop+0, hop+1 );
533 :
534 33 : uint rtype = next_hop->rtype;
535 33 : uint if_idx = next_hop->if_idx;
536 33 : uint ip4_src = next_hop->ip4_src;
537 :
538 33 : if( FD_UNLIKELY( rtype==FD_FIB4_RTYPE_LOCAL ) ) {
539 0 : rtype = FD_FIB4_RTYPE_UNICAST;
540 0 : if_idx = 1;
541 0 : }
542 :
543 33 : if( FD_UNLIKELY( rtype!=FD_FIB4_RTYPE_UNICAST ) ) {
544 0 : ctx->metrics.tx_route_fail_cnt++;
545 0 : return 0;
546 0 : }
547 :
548 33 : fd_netdev_t * netdev = fd_netdev_tbl_query( &ctx->netdev_tbl, if_idx );
549 33 : if( !netdev ) {
550 3 : ctx->metrics.tx_route_fail_cnt++;
551 3 : return 0;
552 3 : }
553 :
554 30 : ip4_src = fd_uint_if( !!ctx->bind_address, ctx->bind_address, ip4_src );
555 30 : ctx->tx_op.src_ip = ip4_src;
556 30 : ctx->tx_op.xsk_idx = UINT_MAX;
557 :
558 30 : FD_TEST( is_gre_inf );
559 30 : *is_gre_inf = 0;
560 30 : if( netdev->dev_type==ARPHRD_LOOPBACK ) {
561 : /* Set Ethernet src and dst address to 00:00:00:00:00:00 */
562 0 : memset( ctx->tx_op.mac_addrs, 0, 12UL );
563 0 : ctx->tx_op.xsk_idx = XSK_IDX_LO;
564 : /* Set preferred src address to 127.0.0.1 if no bind address is set */
565 0 : if( !ctx->tx_op.src_ip ) ctx->tx_op.src_ip = FD_IP4_ADDR( 127,0,0,1 );
566 0 : return 1;
567 30 : } else if( netdev->dev_type==ARPHRD_IPGRE ) {
568 : /* skip MAC addrs lookup for GRE inner dst ip */
569 12 : if( netdev->gre_src_ip ) ctx->tx_op.gre_outer_src_ip = netdev->gre_src_ip;
570 12 : ctx->tx_op.gre_outer_dst_ip = netdev->gre_dst_ip;
571 12 : *is_gre_inf = 1;
572 12 : return 1;
573 12 : }
574 :
575 18 : if( FD_UNLIKELY( netdev->dev_type!=ARPHRD_ETHER ) ) return 0; // drop
576 :
577 18 : if( FD_UNLIKELY( if_idx!=ctx->if_virt ) ) {
578 0 : ctx->metrics.tx_no_xdp_cnt++;
579 0 : return 0;
580 0 : }
581 18 : ctx->tx_op.xsk_idx = XSK_IDX_MAIN;
582 :
583 : /* Neighbor resolve */
584 18 : uint neigh_ip = next_hop->ip4_gw;
585 18 : if( !neigh_ip ) neigh_ip = dst_ip;
586 :
587 18 : fd_neigh4_entry_t neigh[1];
588 18 : int neigh_res = fd_neigh4_hmap_query_entry( ctx->neigh4, neigh_ip, neigh );
589 18 : if( FD_UNLIKELY( neigh_res!=FD_MAP_SUCCESS ) ) {
590 : /* Neighbor not found */
591 0 : fd_netlink_neigh4_solicit( ctx->neigh4_solicit, neigh_ip, if_idx, fd_frag_meta_ts_comp( fd_tickcount() ) );
592 0 : ctx->metrics.tx_neigh_fail_cnt++;
593 0 : return 0;
594 0 : }
595 18 : if( FD_UNLIKELY( neigh->state != FD_NEIGH4_STATE_ACTIVE ) ) {
596 0 : ctx->metrics.tx_neigh_fail_cnt++;
597 0 : return 0;
598 0 : }
599 18 : ip4_src = fd_uint_if( !ip4_src, ctx->default_address, ip4_src );
600 18 : ctx->tx_op.src_ip = ip4_src;
601 18 : memcpy( ctx->tx_op.mac_addrs+0, neigh->mac_addr, 6 );
602 18 : memcpy( ctx->tx_op.mac_addrs+6, netdev->mac_addr, 6 );
603 :
604 18 : return 1;
605 18 : }
606 :
607 : /* before_frag is called when a new metadata descriptor for a TX job is
608 : found. This callback determines whether this net tile is responsible
609 : for the TX job. If so, it prepares the TX op for the during_frag and
610 : after_frag callbacks. */
611 :
612 : static inline int
613 : before_frag( fd_net_ctx_t * ctx,
614 : ulong in_idx,
615 : ulong seq,
616 18 : ulong sig ) {
617 18 : (void)in_idx; (void)seq;
618 :
619 : /* Find interface index of next packet */
620 18 : ulong proto = fd_disco_netmux_sig_proto( sig );
621 18 : if( FD_UNLIKELY( proto!=DST_PROTO_OUTGOING ) ) return 1;
622 :
623 : /* Load balance TX */
624 18 : uint net_tile_cnt = ctx->net_tile_cnt;
625 18 : uint hash = (uint)fd_disco_netmux_sig_hash( sig );
626 18 : uint target_idx = hash % net_tile_cnt;
627 18 : uint net_tile_id = ctx->net_tile_id;
628 18 : uint dst_ip = fd_disco_netmux_sig_ip( sig );
629 :
630 : /* Skip if another net tile is responsible for this packet.
631 : Fast path for net tiles other than net_tile 0. */
632 :
633 18 : if( net_tile_id!=0 && net_tile_id!=target_idx ) return 1; /* ignore */
634 :
635 :
636 18 : ctx->tx_op.use_gre = 0;
637 18 : ctx->tx_op.gre_outer_dst_ip = 0;
638 18 : ctx->tx_op.gre_outer_src_ip = 0;
639 18 : uint is_gre_inf = 0;
640 :
641 18 : if( FD_UNLIKELY( !net_tx_route( ctx, dst_ip, &is_gre_inf ) ) ) {
642 0 : return 1; /* metrics incremented by net_tx_route */
643 0 : }
644 :
645 18 : uint xsk_idx = ctx->tx_op.xsk_idx;
646 :
647 18 : if( is_gre_inf ) {
648 12 : uint inner_src_ip = ctx->tx_op.src_ip;
649 12 : if( FD_UNLIKELY( !inner_src_ip ) ) {
650 0 : ctx->metrics.tx_gre_route_fail_cnt++;
651 0 : return 1;
652 0 : }
653 : /* Find the MAC addrs for the eth hdr, and src ip for outer ip4 hdr if not found in netdev tbl */
654 12 : ctx->tx_op.src_ip = 0;
655 12 : is_gre_inf = 0;
656 12 : if( FD_UNLIKELY( !net_tx_route( ctx, ctx->tx_op.gre_outer_dst_ip, &is_gre_inf ) ) ) {
657 0 : ctx->metrics.tx_gre_route_fail_cnt++;
658 0 : return 1;
659 0 : }
660 12 : if( is_gre_inf ) {
661 : /* Only one layer of tunnelling supported */
662 0 : ctx->metrics.tx_gre_route_fail_cnt++;
663 0 : return 1;
664 0 : }
665 12 : if( !ctx->tx_op.gre_outer_src_ip ) {
666 6 : ctx->tx_op.gre_outer_src_ip = ctx->tx_op.src_ip;
667 6 : }
668 12 : ctx->tx_op.use_gre = 1; /* indicate to during_frag to use GRE header */
669 12 : ctx->tx_op.src_ip = inner_src_ip;
670 12 : xsk_idx = XSK_IDX_MAIN;
671 12 : }
672 :
673 18 : if( FD_UNLIKELY( xsk_idx>=ctx->xsk_cnt ) ) {
674 : /* Packet does not route to an XDP interface */
675 0 : ctx->metrics.tx_no_xdp_cnt++;
676 0 : return 1;
677 0 : }
678 :
679 18 : if( xsk_idx==XSK_IDX_LO ) target_idx = 0; /* loopback always targets tile 0 */
680 :
681 : /* Skip if another net tile is responsible for this packet */
682 :
683 18 : if( net_tile_id!=target_idx ) return 1; /* ignore */
684 :
685 : /* Skip if TX is blocked */
686 :
687 18 : fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ];
688 18 : fd_net_free_ring_t * free = &ctx->free_tx;
689 18 : if( FD_UNLIKELY( !net_tx_ready( &xsk->ring_tx, free ) ) ) {
690 0 : ctx->metrics.tx_full_fail_cnt++;
691 0 : return 1;
692 0 : }
693 :
694 : /* Allocate buffer for receive */
695 18 : ulong alloc_seq = free->cons;
696 18 : void * frame = (void *)free->queue[ alloc_seq % free->depth ];
697 18 : free->cons = fd_seq_inc( alloc_seq, 1UL );
698 :
699 18 : ctx->tx_op.frame = frame;
700 :
701 18 : return 0; /* continue */
702 18 : }
703 :
704 : /* during_frag is called when before_frag has committed to transmit an
705 : outgoing packet. */
706 :
707 : static inline void
708 : during_frag( fd_net_ctx_t * ctx,
709 : ulong in_idx,
710 : ulong seq FD_PARAM_UNUSED,
711 : ulong sig FD_PARAM_UNUSED,
712 : ulong chunk,
713 : ulong sz,
714 18 : ulong ctl FD_PARAM_UNUSED ) {
715 18 : if( FD_UNLIKELY( chunk<ctx->in[ in_idx ].chunk0 || chunk>ctx->in[ in_idx ].wmark || sz>FD_NET_MTU ) )
716 0 : FD_LOG_ERR(( "chunk %lu %lu corrupt, not in range [%lu,%lu]", chunk, sz, ctx->in[ in_idx ].chunk0, ctx->in[ in_idx ].wmark ));
717 :
718 18 : if( FD_UNLIKELY( sz<( sizeof(fd_eth_hdr_t)+sizeof(fd_ip4_hdr_t) ) ) )
719 0 : FD_LOG_ERR(( "packet too small %lu (in_idx=%lu)", sz, in_idx ));
720 :
721 18 : if( FD_UNLIKELY( sz>FD_ETH_PAYLOAD_MAX ) )
722 0 : FD_LOG_ERR(( "packet too big %lu (in_idx=%lu)", sz, in_idx ));
723 :
724 18 : void * frame = ctx->tx_op.frame;
725 18 : if( FD_UNLIKELY( (ulong)frame < (ulong)ctx->umem ) )
726 0 : FD_LOG_ERR(( "frame %p out of bounds (below %p)", frame, (void *)ctx->umem ));
727 18 : ulong umem_off = (ulong)frame - (ulong)ctx->umem;
728 18 : if( FD_UNLIKELY( (ulong)umem_off > (ulong)ctx->umem_sz ) )
729 0 : FD_LOG_ERR(( "frame %p out of bounds (beyond %p)", frame, (void *)ctx->umem_sz ));
730 :
731 : /* Speculatively copy frame into XDP buffer */
732 18 : uchar const * src = fd_chunk_to_laddr_const( ctx->in[ in_idx ].mem, chunk );
733 :
734 18 : if( ctx->tx_op.use_gre ) {
735 : /* Discard the ethernet hdr from src. Copy the rest to where the inner ip4_hdr is.
736 : Safe from overflow: FD_ETH_PAYLOAD_MAX + header overhead < frame size (2048UL) */
737 12 : ulong overhead = sizeof(fd_eth_hdr_t) + sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t);
738 12 : fd_memcpy( (void *)( (ulong)ctx->tx_op.frame + overhead ), src + sizeof(fd_eth_hdr_t), sz - sizeof(fd_eth_hdr_t) );
739 12 : } else {
740 6 : fd_memcpy( ctx->tx_op.frame, src, sz );
741 6 : }
742 18 : }
743 :
744 : /* after_frag is called when the during_frag memcpy was _not_ overrun. */
745 :
746 : static void
747 : after_frag( fd_net_ctx_t * ctx,
748 : ulong in_idx,
749 : ulong seq,
750 : ulong sig,
751 : ulong sz,
752 : ulong tsorig,
753 : ulong tspub,
754 18 : fd_stem_context_t * stem ) {
755 18 : (void)in_idx; (void)seq; (void)sig; (void)tsorig; (void)tspub; (void)stem;
756 :
757 : /* Current send operation */
758 :
759 18 : uchar * frame = ctx->tx_op.frame;
760 18 : uint xsk_idx = ctx->tx_op.xsk_idx;
761 :
762 : /* Select Ethernet addresses */
763 18 : memcpy( frame, ctx->tx_op.mac_addrs, 12 );
764 :
765 18 : uchar * iphdr = frame + sizeof(fd_eth_hdr_t);
766 :
767 18 : if( ctx->tx_op.use_gre ) {
768 :
769 : /* For GRE packets, the ethertype will always be FD_ETH_HDR_TYPE_IP. outer source ip can't be 0 */
770 12 : if( FD_UNLIKELY( ctx->tx_op.gre_outer_src_ip==0 ) ) {
771 0 : ctx->metrics.tx_gre_route_fail_cnt++;
772 0 : return;
773 0 : }
774 :
775 : /* Write the last two bytes for eth_hdr */
776 12 : FD_STORE( ushort, frame+12, fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) );
777 :
778 12 : uchar * outer_iphdr = frame + sizeof(fd_eth_hdr_t);
779 12 : uchar * gre_hdr = outer_iphdr + sizeof(fd_ip4_hdr_t);
780 12 : uchar * inner_iphdr = gre_hdr + sizeof(fd_gre_hdr_t);
781 :
782 : /* outer hdr + gre hdr + inner net_tot_len */
783 12 : ushort outer_net_tot_len = (ushort)( sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t) + fd_ushort_bswap( ( (fd_ip4_hdr_t *)inner_iphdr )->net_tot_len ) );
784 :
785 : /* Construct outer ip header */
786 12 : fd_ip4_hdr_t ip4_outer = (fd_ip4_hdr_t) {
787 12 : .verihl = FD_IP4_VERIHL( 4,5 ),
788 12 : .tos = 0,
789 12 : .net_tot_len = fd_ushort_bswap( outer_net_tot_len ),
790 12 : .net_id = 0,
791 12 : .net_frag_off = fd_ushort_bswap( FD_IP4_HDR_FRAG_OFF_DF ),
792 12 : .ttl = 64,
793 12 : .protocol = FD_IP4_HDR_PROTOCOL_GRE,
794 12 : .check = 0,
795 12 : .saddr = ctx->tx_op.gre_outer_src_ip,
796 12 : .daddr = ctx->tx_op.gre_outer_dst_ip,
797 12 : };
798 12 : ip4_outer.check = fd_ip4_hdr_check_fast( &ip4_outer );
799 12 : FD_STORE( fd_ip4_hdr_t, outer_iphdr, ip4_outer );
800 :
801 : /* Construct gre header */
802 12 : fd_gre_hdr_t gre_hdr_ = {
803 12 : .flags_version = FD_GRE_HDR_FLG_VER_BASIC,
804 12 : .protocol = fd_ushort_bswap( FD_ETH_HDR_TYPE_IP )
805 12 : };
806 12 : FD_STORE( fd_gre_hdr_t, gre_hdr, gre_hdr_ );
807 :
808 12 : iphdr = inner_iphdr;
809 12 : sz = sizeof(fd_eth_hdr_t) + outer_net_tot_len;
810 12 : xsk_idx = 0;
811 12 : }
812 :
813 : /* Construct (inner) ip header */
814 18 : uint ihl = FD_IP4_GET_LEN( *(fd_ip4_hdr_t *)iphdr );
815 18 : uint ver = FD_IP4_GET_VERSION( *(fd_ip4_hdr_t *)iphdr );
816 18 : uint ip4_saddr = FD_LOAD( uint, iphdr+12 );
817 18 : ushort ethertype = FD_LOAD( ushort, frame+12 );
818 :
819 18 : if( FD_UNLIKELY( ethertype!=fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) ) ) {
820 0 : FD_LOG_CRIT(( "in link %lu attempted to send packet with invalid ethertype %04x",
821 0 : in_idx, fd_ushort_bswap( ethertype ) ));
822 0 : }
823 :
824 18 : if( ver!=0x4 ) {
825 0 : ctx->metrics.tx_route_fail_cnt++; // Not an IPv4 packet. drop
826 0 : return;
827 0 : }
828 :
829 18 : if( ip4_saddr==0 ) {
830 18 : if( FD_UNLIKELY( ctx->tx_op.src_ip==0 ||
831 18 : ihl<sizeof(fd_ip4_hdr_t) ||
832 18 : (sizeof(fd_eth_hdr_t)+ihl)>sz ) ) {
833 : /* Outgoing IPv4 packet with unknown src IP or invalid IHL */
834 : /* FIXME should select first IPv4 address of device table here */
835 0 : ctx->metrics.tx_route_fail_cnt++;
836 0 : return;
837 0 : }
838 : /* Recompute checksum after changing header */
839 18 : FD_STORE( uint, iphdr+12, ctx->tx_op.src_ip );
840 18 : FD_STORE( ushort, iphdr+10, 0 );
841 18 : FD_STORE( ushort, iphdr+10, fd_ip4_hdr_check( iphdr ) );
842 18 : }
843 :
844 : /* Submit packet TX job
845 :
846 : Invariant for ring_tx: prod-cons<length
847 : (This invariant breaks if any other packet is sent over this ring
848 : between before_frag and this point, e.g. send_arp_probe.) */
849 :
850 18 : fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ];
851 18 : fd_xdp_ring_t * tx_ring = &xsk->ring_tx;
852 18 : uint tx_seq = tx_ring->cached_prod;
853 18 : uint tx_mask = tx_ring->depth - 1U;
854 18 : xsk->ring_tx.packet_ring[ tx_seq&tx_mask ] = (struct xdp_desc) {
855 18 : .addr = (ulong)frame - (ulong)ctx->umem,
856 18 : .len = (uint)sz,
857 18 : .options = 0
858 18 : };
859 :
860 : /* Frame is now owned by kernel. Clear tx_op. */
861 18 : ctx->tx_op.frame = NULL;
862 :
863 : /* Register newly enqueued packet */
864 18 : tx_ring->cached_prod = tx_seq+1U;
865 18 : ctx->metrics.tx_submit_cnt++;
866 18 : ctx->metrics.tx_bytes_total += sz;
867 18 : if( ctx->tx_op.use_gre ) ctx->metrics.tx_gre_cnt++;
868 18 : fd_net_flusher_inc( ctx->tx_flusher+xsk_idx, fd_tickcount() );
869 18 : }
870 :
871 : /* net_rx_packet is called when a new Ethernet frame is available.
872 : Attempts to copy out the frame to a downstream tile. */
873 :
874 : static void
875 : net_rx_packet( fd_net_ctx_t * ctx,
876 : ulong umem_off,
877 : ulong sz,
878 27 : uint * freed_chunk ) {
879 :
880 27 : if( FD_UNLIKELY( sz<sizeof(fd_eth_hdr_t)+sizeof(fd_ip4_hdr_t)+sizeof(fd_udp_hdr_t) ) ) {
881 0 : FD_DTRACE_PROBE( net_tile_err_rx_undersz );
882 0 : ctx->metrics.rx_undersz_cnt++;
883 0 : return;
884 0 : }
885 :
886 27 : uchar * packet = (uchar *)ctx->umem + umem_off;
887 27 : uchar const * packet_end = packet + sz;
888 27 : fd_ip4_hdr_t * iphdr = (fd_ip4_hdr_t *)(packet + sizeof(fd_eth_hdr_t));
889 :
890 27 : if( FD_UNLIKELY( ((fd_eth_hdr_t *)packet)->net_type!=fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) ) ) return;
891 :
892 27 : int is_packet_gre = 0;
893 : /* Discard the GRE overhead (outer iphdr and gre hdr) */
894 27 : if( iphdr->protocol == FD_IP4_HDR_PROTOCOL_GRE ) {
895 15 : if( FD_UNLIKELY( !ctx->gre_tunnel_ip ) ) {
896 0 : ctx->metrics.rx_gre_ignored_cnt++;
897 0 : return;
898 0 : }
899 15 : ulong gre_ipver = FD_IP4_GET_VERSION( *iphdr );
900 15 : ulong gre_iplen = FD_IP4_GET_LEN( *iphdr );
901 15 : if( FD_UNLIKELY( gre_ipver!=0x4 || gre_iplen<20 ) ) {
902 0 : FD_DTRACE_PROBE( net_tile_err_rx_noip );
903 0 : ctx->metrics.rx_gre_inv_pkt_cnt++; /* drop IPv6 packets */
904 0 : return;
905 0 : }
906 :
907 15 : if( FD_UNLIKELY( iphdr->saddr!=ctx->gre_tunnel_ip ) ) {
908 3 : ctx->metrics.rx_src_addr_invalid_cnt++;
909 3 : return;
910 3 : }
911 :
912 12 : ulong overhead = gre_iplen + sizeof(fd_gre_hdr_t);
913 12 : if( FD_UNLIKELY( (uchar *)iphdr+overhead+sizeof(fd_ip4_hdr_t)>packet_end ) ) {
914 0 : FD_DTRACE_PROBE( net_tile_err_rx_undersz );
915 0 : ctx->metrics.rx_undersz_cnt++; // inner ip4 header invalid
916 0 : return;
917 0 : }
918 :
919 : /* The new iphdr is where the inner iphdr was. Copy over the eth_hdr */
920 12 : iphdr = (fd_ip4_hdr_t *)((uchar *)iphdr + overhead);
921 12 : uchar * new_packet = (uchar *)iphdr - sizeof(fd_eth_hdr_t);
922 12 : fd_memcpy( new_packet, packet, sizeof(fd_eth_hdr_t) );
923 12 : sz -= overhead;
924 12 : packet = new_packet;
925 12 : umem_off = (ulong)( packet - (uchar *)ctx->umem );
926 12 : is_packet_gre = 1;
927 12 : }
928 :
929 : /* Translate packet to UMEM frame index */
930 24 : ulong chunk = ctx->umem_chunk0 + (umem_off>>FD_CHUNK_LG_SZ);
931 24 : ulong ctl = umem_off & 0x3fUL;
932 :
933 : /* Filter for UDP/IPv4 packets. */
934 24 : ulong ipver = FD_IP4_GET_VERSION( *iphdr );
935 24 : ulong iplen = FD_IP4_GET_LEN ( *iphdr );
936 24 : if( FD_UNLIKELY( ipver!=0x4 || iplen<20 ||
937 24 : iphdr->protocol!=FD_IP4_HDR_PROTOCOL_UDP ) ) {
938 0 : FD_DTRACE_PROBE( net_tile_err_rx_noip );
939 0 : ctx->metrics.rx_undersz_cnt++; /* drop IPv6 packets */
940 0 : return;
941 0 : }
942 :
943 24 : uchar const * udp = (uchar *)iphdr + iplen;
944 24 : if( FD_UNLIKELY( udp+sizeof(fd_udp_hdr_t) > packet_end ) ) {
945 0 : FD_DTRACE_PROBE( net_tile_err_rx_undersz );
946 0 : ctx->metrics.rx_undersz_cnt++;
947 0 : return;
948 0 : }
949 :
950 24 : fd_udp_hdr_t const * udp_hdr = (fd_udp_hdr_t const *)udp;
951 24 : ulong const udp_sz = fd_ushort_bswap( udp_hdr->net_len );
952 24 : if( FD_UNLIKELY( (udp_sz<sizeof(fd_udp_hdr_t)) | (udp+udp_sz>packet_end) ) ) {
953 6 : FD_DTRACE_PROBE( net_tile_err_rx_undersz );
954 6 : ctx->metrics.rx_undersz_cnt++;
955 6 : return;
956 6 : }
957 :
958 : /* Extract IP dest addr and UDP src/dest port */
959 18 : uint ip_srcaddr = iphdr->saddr;
960 18 : ushort udp_srcport = fd_ushort_bswap( udp_hdr->net_sport );
961 18 : ushort udp_dstport = fd_ushort_bswap( udp_hdr->net_dport );
962 :
963 18 : if( FD_UNLIKELY( fd_ip4_addr_is_mcast( ip_srcaddr ) ) ) {
964 0 : ctx->metrics.rx_src_addr_invalid_cnt++;
965 0 : return;
966 0 : }
967 :
968 18 : FD_DTRACE_PROBE_4( net_tile_pkt_rx, ip_srcaddr, udp_srcport, udp_dstport, sz );
969 :
970 : /* Route packet to downstream tile */
971 18 : ushort proto;
972 18 : fd_net_out_ctx_t * out;
973 18 : if( FD_UNLIKELY( udp_dstport==ctx->shred_listen_port ) ) {
974 18 : proto = DST_PROTO_SHRED;
975 18 : out = ctx->shred_out;
976 18 : } else if( FD_UNLIKELY( udp_dstport==ctx->quic_transaction_listen_port ) ) {
977 0 : proto = DST_PROTO_TPU_QUIC;
978 0 : out = ctx->quic_out;
979 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->legacy_transaction_listen_port ) ) {
980 0 : proto = DST_PROTO_TPU_UDP;
981 0 : out = ctx->quic_out;
982 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->gossip_listen_port ) ) {
983 0 : proto = DST_PROTO_GOSSIP;
984 0 : out = ctx->gossvf_out;
985 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->repair_intake_listen_port ) ) {
986 0 : proto = DST_PROTO_REPAIR;
987 0 : if( FD_UNLIKELY( sz == REPAIR_PING_SZ ) ) out = ctx->repair_out; /* ping-pong */
988 0 : else out = ctx->shred_out;
989 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->repair_serve_listen_port ) ) {
990 0 : proto = DST_PROTO_REPAIR;
991 0 : out = ctx->repair_out;
992 0 : } else if( FD_UNLIKELY( udp_dstport==ctx->txsend_src_port ) ) {
993 0 : proto = DST_PROTO_SEND;
994 0 : out = ctx->txsend_out;
995 0 : } else {
996 :
997 0 : FD_LOG_ERR(( "Firedancer received a UDP packet on port %hu which was not expected. "
998 0 : "Only the following ports should be configured to forward packets: "
999 0 : "%hu, %hu, %hu, %hu, %hu, %hu (excluding any 0 ports, which can be ignored)."
1000 0 : "Please report this error to Firedancer maintainers.",
1001 0 : udp_dstport,
1002 0 : ctx->shred_listen_port,
1003 0 : ctx->quic_transaction_listen_port,
1004 0 : ctx->legacy_transaction_listen_port,
1005 0 : ctx->gossip_listen_port,
1006 0 : ctx->repair_intake_listen_port,
1007 0 : ctx->repair_serve_listen_port ));
1008 0 : }
1009 :
1010 : /* tile can decide how to partition based on src ip addr and src port */
1011 18 : ulong sig = fd_disco_netmux_sig( ip_srcaddr, udp_srcport, ip_srcaddr, proto, 14UL+8UL+iplen );
1012 :
1013 : /* Peek the mline for an old frame */
1014 18 : fd_frag_meta_t * mline = out->mcache + fd_mcache_line_idx( out->seq, out->depth );
1015 18 : *freed_chunk = mline->chunk;
1016 :
1017 : /* Overwrite the mline with the new frame */
1018 18 : ulong tspub = (ulong)fd_frag_meta_ts_comp( fd_tickcount() );
1019 18 : # if FD_HAS_AVX
1020 18 : fd_mcache_publish_avx( out->mcache, out->depth, out->seq, sig, chunk, sz, ctl, 0, tspub );
1021 : # else
1022 : fd_mcache_publish( out->mcache, out->depth, out->seq, sig, chunk, sz, ctl, 0, tspub );
1023 : # endif
1024 :
1025 : /* Wind up for the next iteration */
1026 18 : out->seq = fd_seq_inc( out->seq, 1UL );
1027 :
1028 18 : if( is_packet_gre ) ctx->metrics.rx_gre_cnt++;
1029 18 : ctx->metrics.rx_pkt_cnt++;
1030 18 : ctx->metrics.rx_bytes_total += sz;
1031 18 : }
1032 :
1033 : /* net_comp_event is called when an XDP TX frame is free again. */
1034 :
1035 : static void
1036 : net_comp_event( fd_net_ctx_t * ctx,
1037 : fd_xsk_t * xsk,
1038 0 : uint comp_seq ) {
1039 :
1040 : /* Locate the incoming frame */
1041 :
1042 0 : fd_xdp_ring_t * comp_ring = &xsk->ring_cr;
1043 0 : uint comp_mask = comp_ring->depth - 1U;
1044 0 : ulong frame = FD_VOLATILE_CONST( comp_ring->frame_ring[ comp_seq&comp_mask ] );
1045 0 : ulong const frame_mask = FD_NET_MTU - 1UL;
1046 0 : FD_STATIC_ASSERT( FD_ULONG_IS_POW2( FD_NET_MTU ), "FD_NET_MTU must be a power of two" );
1047 0 : if( FD_UNLIKELY( frame+FD_NET_MTU > ctx->umem_sz ) ) {
1048 0 : FD_LOG_ERR(( "Bounds check failed: frame=0x%lx umem_sz=0x%lx",
1049 0 : frame, (ulong)ctx->umem_sz ));
1050 0 : }
1051 :
1052 : /* Check if we have space to return the freed frame */
1053 :
1054 0 : fd_net_free_ring_t * free = &ctx->free_tx;
1055 0 : ulong free_prod = free->prod;
1056 0 : ulong free_mask = free->depth - 1UL;
1057 0 : ulong free_cons = free->cons;
1058 0 : long free_cnt = fd_seq_diff( free_prod, free_cons );
1059 0 : FD_TEST( free_prod >= free_cons );
1060 0 : if( FD_UNLIKELY( free_cnt>=(long)free->depth ) ) return; /* blocked */
1061 :
1062 0 : free->queue[ free_prod&free_mask ] = (ulong)ctx->umem + (frame & (~frame_mask));
1063 0 : free->prod = fd_seq_inc( free_prod, 1UL );
1064 :
1065 : /* Wind up for next iteration */
1066 :
1067 0 : comp_ring->cached_cons = comp_seq+1U;
1068 0 : ctx->metrics.tx_complete_cnt++;
1069 0 : }
1070 :
1071 : /* net_rx_event is called when a new XDP RX frame is available. Calls
1072 : net_rx_packet, then returns the packet back to the kernel via the fill
1073 : ring. */
1074 :
1075 : static void
1076 : net_rx_event( fd_net_ctx_t * ctx,
1077 : fd_xsk_t * xsk,
1078 27 : uint rx_seq ) {
1079 : /* Locate the incoming frame */
1080 :
1081 27 : fd_xdp_ring_t * rx_ring = &xsk->ring_rx;
1082 27 : uint rx_mask = rx_ring->depth - 1U;
1083 27 : struct xdp_desc frame = FD_VOLATILE_CONST( rx_ring->packet_ring[ rx_seq&rx_mask ] );
1084 :
1085 27 : if( FD_UNLIKELY( frame.len>FD_NET_MTU ) )
1086 0 : FD_LOG_ERR(( "received a UDP packet with a too large payload (%u)", frame.len ));
1087 :
1088 : /* Check if we have space in the fill ring to free the frame */
1089 :
1090 27 : fd_xdp_ring_t * fill_ring = &xsk->ring_fr;
1091 27 : if( FD_UNLIKELY( fd_xdp_ring_full( fill_ring ) ) ) {
1092 0 : ctx->metrics.rx_fill_blocked_cnt++;
1093 0 : return; /* blocked */
1094 0 : }
1095 :
1096 : /* Pass it to the receive handler */
1097 :
1098 27 : uint freed_chunk = (uint)( ctx->umem_chunk0 + (frame.addr>>FD_CHUNK_LG_SZ) );
1099 27 : net_rx_packet( ctx, frame.addr, frame.len, &freed_chunk );
1100 27 : FD_COMPILER_MFENCE();
1101 27 : rx_ring->cached_cons = rx_seq+1U;
1102 :
1103 : /* Every RX operation returns one frame to the FILL ring. If the
1104 : packet was forwarded to a downstream ring, the newly shadowed frame
1105 : is returned. Otherwise, the frame just received is returned. */
1106 :
1107 27 : if( FD_UNLIKELY( ( freed_chunk < ctx->umem_chunk0 ) |
1108 27 : ( freed_chunk > ctx->umem_wmark ) ) ) {
1109 0 : FD_LOG_CRIT(( "mcache corruption detected: chunk=%u chunk0=%u wmark=%u",
1110 0 : freed_chunk, ctx->umem_chunk0, ctx->umem_wmark ));
1111 0 : }
1112 :
1113 27 : FD_STATIC_ASSERT( FD_ULONG_IS_POW2( FD_NET_MTU ), "FD_NET_MTU must be a power of two" );
1114 27 : uint fill_prod = fill_ring->cached_prod;
1115 27 : uint fill_mask = (fill_ring->depth)-1U;
1116 27 : ulong frame_mask = FD_NET_MTU - 1UL;
1117 27 : ulong freed_off = (freed_chunk - ctx->umem_chunk0)<<FD_CHUNK_LG_SZ;
1118 27 : fill_ring->frame_ring[ fill_prod&fill_mask ] = freed_off & (~frame_mask);
1119 27 : fill_ring->cached_prod = fill_prod+1U;
1120 27 : }
1121 :
1122 : /* before_credit is called every loop iteration. */
1123 :
1124 : static void
1125 : before_credit( fd_net_ctx_t * ctx,
1126 : fd_stem_context_t * stem,
1127 27 : int * charge_busy ) {
1128 27 : (void)stem;
1129 : /* A previous send attempt was overrun. A corrupt copy of the packet was
1130 : placed into an XDP frame, but the frame was not yet submitted to the
1131 : TX ring. Return the tx buffer to the free list. */
1132 :
1133 27 : if( ctx->tx_op.frame ) {
1134 0 : *charge_busy = 1;
1135 0 : fd_net_free_ring_t * free = &ctx->free_tx;
1136 0 : ulong alloc_seq = free->prod;
1137 0 : free->queue[ alloc_seq % free->depth ] = (ulong)ctx->tx_op.frame;
1138 0 : free->prod = fd_seq_inc( alloc_seq, 1UL );
1139 0 : ctx->tx_op.frame = NULL;
1140 0 : }
1141 :
1142 : /* Check if new packets are available or if TX frames are free again
1143 : (Round-robin through sockets) */
1144 :
1145 27 : uint rr_idx = ctx->rr_idx;
1146 27 : fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ];
1147 :
1148 27 : net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy );
1149 :
1150 : /* Fire RX event if we have RX desc avail */
1151 27 : if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) {
1152 27 : *charge_busy = 1;
1153 27 : net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons );
1154 27 : } else {
1155 0 : net_rx_wakeup( ctx, rr_xsk, charge_busy );
1156 0 : ctx->rr_idx++;
1157 0 : ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx );
1158 0 : }
1159 :
1160 : /* Fire comp event if we have comp desc avail */
1161 27 : if( !fd_xdp_ring_empty( &rr_xsk->ring_cr, FD_XDP_RING_ROLE_CONS ) ) {
1162 0 : *charge_busy = 1;
1163 0 : net_comp_event( ctx, rr_xsk, rr_xsk->ring_cr.cached_cons );
1164 0 : }
1165 27 : }
1166 :
1167 : /* net_xsk_bootstrap assigns UMEM frames to the FILL ring. */
1168 :
1169 : static ulong
1170 : net_xsk_bootstrap( fd_net_ctx_t * ctx,
1171 : uint xsk_idx,
1172 0 : ulong frame_off ) {
1173 0 : fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ];
1174 :
1175 0 : ulong const frame_sz = FD_NET_MTU;
1176 0 : ulong const fr_depth = ctx->xsk[ xsk_idx ].ring_fr.depth/2UL;
1177 :
1178 0 : fd_xdp_ring_t * fill = &xsk->ring_fr;
1179 0 : uint fill_prod = fill->cached_prod;
1180 0 : for( ulong j=0UL; j<fr_depth; j++ ) {
1181 0 : fill->frame_ring[ j ] = frame_off;
1182 0 : frame_off += frame_sz;
1183 0 : }
1184 0 : FD_VOLATILE( *fill->prod ) = fill->cached_prod = fill_prod + (uint)fr_depth;
1185 :
1186 0 : return frame_off;
1187 0 : }
1188 :
1189 : /* FIXME source MAC address from netlnk tile instead */
1190 :
1191 : static void
1192 : interface_addrs( const char * interface,
1193 : uchar * mac,
1194 0 : uint * ip4_addr ) {
1195 0 : int fd = socket( AF_INET, SOCK_DGRAM, 0 );
1196 0 : struct ifreq ifr;
1197 0 : ifr.ifr_addr.sa_family = AF_INET;
1198 :
1199 0 : strncpy( ifr.ifr_name, interface, IFNAMSIZ );
1200 0 : if( FD_UNLIKELY( ioctl( fd, SIOCGIFHWADDR, &ifr ) ) )
1201 0 : FD_LOG_ERR(( "could not get MAC address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) ));
1202 0 : fd_memcpy( mac, ifr.ifr_hwaddr.sa_data, 6 );
1203 :
1204 0 : if( FD_UNLIKELY( ioctl( fd, SIOCGIFADDR, &ifr ) ) )
1205 0 : FD_LOG_ERR(( "could not get IP address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) ));
1206 0 : *ip4_addr = ((struct sockaddr_in *)fd_type_pun( &ifr.ifr_addr ))->sin_addr.s_addr;
1207 :
1208 0 : if( FD_UNLIKELY( close(fd) ) )
1209 0 : FD_LOG_ERR(( "could not close socket (%i-%s)", errno, fd_io_strerror( errno ) ));
1210 0 : }
1211 :
1212 : /* privileged_init does the following initialization steps:
1213 :
1214 : - Create an AF_XDP socket
1215 : - Map XDP metadata rings
1216 : - Register UMEM data region with socket
1217 : - Insert AF_XDP socket into xsk_map
1218 :
1219 : Net tile 0 also runs fd_xdp_install and repeats the above step for
1220 : the loopback device. (Unless the main interface is already loopback)
1221 :
1222 : Kernel object references:
1223 :
1224 : BPF_LINK file descriptor
1225 : |
1226 : +-> XDP program installation on NIC
1227 : | |
1228 : | +-> XDP program <-- BPF_PROG file descriptor (prog_fd)
1229 : |
1230 : +-> XSKMAP object <-- BPF_MAP file descriptor (xsk_map) */
1231 :
1232 : FD_FN_UNUSED static void
1233 : privileged_init( fd_topo_t const * topo,
1234 0 : fd_topo_tile_t const * tile ) {
1235 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1236 :
1237 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1238 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
1239 0 : ulong * free_tx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );;
1240 :
1241 0 : fd_memset( ctx, 0, sizeof(fd_net_ctx_t) );
1242 :
1243 0 : interface_addrs( tile->xdp.if_virt, ctx->src_mac_addr, &ctx->default_address );
1244 0 : ctx->if_virt = if_nametoindex( tile->xdp.if_virt ); FD_TEST( ctx->if_virt );
1245 :
1246 : /* Load up dcache containing UMEM */
1247 :
1248 0 : void * const dcache_mem = fd_topo_obj_laddr( topo, tile->net.umem_dcache_obj_id );
1249 0 : void * const umem = fd_dcache_join( dcache_mem );
1250 0 : ulong const umem_dcache_data_sz = fd_dcache_data_sz( umem );
1251 0 : ulong const umem_frame_sz = 2048UL;
1252 0 : ulong const umem_sz = fd_ulong_align_dn( umem_dcache_data_sz, umem_frame_sz );
1253 :
1254 : /* Derive chunk bounds */
1255 :
1256 0 : void * const umem_base = fd_wksp_containing( dcache_mem );
1257 0 : ulong const umem_chunk0 = ( (ulong)umem - (ulong)umem_base )>>FD_CHUNK_LG_SZ;
1258 0 : ulong const umem_wmark = umem_chunk0 + ( ( umem_sz-umem_frame_sz )>>FD_CHUNK_LG_SZ );
1259 :
1260 0 : if( FD_UNLIKELY( umem_chunk0>UINT_MAX || umem_wmark>UINT_MAX || umem_chunk0>umem_wmark ) ) {
1261 0 : FD_LOG_ERR(( "Calculated invalid UMEM bounds [%lu,%lu]", umem_chunk0, umem_wmark ));
1262 0 : }
1263 :
1264 0 : if( FD_UNLIKELY( !umem_base ) ) FD_LOG_ERR(( "UMEM dcache is not in a workspace" ));
1265 :
1266 0 : ctx->umem = umem;
1267 0 : ctx->umem_sz = umem_sz;
1268 0 : ctx->umem_chunk0 = (uint)umem_chunk0;
1269 0 : ctx->umem_wmark = (uint)umem_wmark;
1270 :
1271 0 : ctx->free_tx.queue = free_tx;
1272 0 : ctx->free_tx.depth = tile->xdp.xdp_tx_queue_size;
1273 :
1274 : /* Create and install XSKs */
1275 :
1276 0 : uint if_phys_if_idx = if_nametoindex( tile->xdp.if_phys );
1277 0 : if( FD_UNLIKELY( !if_phys_if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed", tile->xdp.if_phys ));
1278 :
1279 0 : fd_xsk_params_t params0 = {
1280 0 : .if_idx = if_phys_if_idx,
1281 0 : .if_queue_id = tile->xdp.if_queue,
1282 :
1283 : /* Some kernels produce EOPNOTSUP errors on sendto calls when
1284 : starting up without either XDP_ZEROCOPY or XDP_COPY
1285 : (e.g. 5.14.0-503.23.1.el9_5 with i40e) */
1286 0 : .bind_flags = tile->xdp.zero_copy ? XDP_ZEROCOPY : XDP_COPY,
1287 :
1288 0 : .fr_depth = tile->xdp.xdp_rx_queue_size*2,
1289 0 : .rx_depth = tile->xdp.xdp_rx_queue_size,
1290 0 : .cr_depth = tile->xdp.xdp_tx_queue_size,
1291 0 : .tx_depth = tile->xdp.xdp_tx_queue_size,
1292 :
1293 0 : .umem_addr = umem,
1294 0 : .frame_sz = umem_frame_sz,
1295 0 : .umem_sz = umem_sz,
1296 :
1297 0 : .core_dump = tile->xdp.xsk_core_dump,
1298 0 : };
1299 :
1300 : /* Re-derive XDP file descriptors */
1301 :
1302 0 : fd_xdp_fds_t xdp_fds[ FD_TOPO_XDP_FDS_MAX ];
1303 0 : uint xdp_fds_cnt = FD_TOPO_XDP_FDS_MAX;
1304 0 : fd_topo_install_xdp( topo, xdp_fds, &xdp_fds_cnt, 0U, /* dry_run */ 1 );
1305 :
1306 0 : int xsk_map_fd = -1;
1307 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
1308 0 : if( xdp_fds[ i ].if_idx==if_phys_if_idx ) {
1309 0 : xsk_map_fd = xdp_fds[ i ].xsk_map_fd;
1310 0 : ctx->prog_link_fds[ 0 ] = xdp_fds[ i ].prog_link_fd;
1311 0 : xdp_fds[ i ].prog_link_fd = -1; /* mark as used */
1312 0 : break;
1313 0 : }
1314 0 : }
1315 0 : FD_TEST( xsk_map_fd>=0 );
1316 :
1317 : /* Init XSK */
1318 0 : if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 0 ], ¶ms0 ) ) ) FD_LOG_ERR(( "failed to bind xsk for net tile %lu", tile->kind_id ));
1319 0 : if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 0 ], xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate xsk for net tile %lu", tile->kind_id ));
1320 0 : ctx->xsk_cnt = 1;
1321 :
1322 : /* Networking tile at index 0 also binds to loopback (only queue 0 available on lo) */
1323 :
1324 0 : if( FD_UNLIKELY( strcmp( tile->xdp.if_virt, "lo" ) && !tile->kind_id ) ) {
1325 0 : ctx->xsk_cnt = 2;
1326 :
1327 0 : uint lo_idx = if_nametoindex( "lo" );
1328 0 : if( FD_UNLIKELY( !lo_idx ) ) FD_LOG_ERR(( "if_nametoindex(lo) failed" ));
1329 :
1330 0 : int lo_xsk_map_fd = -1;
1331 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
1332 0 : if( xdp_fds[ i ].if_idx==lo_idx ) {
1333 0 : lo_xsk_map_fd = xdp_fds[ i ].xsk_map_fd;
1334 0 : ctx->prog_link_fds[ 1 ] = xdp_fds[ i ].prog_link_fd;
1335 0 : xdp_fds[ i ].prog_link_fd = -1; /* mark as used */
1336 0 : break;
1337 0 : }
1338 0 : }
1339 0 : FD_TEST( lo_xsk_map_fd>=0 );
1340 :
1341 : /* init xsk 1 */
1342 0 : fd_xsk_params_t params1 = params0;
1343 0 : params1.if_idx = lo_idx; /* probably always 1 */
1344 0 : params1.if_queue_id = 0;
1345 0 : params1.bind_flags = 0;
1346 0 : if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 1 ], ¶ms1 ) ) ) FD_LOG_ERR(( "failed to bind lo_xsk" ));
1347 0 : if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 1 ], lo_xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate lo_xsk" ));
1348 0 : }
1349 :
1350 : /* Close unused XDP fds */
1351 :
1352 0 : if( FD_UNLIKELY( fd_sandbox_gettid()==fd_sandbox_getpid() ) ) {
1353 : /* Kind of gross.. in single threaded mode we don't want to close the xsk_map_fd
1354 : since it's shared with other net tiles. Just check for that by seeing if we
1355 : are the only thread in the process. */
1356 0 : for( uint i=0U; i<xdp_fds_cnt; i++ ) {
1357 0 : if( -1==close( xdp_fds[ i ].xsk_map_fd ) ) {
1358 0 : FD_LOG_ERR(( "close(%d) failed (%d-%s)", xsk_map_fd, errno, fd_io_strerror( errno ) ));
1359 0 : }
1360 0 : if( xdp_fds[ i ].prog_link_fd>0 &&
1361 0 : -1==close( xdp_fds[ i ].prog_link_fd ) ) {
1362 0 : FD_LOG_ERR(( "close(%d) failed (%d-%s)", xsk_map_fd, errno, fd_io_strerror( errno ) ));
1363 0 : }
1364 0 : }
1365 0 : }
1366 :
1367 0 : double tick_per_ns = fd_tempo_tick_per_ns( NULL );
1368 0 : ctx->xdp_stats_interval_ticks = (long)( FD_XDP_STATS_INTERVAL_NS * tick_per_ns );
1369 :
1370 0 : ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, scratch_align() );
1371 0 : if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
1372 0 : FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) ));
1373 0 : }
1374 :
1375 : static void
1376 : init_device_table( fd_net_ctx_t * ctx,
1377 : void * netdev_tbl_shm,
1378 3 : void * netdev_tbl_local ) {
1379 3 : FD_TEST( fd_netdev_tbl_join( &ctx->netdev_shared, netdev_tbl_shm ) );
1380 3 : FD_TEST( fd_netdev_tbl_new( netdev_tbl_local, NETDEV_MAX, BOND_MASTER_MAX ) );
1381 3 : FD_TEST( fd_netdev_tbl_join( &ctx->netdev_tbl, netdev_tbl_local ) );
1382 3 : }
1383 :
1384 : FD_FN_UNUSED static void
1385 : unprivileged_init( fd_topo_t const * topo,
1386 0 : fd_topo_tile_t const * tile ) {
1387 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1388 :
1389 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1390 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
1391 0 : FD_TEST( ctx->xsk_cnt!=0 );
1392 0 : FD_TEST( ctx->free_tx.queue!=NULL );
1393 0 : (void)FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );
1394 0 : void * netdev_tbl_local = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ) );
1395 :
1396 0 : ctx->net_tile_id = (uint)tile->kind_id;
1397 0 : ctx->net_tile_cnt = (uint)fd_topo_tile_name_cnt( topo, tile->name );
1398 :
1399 0 : ctx->bind_address = tile->net.bind_address;
1400 0 : ctx->shred_listen_port = tile->net.shred_listen_port;
1401 0 : ctx->quic_transaction_listen_port = tile->net.quic_transaction_listen_port;
1402 0 : ctx->legacy_transaction_listen_port = tile->net.legacy_transaction_listen_port;
1403 0 : ctx->gossip_listen_port = tile->net.gossip_listen_port;
1404 0 : ctx->repair_intake_listen_port = tile->net.repair_intake_listen_port;
1405 0 : ctx->repair_serve_listen_port = tile->net.repair_serve_listen_port;
1406 0 : ctx->txsend_src_port = tile->net.txsend_src_port;
1407 :
1408 : /* Put a bound on chunks we read from the input, to make sure they
1409 : are within in the data region of the workspace. */
1410 :
1411 0 : if( FD_UNLIKELY( !tile->in_cnt ) ) FD_LOG_ERR(( "net tile in link cnt is zero" ));
1412 0 : if( FD_UNLIKELY( tile->in_cnt>MAX_NET_INS ) ) FD_LOG_ERR(( "net tile in link cnt %lu exceeds MAX_NET_INS %lu", tile->in_cnt, MAX_NET_INS ));
1413 0 : FD_TEST( tile->in_cnt<=32 );
1414 0 : for( ulong i=0UL; i<tile->in_cnt; i++ ) {
1415 0 : fd_topo_link_t const * link = &topo->links[ tile->in_link_id[ i ] ];
1416 0 : if( FD_UNLIKELY( link->mtu!=FD_NET_MTU ) ) FD_LOG_ERR(( "net tile in link %s does not have a normal MTU", link->name ));
1417 :
1418 0 : ctx->in[ i ].mem = topo->workspaces[ topo->objs[ link->dcache_obj_id ].wksp_id ].wksp;
1419 0 : ctx->in[ i ].chunk0 = fd_dcache_compact_chunk0( ctx->in[ i ].mem, link->dcache );
1420 0 : ctx->in[ i ].wmark = fd_dcache_compact_wmark( ctx->in[ i ].mem, link->dcache, link->mtu );
1421 0 : }
1422 :
1423 0 : for( ulong i = 0; i < tile->out_cnt; i++ ) {
1424 0 : fd_topo_link_t const * out_link = &topo->links[ tile->out_link_id[ i ] ];
1425 0 : if( strcmp( out_link->name, "net_quic" ) == 0 ) {
1426 0 : fd_topo_link_t const * quic_out = out_link;
1427 0 : ctx->quic_out->mcache = quic_out->mcache;
1428 0 : ctx->quic_out->sync = fd_mcache_seq_laddr( ctx->quic_out->mcache );
1429 0 : ctx->quic_out->depth = fd_mcache_depth( ctx->quic_out->mcache );
1430 0 : ctx->quic_out->seq = fd_mcache_seq_query( ctx->quic_out->sync );
1431 0 : } else if( strcmp( out_link->name, "net_shred" ) == 0 ) {
1432 0 : fd_topo_link_t const * shred_out = out_link;
1433 0 : ctx->shred_out->mcache = shred_out->mcache;
1434 0 : ctx->shred_out->sync = fd_mcache_seq_laddr( ctx->shred_out->mcache );
1435 0 : ctx->shred_out->depth = fd_mcache_depth( ctx->shred_out->mcache );
1436 0 : ctx->shred_out->seq = fd_mcache_seq_query( ctx->shred_out->sync );
1437 0 : } else if( strcmp( out_link->name, "net_gossvf" ) == 0 ) {
1438 0 : fd_topo_link_t const * gossip_out = out_link;
1439 0 : ctx->gossvf_out->mcache = gossip_out->mcache;
1440 0 : ctx->gossvf_out->sync = fd_mcache_seq_laddr( ctx->gossvf_out->mcache );
1441 0 : ctx->gossvf_out->depth = fd_mcache_depth( ctx->gossvf_out->mcache );
1442 0 : ctx->gossvf_out->seq = fd_mcache_seq_query( ctx->gossvf_out->sync );
1443 0 : } else if( strcmp( out_link->name, "net_repair" ) == 0 ) {
1444 0 : fd_topo_link_t const * repair_out = out_link;
1445 0 : ctx->repair_out->mcache = repair_out->mcache;
1446 0 : ctx->repair_out->sync = fd_mcache_seq_laddr( ctx->repair_out->mcache );
1447 0 : ctx->repair_out->depth = fd_mcache_depth( ctx->repair_out->mcache );
1448 0 : ctx->repair_out->seq = fd_mcache_seq_query( ctx->repair_out->sync );
1449 0 : } else if( strcmp( out_link->name, "net_netlnk" ) == 0 ) {
1450 0 : fd_topo_link_t const * netlink_out = out_link;
1451 0 : ctx->neigh4_solicit->mcache = netlink_out->mcache;
1452 0 : ctx->neigh4_solicit->depth = fd_mcache_depth( ctx->neigh4_solicit->mcache );
1453 0 : ctx->neigh4_solicit->seq = fd_mcache_seq_query( fd_mcache_seq_laddr( ctx->neigh4_solicit->mcache ) );
1454 0 : } else if( strcmp( out_link->name, "net_txsend" ) == 0 ) {
1455 0 : fd_topo_link_t const * txsend_out = out_link;
1456 0 : ctx->txsend_out->mcache = txsend_out->mcache;
1457 0 : ctx->txsend_out->sync = fd_mcache_seq_laddr( ctx->txsend_out->mcache );
1458 0 : ctx->txsend_out->depth = fd_mcache_depth( ctx->txsend_out->mcache );
1459 0 : ctx->txsend_out->seq = fd_mcache_seq_query( ctx->txsend_out->sync );
1460 0 : } else {
1461 0 : FD_LOG_ERR(( "unrecognized out link `%s`", out_link->name ));
1462 0 : }
1463 0 : }
1464 :
1465 : /* Check if any of the tiles we set a listen port for do not have an outlink. */
1466 0 : if( FD_UNLIKELY( ctx->shred_listen_port!=0 && ctx->shred_out->mcache==NULL ) ) {
1467 0 : FD_LOG_ERR(( "shred listen port set but no out link was found" ));
1468 0 : } else if( FD_UNLIKELY( ctx->quic_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) {
1469 0 : FD_LOG_ERR(( "quic transaction listen port set but no out link was found" ));
1470 0 : } else if( FD_UNLIKELY( ctx->legacy_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) {
1471 0 : FD_LOG_ERR(( "legacy transaction listen port set but no out link was found" ));
1472 0 : } else if( FD_UNLIKELY( ctx->gossip_listen_port!=0 && ctx->gossvf_out->mcache==NULL ) ) {
1473 0 : FD_LOG_ERR(( "gossip listen port set but no out link was found" ));
1474 0 : } else if( FD_UNLIKELY( ctx->repair_intake_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) {
1475 0 : FD_LOG_ERR(( "repair intake port set but no out link was found" ));
1476 0 : } else if( FD_UNLIKELY( ctx->repair_serve_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) {
1477 0 : FD_LOG_ERR(( "repair serve listen port set but no out link was found" ));
1478 0 : } else if( FD_UNLIKELY( ctx->neigh4_solicit->mcache==NULL ) ) {
1479 0 : FD_LOG_ERR(( "netlink request link not found" ));
1480 0 : } else if( FD_UNLIKELY( ctx->txsend_src_port!=0 && ctx->txsend_out->mcache==NULL ) ) {
1481 0 : FD_LOG_ERR(( "txsend listen port set but no out link was found" ));
1482 0 : }
1483 :
1484 0 : for( uint j=0U; j<2U; j++ ) {
1485 0 : ctx->tx_flusher[ j ].pending_wmark = (ulong)( (double)tile->xdp.xdp_tx_queue_size * 0.7 );
1486 0 : ctx->tx_flusher[ j ].tail_flush_backoff = (long)( (double)tile->xdp.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) );
1487 0 : ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX;
1488 0 : }
1489 :
1490 : /* Join netbase objects */
1491 0 : FD_TEST( fd_fib4_join( ctx->fib_local, fd_topo_obj_laddr( topo, tile->xdp.fib4_local_obj_id ) ) );
1492 0 : FD_TEST( fd_fib4_join( ctx->fib_main, fd_topo_obj_laddr( topo, tile->xdp.fib4_main_obj_id ) ) );
1493 :
1494 0 : ulong neigh4_obj_id = tile->xdp.neigh4_obj_id;
1495 0 : ulong ele_max = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.ele_max", neigh4_obj_id );
1496 0 : ulong probe_max = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.probe_max", neigh4_obj_id );
1497 0 : ulong seed = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.seed", neigh4_obj_id );
1498 0 : if( FD_UNLIKELY( (ele_max==ULONG_MAX) | (probe_max==ULONG_MAX) | (seed==ULONG_MAX) ) )
1499 0 : FD_LOG_ERR(( "neigh4 hmap properties not set" ));
1500 0 : if( FD_UNLIKELY( !fd_neigh4_hmap_join(
1501 0 : ctx->neigh4,
1502 0 : fd_topo_obj_laddr( topo, neigh4_obj_id ),
1503 0 : ele_max,
1504 0 : probe_max,
1505 0 : seed ) ) ) {
1506 0 : FD_LOG_ERR(( "fd_neigh4_hmap_join failed" ));
1507 0 : }
1508 :
1509 0 : init_device_table( ctx, fd_topo_obj_laddr( topo, tile->xdp.netdev_tbl_obj_id ), netdev_tbl_local );
1510 :
1511 : /* Initialize TX free ring */
1512 :
1513 0 : ulong const frame_sz = 2048UL;
1514 0 : ulong frame_off = 0UL;
1515 0 : ulong const tx_depth = ctx->free_tx.depth;
1516 0 : for( ulong j=0; j<tx_depth; j++ ) {
1517 0 : ctx->free_tx.queue[ j ] = (ulong)ctx->umem + frame_off;
1518 0 : frame_off += frame_sz;
1519 0 : }
1520 0 : ctx->free_tx.prod = tx_depth;
1521 :
1522 : /* Initialize RX mcache chunks */
1523 :
1524 0 : for( ulong i=0UL; i<(tile->out_cnt); i++ ) {
1525 0 : fd_topo_link_t const * out_link = &topo->links[ tile->out_link_id[ i ] ];
1526 0 : fd_frag_meta_t * mcache = out_link->mcache;
1527 0 : for( ulong j=0UL; j<fd_mcache_depth( mcache ); j++ ) {
1528 0 : mcache[ j ].chunk = (uint)( ctx->umem_chunk0 + (frame_off>>FD_CHUNK_LG_SZ) );
1529 0 : frame_off += frame_sz;
1530 0 : }
1531 0 : }
1532 :
1533 : /* Initialize FILL ring */
1534 :
1535 0 : int _charge_busy = 0;
1536 0 : for( uint j=0U; j<ctx->xsk_cnt; j++ ) {
1537 0 : frame_off = net_xsk_bootstrap( ctx, j, frame_off );
1538 0 : net_rx_wakeup( ctx, &ctx->xsk[ j ], &_charge_busy );
1539 0 : net_tx_wakeup( ctx, &ctx->xsk[ j ], &_charge_busy );
1540 0 : }
1541 :
1542 0 : if( FD_UNLIKELY( frame_off > ctx->umem_sz ) ) {
1543 0 : FD_LOG_ERR(( "UMEM is too small" ));
1544 0 : }
1545 0 : }
1546 :
1547 : FD_FN_UNUSED static ulong
1548 : populate_allowed_seccomp( fd_topo_t const * topo,
1549 : fd_topo_tile_t const * tile,
1550 : ulong out_cnt,
1551 0 : struct sock_filter * out ) {
1552 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1553 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1554 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) );
1555 :
1556 : /* A bit of a hack, if there is no loopback XSK for this tile, we still need to pass
1557 : two "allow" FD arguments to the net policy, so we just make them both the same. */
1558 0 : int allow_fd2 = ctx->xsk_cnt>1UL ? ctx->xsk[ 1 ].xsk_fd : ctx->xsk[ 0 ].xsk_fd;
1559 0 : FD_TEST( ctx->xsk[ 0 ].xsk_fd >= 0 && allow_fd2 >= 0 );
1560 :
1561 0 : populate_sock_filter_policy_fd_xdp_tile( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->xsk[ 0 ].xsk_fd, (uint)allow_fd2 );
1562 0 : return sock_filter_policy_fd_xdp_tile_instr_cnt;
1563 0 : }
1564 :
1565 : FD_FN_UNUSED static ulong
1566 : populate_allowed_fds( fd_topo_t const * topo,
1567 : fd_topo_tile_t const * tile,
1568 : ulong out_fds_cnt,
1569 0 : int * out_fds ) {
1570 0 : void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
1571 0 : FD_SCRATCH_ALLOC_INIT( l, scratch );
1572 0 : fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) );
1573 :
1574 0 : if( FD_UNLIKELY( out_fds_cnt<6UL ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
1575 :
1576 0 : ulong out_cnt = 0UL;
1577 :
1578 0 : out_fds[ out_cnt++ ] = 2; /* stderr */
1579 0 : if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
1580 0 : out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
1581 :
1582 0 : out_fds[ out_cnt++ ] = ctx->xsk[ 0 ].xsk_fd;
1583 0 : out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 0 ];
1584 0 : if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->xsk[ 1 ].xsk_fd;
1585 0 : if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 1 ];
1586 0 : return out_cnt;
1587 0 : }
1588 :
1589 0 : #define STEM_BURST (1UL)
1590 0 : #define STEM_LAZY ((ulong)30e3) /* 30 us */
1591 :
1592 0 : #define STEM_CALLBACK_CONTEXT_TYPE fd_net_ctx_t
1593 0 : #define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_net_ctx_t)
1594 :
1595 0 : #define STEM_CALLBACK_METRICS_WRITE metrics_write
1596 0 : #define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
1597 0 : #define STEM_CALLBACK_BEFORE_CREDIT before_credit
1598 0 : #define STEM_CALLBACK_BEFORE_FRAG before_frag
1599 0 : #define STEM_CALLBACK_DURING_FRAG during_frag
1600 0 : #define STEM_CALLBACK_AFTER_FRAG after_frag
1601 :
1602 : #include "../../stem/fd_stem.c"
1603 :
1604 : #ifndef FD_TILE_TEST
1605 : fd_topo_run_tile_t fd_tile_net = {
1606 : .name = "net",
1607 : .populate_allowed_seccomp = populate_allowed_seccomp,
1608 : .populate_allowed_fds = populate_allowed_fds,
1609 : .scratch_align = scratch_align,
1610 : .scratch_footprint = scratch_footprint,
1611 : .privileged_init = privileged_init,
1612 : .unprivileged_init = unprivileged_init,
1613 : .run = stem_run,
1614 : };
1615 : #endif
|