LCOV - code coverage report
Current view: top level - disco/net/xdp - fd_xdp_tile.c
Test: cov.lcov            Lines:     340 / 909  (37.4 %)
Date: 2025-12-07 04:58:33 Functions:  17 / 90   (18.9 %)

          Line data    Source code
       1             : /* The xdp tile translates between AF_XDP and fd_tango
       2             :    traffic.  It is responsible for setting up the XDP and
       3             :    XSK socket configuration. */
       4             : 
       5             : #include "../fd_net_tile.h"
       6             : 
       7             : #include <errno.h>
       8             : #include <fcntl.h>
       9             : #include <net/if.h>
      10             : #include <netinet/in.h>
      11             : #include <sys/socket.h> /* MSG_DONTWAIT needed before importing the net seccomp filter */
      12             : #include <linux/if_xdp.h>
      13             : 
      14             : #include "../fd_net_common.h"
      15             : #include "../../metrics/fd_metrics.h"
      16             : #include "../../netlink/fd_netlink_tile.h" /* neigh4_solicit */
      17             : #include "../../topo/fd_topo.h"
      18             : 
      19             : #include "../../../waltz/ip/fd_fib4.h"
      20             : #include "../../../waltz/neigh/fd_neigh4_map.h"
      21             : #include "../../../waltz/mib/fd_netdev_tbl.h"
      22             : #include "../../../waltz/mib/fd_dbl_buf.h"
      23             : #include "../../../waltz/xdp/fd_xdp_redirect_user.h" /* fd_xsk_activate */
      24             : #include "../../../waltz/xdp/fd_xsk.h"
      25             : #include "../../../util/log/fd_dtrace.h"
      26             : #include "../../../util/net/fd_eth.h"
      27             : #include "../../../util/net/fd_ip4.h"
      28             : #include "../../../util/net/fd_gre.h"
      29             : #include "../../../util/pod/fd_pod_format.h"
      30             : 
      31             : #include <unistd.h>
      32             : #include <linux/if.h> /* struct ifreq */
      33             : #include <sys/ioctl.h>
      34             : #include <linux/if_arp.h>
      35             : 
      36             : #include "generated/fd_xdp_tile_seccomp.h"
      37             : 
      38             : /* MAX_NET_INS controls the max number of TX links that a net tile can
      39             :    serve. */
      40             : 
      41             : #define MAX_NET_INS (32UL)
      42             : 
      43             : /* FD_XDP_STATS_INTERVAL_NS controls the XDP stats refresh interval.
      44             :    This should be lower than the interval at which the metrics tile
      45             :    collects metrics. */
      46             : 
      47           0 : #define FD_XDP_STATS_INTERVAL_NS (11e6) /* 11ms */
      48             : 
      49             : /* XSK_IDX_{MAIN,LO} are the hardcoded XSK indices in ctx->xsk[ ... ].
      50             :    Only net tile 0 has XSK_IDX_LO, all net tiles have XSK_IDX_MAIN. */
      51             : 
      52          30 : #define XSK_IDX_MAIN 0
      53          18 : #define XSK_IDX_LO   1
      54             : 
      55             : /* fd_net_in_ctx_t contains consumer information for an incoming tango
      56             :    link.  It is used as part of the TX path. */
      57             : 
      58             : typedef struct {
      59             :   fd_wksp_t * mem;
      60             :   ulong       chunk0;
      61             :   ulong       wmark;
      62             : } fd_net_in_ctx_t;
      63             : 
      64             : /* fd_net_out_ctx_t contains publisher information for a link to a
      65             :    downstream app tile.  It is used as part of the RX path. */
      66             : 
      67             : typedef struct {
      68             :   fd_frag_meta_t * mcache;
      69             :   ulong *          sync;
      70             :   ulong            depth;
      71             :   ulong            seq;
      72             : } fd_net_out_ctx_t;
      73             : 
      74             : /* fd_net_flusher_t controls the pacing of XDP sendto calls for flushing
      75             :    TX batches.  In the 'wakeup' XDP mode, no TX occurs unless the net
      76             :    tile wakes up the kernel periodically using the sendto() syscall.
      77             :    If sendto() is called too frequently, time is wasted on context
      78             :    switches.  If sendto() is called too rarely, packets are delayed
      79             :    or dropped.  sendto() calls make almost no guarantees about how
      80             :    many packets are sent out, nor do they indicate when the kernel
      81             :    finishes a wakeup call (wakeups are dispatched asynchronously).
      82             :    The net tile thus uses a myriad of flush triggers that were tested
      83             :    for best performance. */
      84             : 
      85             : struct fd_net_flusher {
      86             : 
      87             :   /* Packets that were enqueued after the last sendto() wakeup are
      88             :      considered "pending".  If there are more than pending_wmark packets
      89             :      pending, a wakeup is dispatched.  Thus, this dispatch trigger is
      90             :      proportional to packet rate, but does not trigger if I/O is infrequent. */
      91             :   ulong pending_cnt;
      92             :   ulong pending_wmark;
      93             : 
      94             :   /* Sometimes, packets are not flushed out even after a sendto()
      95             :      wakeup.  This can result in the tail of a burst getting delayed or
      96             :      overrun.  If more than tail_flush_backoff ticks pass since the last
      97             :      sendto() wakeup and there are still unacknowledged packets in the
      98             :      TX ring, another wakeup is issued. */
      99             :   long next_tail_flush_ticks;
     100             :   long tail_flush_backoff;
     101             : 
     102             : };
     103             : 
     104             : typedef struct fd_net_flusher fd_net_flusher_t;
     105             : 
     106             : FD_PROTOTYPES_BEGIN
     107             : 
     108             : /* fd_net_flusher_inc marks a new packet as enqueued. */
     109             : 
     110             : static inline void
     111             : fd_net_flusher_inc( fd_net_flusher_t * flusher,
     112          18 :                     long               now ) {
     113          18 :   flusher->pending_cnt++;
     114          18 :   long next_flush = now + flusher->tail_flush_backoff;
     115          18 :   flusher->next_tail_flush_ticks = fd_long_min( flusher->next_tail_flush_ticks, next_flush );
     116          18 : }
     117             : 
     118             : /* fd_net_flusher_check returns 1 if a sendto() wakeup should be issued
     119             :    immediately.  now is a recent fd_tickcount() value.
     120             :    If tx_ring_empty!=0 then the kernel is caught up with the net tile
     121             :    on the XDP TX ring.  (Otherwise, the kernel is behind the net tile.) */
     122             : 
     123             : static inline int
     124             : fd_net_flusher_check( fd_net_flusher_t * flusher,
     125             :                       long               now,
     126          24 :                       int                tx_ring_empty ) {
     127          24 :   int flush_level   = flusher->pending_cnt >= flusher->pending_wmark;
     128          24 :   int flush_timeout = now >= flusher->next_tail_flush_ticks;
     129          24 :   int flush         = flush_level || flush_timeout;
     130          24 :   if( !flush ) return 0;
     131          24 :   if( FD_UNLIKELY( tx_ring_empty ) ) {
     132             :     /* Flush requested but caught up */
     133           3 :     flusher->pending_cnt           = 0UL;
     134           3 :     flusher->next_tail_flush_ticks = LONG_MAX;
     135           3 :     return 0;
     136           3 :   }
     137          21 :   return 1;
     138          24 : }
     139             : 
     140             : /* fd_net_flusher_wakeup signals a sendto() wakeup was done.  now is a
     141             :    recent fd_tickcount() value. */
     142             : 
     143             : static inline void
     144             : fd_net_flusher_wakeup( fd_net_flusher_t * flusher,
     145          21 :                        long               now ) {
     146          21 :   flusher->pending_cnt           = 0UL;
     147          21 :   flusher->next_tail_flush_ticks = now + flusher->tail_flush_backoff;
     148          21 : }
     149             : 
     150             : FD_PROTOTYPES_END
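
/* A minimal sketch (not part of this file) of how the three flusher
   calls compose on the TX path: fd_net_flusher_inc after each packet
   is enqueued, then fd_net_flusher_check and fd_net_flusher_wakeup
   from the service loop.  example_tx_service, submit_one_tx_frame,
   tx_ring_is_empty, and do_sendto_wakeup are hypothetical helpers
   used here for illustration only. */

extern int  submit_one_tx_frame( void ); /* hypothetical: enqueue one packet, returns 1 on success */
extern int  tx_ring_is_empty   ( void ); /* hypothetical: 1 if the kernel consumed all TX descs */
extern void do_sendto_wakeup   ( void ); /* hypothetical: cf. net_tx_wakeup below */

static void
example_tx_service( fd_net_flusher_t * flusher ) {
  long now = fd_tickcount();
  if( submit_one_tx_frame() ) {
    /* Arms the level (pending_wmark) and timeout (tail_flush_backoff)
       triggers */
    fd_net_flusher_inc( flusher, now );
  }
  if( fd_net_flusher_check( flusher, now, tx_ring_is_empty() ) ) {
    do_sendto_wakeup();
    fd_net_flusher_wakeup( flusher, now );
  }
}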
     151             : 
     152             : /* fd_net_free_ring is a FIFO queue that stores pointers to free XDP TX
     153             :    frames. */
     154             : 
     155             : struct fd_net_free_ring {
     156             :   ulong   prod;
     157             :   ulong   cons;
     158             :   ulong   depth;
     159             :   ulong * queue;
     160             : };
     161             : typedef struct fd_net_free_ring fd_net_free_ring_t;
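
/* A minimal sketch (hypothetical helpers, not part of this file) of
   the free ring discipline: before_frag below pops a frame address
   for an outgoing packet, and the completion path pushes it back once
   the kernel is done with it.  prod/cons are free-running sequence
   numbers; every pushed frame was previously popped, so prod-cons can
   never exceed depth and push needs no fullness check. */

static inline int
example_free_ring_pop( fd_net_free_ring_t * ring,
                       ulong *              out_frame ) {
  if( ring->prod==ring->cons ) return 0; /* empty */
  *out_frame = ring->queue[ ring->cons % ring->depth ];
  ring->cons = fd_seq_inc( ring->cons, 1UL );
  return 1;
}

static inline void
example_free_ring_push( fd_net_free_ring_t * ring,
                        ulong                frame ) {
  ring->queue[ ring->prod % ring->depth ] = frame;
  ring->prod = fd_seq_inc( ring->prod, 1UL );
}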
     162             : 
     163             : typedef struct {
     164             :   /* An "XSK" is an AF_XDP socket */
     165             :   uint     xsk_cnt;
     166             :   fd_xsk_t xsk[ 2 ];
     167             :   int      prog_link_fds[ 2 ];
     168             :   uint     if_virt;
     169             : 
     170             :   /* UMEM frame region within dcache */
     171             :   void *   umem;    /* Start of UMEM */
     172             :   ulong    umem_sz; /* Size  of UMEM */
     173             : 
     174             :   /* UMEM chunk region within workspace */
     175             :   uint     umem_chunk0; /* Lowest allowed chunk number */
     176             :   uint     umem_wmark;  /* Highest allowed chunk number */
     177             : 
     178             :   /* All net tiles are subscribed to the same TX links.  (These are
     179             :      incoming links from app tiles asking the net tile to send out packets)
     180             :      The net tiles "take turns" doing TX jobs based on the L3+L4 dst hash.
     181             :      net_tile_id is the index of the current net tile, net_tile_cnt is the
     182             :      total number of net tiles. */
     183             :   uint net_tile_id;
     184             :   uint net_tile_cnt;
     185             : 
     186             :   /* Details pertaining to an inflight send op */
     187             :   struct {
     188             :     uint   xsk_idx;
     189             :     void * frame;
     190             :     uchar  mac_addrs[12];     /* First 12 bytes of Ethernet header */
     191             :     uint   src_ip;            /* src_ip in net order */
     192             : 
     193             :     uint   use_gre;           /* The tx packet will be GRE-encapsulated */
     194             :     uint   gre_outer_src_ip;  /* For GRE: Outer iphdr's src_ip in net order */
     195             :     uint   gre_outer_dst_ip;  /* For GRE: Outer iphdr's dst_ip in net order */
     196             :   } tx_op;
     197             : 
     198             :   /* Round-robin cycle of service operations */
     199             :   uint rr_idx;
     200             : 
     201             :   /* Ring tracking free packet buffers */
     202             :   fd_net_free_ring_t free_tx;
     203             : 
     204             :   uchar  src_mac_addr[6];
     205             :   uint   default_address;
     206             : 
     207             :   uint   bind_address;
     208             :   ushort shred_listen_port;
     209             :   ushort quic_transaction_listen_port;
     210             :   ushort legacy_transaction_listen_port;
     211             :   ushort gossip_listen_port;
     212             :   ushort repair_intake_listen_port;
     213             :   ushort repair_serve_listen_port;
     214             :   ushort send_src_port;
     215             : 
     216             :   ulong in_cnt;
     217             :   fd_net_in_ctx_t in[ MAX_NET_INS ];
     218             : 
     219             :   fd_net_out_ctx_t quic_out[1];
     220             :   fd_net_out_ctx_t shred_out[1];
     221             :   fd_net_out_ctx_t gossvf_out[1];
     222             :   fd_net_out_ctx_t repair_out[1];
     223             :   fd_net_out_ctx_t send_out[1];
     224             : 
     225             :   /* XDP stats refresh timer */
     226             :   long xdp_stats_interval_ticks;
     227             :   long next_xdp_stats_refresh;
     228             : 
     229             :   /* TX flush timers */
     230             :   fd_net_flusher_t tx_flusher[2]; /* one per XSK */
     231             : 
     232             :   /* Route and neighbor tables */
     233             :   fd_fib4_t fib_local[1];
     234             :   fd_fib4_t fib_main[1];
     235             :   fd_neigh4_hmap_t  neigh4[1];
     236             :   fd_netlink_neigh4_solicit_link_t neigh4_solicit[1];
     237             : 
     238             :   /* Netdev table */
     239             :   fd_dbl_buf_t *       netdev_dbl_buf;    /* remote copy of device table */
     240             :   uchar *              netdev_buf;        /* local copy of device table */
     241             :   ulong                netdev_buf_sz;
     242             :   fd_netdev_tbl_join_t netdev_tbl;        /* join to local copy of device table */
     243             :   int                  has_gre_interface; /* enable GRE support? */
     244             : 
     245             :   struct {
     246             :     ulong rx_pkt_cnt;
     247             :     ulong rx_bytes_total;
     248             :     ulong rx_undersz_cnt;
     249             :     ulong rx_fill_blocked_cnt;
     250             :     ulong rx_backp_cnt;
     251             :     long  rx_busy_cnt;
     252             :     long  rx_idle_cnt;
     253             : 
     254             :     ulong tx_submit_cnt;
     255             :     ulong tx_complete_cnt;
     256             :     ulong tx_bytes_total;
     257             :     ulong tx_route_fail_cnt;
     258             :     ulong tx_no_xdp_cnt;
     259             :     ulong tx_neigh_fail_cnt;
     260             :     ulong tx_full_fail_cnt;
     261             :     long  tx_busy_cnt;
     262             :     long  tx_idle_cnt;
     263             : 
     264             :     ulong xsk_tx_wakeup_cnt;
     265             :     ulong xsk_rx_wakeup_cnt;
     266             : 
     267             :     ulong rx_gre_cnt;
     268             :     ulong rx_gre_ignored_cnt;
     269             :     ulong rx_gre_inv_pkt_cnt;
     270             :     ulong tx_gre_cnt;
     271             :     ulong tx_gre_route_fail_cnt;
     272             :   } metrics;
     273             : } fd_net_ctx_t;
     274             : 
     275             : FD_FN_CONST static inline ulong
     276           9 : scratch_align( void ) {
     277           9 :   return 4096UL;
     278           9 : }
     279             : 
     280             : FD_FN_PURE static inline ulong
     281           3 : scratch_footprint( fd_topo_tile_t const * tile ) {
     282           3 :   ulong l = FD_LAYOUT_INIT;
     283           3 :   l = FD_LAYOUT_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t)                      );
     284           3 :   l = FD_LAYOUT_APPEND( l, alignof(ulong),        tile->xdp.free_ring_depth * sizeof(ulong) );
     285           3 :   l = FD_LAYOUT_APPEND( l, fd_netdev_tbl_align(), fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ) );
     286           3 :   return FD_LAYOUT_FINI( l, scratch_align() );
     287           3 : }
     288             : 
     289             : static void
     290           0 : metrics_write( fd_net_ctx_t * ctx ) {
     291           0 :   FD_MCNT_SET(   NET, RX_PKT_CNT,          ctx->metrics.rx_pkt_cnt          );
     292           0 :   FD_MCNT_SET(   NET, RX_BYTES_TOTAL,      ctx->metrics.rx_bytes_total      );
     293           0 :   FD_MCNT_SET(   NET, RX_UNDERSZ_CNT,      ctx->metrics.rx_undersz_cnt      );
     294           0 :   FD_MCNT_SET(   NET, RX_FILL_BLOCKED_CNT, ctx->metrics.rx_fill_blocked_cnt );
     295           0 :   FD_MCNT_SET(   NET, RX_BACKPRESSURE_CNT, ctx->metrics.rx_backp_cnt        );
     296           0 :   FD_MGAUGE_SET( NET, RX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.rx_busy_cnt, 0L ) );
     297           0 :   FD_MGAUGE_SET( NET, RX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.rx_idle_cnt, 0L ) );
     298           0 :   FD_MGAUGE_SET( NET, TX_BUSY_CNT, (ulong)fd_long_max( ctx->metrics.tx_busy_cnt, 0L ) );
     299           0 :   FD_MGAUGE_SET( NET, TX_IDLE_CNT, (ulong)fd_long_max( ctx->metrics.tx_idle_cnt, 0L ) );
     300             : 
     301           0 :   FD_MCNT_SET( NET, TX_SUBMIT_CNT,        ctx->metrics.tx_submit_cnt     );
     302           0 :   FD_MCNT_SET( NET, TX_COMPLETE_CNT,      ctx->metrics.tx_complete_cnt   );
     303           0 :   FD_MCNT_SET( NET, TX_BYTES_TOTAL,       ctx->metrics.tx_bytes_total    );
     304           0 :   FD_MCNT_SET( NET, TX_ROUTE_FAIL_CNT,    ctx->metrics.tx_route_fail_cnt );
     305           0 :   FD_MCNT_SET( NET, TX_NEIGHBOR_FAIL_CNT, ctx->metrics.tx_neigh_fail_cnt );
     306           0 :   FD_MCNT_SET( NET, TX_FULL_FAIL_CNT,     ctx->metrics.tx_full_fail_cnt  );
     307             : 
     308           0 :   FD_MCNT_SET( NET, XSK_TX_WAKEUP_CNT,    ctx->metrics.xsk_tx_wakeup_cnt    );
     309           0 :   FD_MCNT_SET( NET, XSK_RX_WAKEUP_CNT,    ctx->metrics.xsk_rx_wakeup_cnt    );
     310             : 
     311           0 :   FD_MCNT_SET( NET, RX_GRE_CNT,            ctx->metrics.rx_gre_cnt            );
     312           0 :   FD_MCNT_SET( NET, RX_GRE_INVALID_CNT,    ctx->metrics.rx_gre_inv_pkt_cnt    );
     313           0 :   FD_MCNT_SET( NET, RX_GRE_IGNORED_CNT,    ctx->metrics.rx_gre_ignored_cnt    );
     314           0 :   FD_MCNT_SET( NET, TX_GRE_CNT,            ctx->metrics.tx_gre_cnt            );
     315           0 :   FD_MCNT_SET( NET, TX_GRE_ROUTE_FAIL_CNT, ctx->metrics.tx_gre_route_fail_cnt );
     316           0 : }
     317             : 
     318             : struct xdp_statistics_v0 {
     319             :   __u64 rx_dropped; /* Dropped for other reasons */
     320             :   __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
     321             :   __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
     322             : };
     323             : 
     324             : struct xdp_statistics_v1 {
     325             :   __u64 rx_dropped; /* Dropped for other reasons */
     326             :   __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
     327             :   __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
     328             :   __u64 rx_ring_full; /* Dropped due to rx ring being full */
     329             :   __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */
     330             :   __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */
     331             : };
     332             : 
     333             : static void
     334           0 : poll_xdp_statistics( fd_net_ctx_t * ctx ) {
     335           0 :   struct xdp_statistics_v1 stats = {0};
     336           0 :   ulong xsk_cnt = ctx->xsk_cnt;
     337           0 :   for( ulong j=0UL; j<xsk_cnt; j++ ) {
     338           0 :     struct xdp_statistics_v1 sub_stats = {0}; /* zero v1-only fields in case the kernel returns a v0-sized result */
     339           0 :     uint optlen = (uint)sizeof(struct xdp_statistics_v1);
     340           0 :     if( FD_UNLIKELY( -1==getsockopt( ctx->xsk[ j ].xsk_fd, SOL_XDP, XDP_STATISTICS, &sub_stats, &optlen ) ) )
     341           0 :       FD_LOG_ERR(( "getsockopt(SOL_XDP, XDP_STATISTICS) failed: %s", strerror( errno ) ));
     342           0 :     if( FD_UNLIKELY( optlen!=sizeof(struct xdp_statistics_v0) &&
     343           0 :                      optlen!=sizeof(struct xdp_statistics_v1) ) ) {
     344           0 :       FD_LOG_ERR(( "getsockopt(SOL_XDP, XDP_STATISTICS) returned unexpected size %u", optlen ));
     345           0 :     }
     346           0 :     stats.rx_dropped               += sub_stats.rx_dropped;
     347           0 :     stats.rx_invalid_descs         += sub_stats.rx_invalid_descs;
     348           0 :     stats.tx_invalid_descs         += sub_stats.tx_invalid_descs;
     349           0 :     stats.rx_ring_full             += sub_stats.rx_ring_full;
     350           0 :     stats.rx_fill_ring_empty_descs += sub_stats.rx_fill_ring_empty_descs;
     351           0 :     stats.tx_ring_empty_descs      += sub_stats.tx_ring_empty_descs;
     352           0 :   }
     353             : 
     354           0 :   FD_MCNT_SET( NET, XDP_RX_DROPPED_OTHER,         stats.rx_dropped               );
     355           0 :   FD_MCNT_SET( NET, XDP_RX_INVALID_DESCS,         stats.rx_invalid_descs         );
     356           0 :   FD_MCNT_SET( NET, XDP_TX_INVALID_DESCS,         stats.tx_invalid_descs         );
     357           0 :   FD_MCNT_SET( NET, XDP_RX_RING_FULL,             stats.rx_ring_full             );
     358           0 :   FD_MCNT_SET( NET, XDP_RX_FILL_RING_EMPTY_DESCS, stats.rx_fill_ring_empty_descs );
     359           0 :   FD_MCNT_SET( NET, XDP_TX_RING_EMPTY_DESCS,      stats.tx_ring_empty_descs      );
     360           0 : }
     361             : 
     362             : /* net_is_fatal_xdp_error returns 1 if the given errno returned by an
     363             :    XDP API indicates a non-recoverable error code.  The net tile should
     364             :    crash if it sees such an error so the problem does not go undetected.
     365             :    Otherwise, returns 0. */
     366             : 
     367             : static int
     368           0 : net_is_fatal_xdp_error( int err ) {
     369           0 :   return err==ESOCKTNOSUPPORT || err==EOPNOTSUPP || err==EINVAL ||
     370           0 :          err==EPERM;
     371           0 : }
     372             : 
     373             : /* Load the netdev table into ctx->netdev_buf.  Create a join in ctx->netdev_tbl. */
     374             : 
     375             : static void
     376           0 : net_load_netdev_tbl( fd_net_ctx_t * ctx ) {
     377             :   /* Copy netdev table from netlink tile.  This could fail briefly
     378             :      during startup if the netlink tile is late to start up. */
     379           0 :   if( FD_UNLIKELY( !fd_dbl_buf_read( ctx->netdev_dbl_buf, ctx->netdev_buf_sz, ctx->netdev_buf, NULL ) ) ) return;
     380             : 
     381             :   /* Join local copy */
     382           0 :   if( FD_UNLIKELY( !fd_netdev_tbl_join( &ctx->netdev_tbl, ctx->netdev_buf ) ) ) FD_LOG_ERR(("netdev table join failed"));
     383           0 : }
     384             : 
     385             : /* Iterates the netdev table and returns 1 if a GRE interface exists, 0 otherwise.
     386             :    Only called in privileged_init and during_housekeeping */
     387             : 
     388             : static int
     389           3 : net_check_gre_interface_exists( fd_net_ctx_t * ctx ) {
     390           3 :   fd_netdev_t * dev_tbl = ctx->netdev_tbl.dev_tbl;
     391           3 :   ushort        dev_cnt = ctx->netdev_tbl.hdr->dev_cnt;
     392             : 
     393           3 :   for( ushort if_idx = 0; if_idx<dev_cnt; if_idx++ ) {
     394           0 :     if( dev_tbl[if_idx].dev_type==ARPHRD_IPGRE ) return 1;
     395           0 :   }
     396           3 :   return 0;
     397           3 : }
     398             : 
     399             : 
     400             : /* net_tx_ready returns 1 if the current XSK is ready to submit a TX send
     401             :    job.  If the XSK is blocked for sends, returns 0.  Reasons for block
     402             :    include:
     403             :    - No XSK TX buffer is available
     404             :    - XSK TX ring is full */
     405             : 
     406             : static int
     407             : net_tx_ready( fd_net_ctx_t * ctx,
     408          18 :               uint           xsk_idx ) {
     409          18 :   fd_xsk_t *           xsk     = &ctx->xsk[ xsk_idx ];
     410          18 :   fd_xdp_ring_t *      tx_ring = &xsk->ring_tx;
     411          18 :   fd_net_free_ring_t * free    = &ctx->free_tx;
     412          18 :   if( free->prod == free->cons ) return 0; /* drop */
     413             : 
     414             :   /* If potentially stale cached_cons says there is space,
     415             :      there is definitely space */
     416          18 :   if( tx_ring->cached_prod - tx_ring->cached_cons <  tx_ring->depth ) return 1;
     417             : 
     418             :   /* read the fseq, and update our cache */
     419          18 :   tx_ring->cached_cons = FD_VOLATILE_CONST( *tx_ring->cons );
     420          18 :   if( tx_ring->cached_prod - tx_ring->cached_cons >= tx_ring->depth ) return 0; /* drop */
     421          18 :   return 1;
     422          18 : }
     423             : 
     424             : /* net_rx_wakeup triggers xsk_recvmsg to run in the kernel.  Needs to be
     425             :    called periodically in order to receive packets. */
     426             : 
     427             : static void
     428             : net_rx_wakeup( fd_net_ctx_t * ctx,
     429             :                fd_xsk_t *     xsk,
     430           0 :                int *          charge_busy ) {
     431           0 :   FD_VOLATILE( *xsk->ring_rx.cons ) = xsk->ring_rx.cached_cons; /* write-back local copies to fseqs */
     432           0 :   FD_VOLATILE( *xsk->ring_fr.prod ) = xsk->ring_fr.cached_prod;
     433           0 :   if( !fd_xsk_rx_need_wakeup( xsk ) ) return;
     434           0 :   *charge_busy = 1;
     435           0 :   struct msghdr _ignored[ 1 ] = { 0 };
     436           0 :   if( FD_UNLIKELY( -1==recvmsg( xsk->xsk_fd, _ignored, MSG_DONTWAIT ) ) ) {
     437           0 :     if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) {
     438           0 :       FD_LOG_ERR(( "xsk recvmsg failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
     439           0 :     }
     440           0 :     if( FD_UNLIKELY( errno!=EAGAIN ) ) {
     441           0 :       long ts = fd_log_wallclock();
     442           0 :       if( ts > xsk->log_suppress_until_ns ) {
     443           0 :         FD_LOG_WARNING(( "xsk recvmsg failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
     444           0 :         xsk->log_suppress_until_ns = ts + (long)1e9;
     445           0 :       }
     446           0 :     }
     447           0 :   }
     448           0 :   ctx->metrics.xsk_rx_wakeup_cnt++;
     449           0 : }
     450             : 
     451             : /* net_tx_wakeup triggers xsk_sendmsg to run in the kernel.  Needs to be
     452             :    called periodically in order to transmit packets. Should only be called
     453             :    if there are unconsumed packets in Tx ring. */
     454             : 
     455             : static void
     456             : net_tx_wakeup( fd_net_ctx_t * ctx,
     457             :                fd_xsk_t *     xsk,
     458          21 :                int *          charge_busy ) {
     459          21 :   FD_VOLATILE( *xsk->ring_tx.prod ) = xsk->ring_tx.cached_prod; /* write-back local copies to fseqs */
     460          21 :   FD_VOLATILE( *xsk->ring_cr.cons ) = xsk->ring_cr.cached_cons;
     461          21 :   if( !fd_xsk_tx_need_wakeup( xsk ) ) return;
     462           0 :   *charge_busy = 1;
     463           0 :   if( FD_UNLIKELY( -1==sendto( xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) {
     464           0 :     if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) {
     465           0 :       FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
     466           0 :     }
     467           0 :     if( FD_UNLIKELY( errno!=EAGAIN ) ) {
     468           0 :       long ts = fd_log_wallclock();
     469           0 :       if( ts > xsk->log_suppress_until_ns ) {
     470           0 :         FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
     471           0 :         xsk->log_suppress_until_ns = ts + (long)1e9;
     472           0 :       }
     473           0 :     }
     474           0 :   }
     475           0 :   ctx->metrics.xsk_tx_wakeup_cnt++;
     476           0 : }
     477             : 
     478             : /* net_tx_periodic_wakeup does a timer based xsk_sendmsg wakeup. */
     479             : 
     480             : static inline int
     481             : net_tx_periodic_wakeup( fd_net_ctx_t * ctx,
     482             :                         uint           xsk_idx,
     483             :                         long           now,
     484          24 :                         int *          charge_busy ) {
     485          24 :   fd_xdp_ring_t * tx_ring        = &ctx->xsk[ xsk_idx ].ring_tx;
     486          24 :   uint            tx_prod        = tx_ring->cached_prod;
     487          24 :   uint            tx_cons        = tx_ring->cached_cons;
     488             : 
     489          24 :   int             tx_ring_empty  = tx_prod==tx_cons;
     490             :   /* If we already think tx_ring_empty, it's definitely empty.
     491             :      But if not, we should update our view of what the kernel has consumed. */
     492          24 :   if( FD_LIKELY( !tx_ring_empty ) ) {
     493          21 :     tx_cons       = tx_ring->cached_cons = FD_VOLATILE_CONST( *tx_ring->cons );
     494          21 :     tx_ring_empty = tx_prod==tx_cons;
     495          21 :   }
     496             : 
     497          24 :   if( fd_net_flusher_check( ctx->tx_flusher+xsk_idx, now, tx_ring_empty ) ) {
     498          21 :     net_tx_wakeup( ctx, &ctx->xsk[ xsk_idx ], charge_busy );
     499          21 :     fd_net_flusher_wakeup( ctx->tx_flusher+xsk_idx, now );
     500          21 :   }
     501          24 :   return 0;
     502          24 : }
     503             : 
     504             : static void
     505           0 : during_housekeeping( fd_net_ctx_t * ctx ) {
     506           0 :   long now = fd_tickcount();
     507           0 :   net_load_netdev_tbl( ctx );
     508           0 :   ctx->has_gre_interface = net_check_gre_interface_exists( ctx );
     509             : 
     510           0 :   ctx->metrics.rx_busy_cnt = 0UL;
     511           0 :   ctx->metrics.rx_idle_cnt = 0UL;
     512           0 :   ctx->metrics.tx_busy_cnt = 0UL;
     513           0 :   ctx->metrics.tx_idle_cnt = fd_seq_diff( ctx->free_tx.prod, ctx->free_tx.cons );
     514           0 :   for( uint j=0U; j<ctx->xsk_cnt; j++ ) {
     515           0 :     fd_xsk_t * xsk = &ctx->xsk[ j ];
     516           0 :     FD_COMPILER_MFENCE();
     517             :     /* Write back local copies to fseqs that we own */
     518           0 :     FD_VOLATILE( *xsk->ring_fr.prod ) = xsk->ring_fr.cached_prod;
     519           0 :     FD_VOLATILE( *xsk->ring_rx.cons ) = xsk->ring_rx.cached_cons;
     520           0 :     FD_VOLATILE( *xsk->ring_tx.prod ) = xsk->ring_tx.cached_prod;
     521           0 :     FD_VOLATILE( *xsk->ring_cr.cons ) = xsk->ring_cr.cached_cons;
     522             : 
     523             :     /* Refresh kernel-owned seq numbers for accurate stats */
     524           0 :     xsk->ring_fr.cached_cons = FD_VOLATILE_CONST( *xsk->ring_fr.cons );
     525           0 :     xsk->ring_rx.cached_prod = FD_VOLATILE_CONST( *xsk->ring_rx.prod );
     526           0 :     xsk->ring_tx.cached_cons = FD_VOLATILE_CONST( *xsk->ring_tx.cons );
     527           0 :     xsk->ring_cr.cached_prod = FD_VOLATILE_CONST( *xsk->ring_cr.prod );
     528             : 
     529           0 :     FD_COMPILER_MFENCE();
     530           0 :     ctx->metrics.rx_busy_cnt += (long)(int)( xsk->ring_rx.cached_prod - xsk->ring_rx.cached_cons );
     531           0 :     ctx->metrics.rx_idle_cnt += (long)(int)( xsk->ring_fr.cached_prod - xsk->ring_fr.cached_cons );
     532           0 :     ctx->metrics.tx_busy_cnt += (long)(int)( xsk->ring_tx.cached_prod - xsk->ring_tx.cached_cons );
     533           0 :     ctx->metrics.tx_busy_cnt += (long)(int)( xsk->ring_cr.cached_prod - xsk->ring_cr.cached_cons );
     534           0 :   }
     535             : 
     536           0 :   if( now > ctx->next_xdp_stats_refresh ) {
     537           0 :     ctx->next_xdp_stats_refresh = now + ctx->xdp_stats_interval_ticks;
     538           0 :     poll_xdp_statistics( ctx );
     539           0 :   }
     540           0 : }
     541             : 
     542             : 
     543             : /* net_tx_route resolves the xsk index, src ip address, src MAC address, and
     544             :    dst MAC address.  Returns 1 on success, 0 on failure.
     545             :    On success, tx_op->{xsk_idx,src_ip,mac_addrs} is set, and if the dst_ip
     546             :    belongs to a GRE interface, is_gre_inf will be set to 1 and
     547             :    tx_op->{gre_outer_src_ip, gre_outer_dst_ip} will be loaded from the netdev
     548             :    table. is_gre_inf is set to 0 if dst_ip doesn't belong to a GRE interface. */
     549             : 
     550             : static int
     551             : net_tx_route( fd_net_ctx_t * ctx,
     552             :               uint           dst_ip,
     553          33 :               uint *         is_gre_inf ) {
     554             : 
     555             :   /* Route lookup */
     556             : 
     557          33 :   fd_fib4_hop_t hop[2] = {0};
     558          33 :   hop[0] = fd_fib4_lookup( ctx->fib_local, dst_ip, 0UL );
     559          33 :   hop[1] = fd_fib4_lookup( ctx->fib_main,  dst_ip, 0UL );
     560          33 :   fd_fib4_hop_t const * next_hop = fd_fib4_hop_or( hop+0, hop+1 );
     561             : 
     562          33 :   uint rtype   = next_hop->rtype;
     563          33 :   uint if_idx  = next_hop->if_idx;
     564          33 :   uint ip4_src = next_hop->ip4_src;
     565             : 
     566          33 :   if( FD_UNLIKELY( rtype==FD_FIB4_RTYPE_LOCAL ) ) {
     567           0 :     rtype  = FD_FIB4_RTYPE_UNICAST;
     568           0 :     if_idx = 1;
     569           0 :   }
     570             : 
     571          33 :   if( FD_UNLIKELY( rtype!=FD_FIB4_RTYPE_UNICAST ) ) {
     572           0 :     ctx->metrics.tx_route_fail_cnt++;
     573           0 :     return 0;
     574           0 :   }
     575             : 
     576          33 :   fd_netdev_t * netdev = fd_netdev_tbl_query( &ctx->netdev_tbl, if_idx );
     577          33 :   if( !netdev ) {
     578           3 :     ctx->metrics.tx_route_fail_cnt++;
     579           3 :     return 0;
     580           3 :   }
     581             : 
     582          30 :   ip4_src = fd_uint_if( !!ctx->bind_address, ctx->bind_address, ip4_src );
     583          30 :   ctx->tx_op.src_ip  = ip4_src;
     584          30 :   ctx->tx_op.xsk_idx = UINT_MAX;
     585             : 
     586          30 :   FD_TEST( is_gre_inf );
     587          30 :   *is_gre_inf = 0;
     588          30 :   if( netdev->dev_type==ARPHRD_LOOPBACK ) {
     589             :     /* Set Ethernet src and dst address to 00:00:00:00:00:00 */
     590           0 :     memset( ctx->tx_op.mac_addrs, 0, 12UL );
     591           0 :     ctx->tx_op.xsk_idx = XSK_IDX_LO;
     592             :     /* Set preferred src address to 127.0.0.1 if no bind address is set */
     593           0 :     if( !ctx->tx_op.src_ip ) ctx->tx_op.src_ip = FD_IP4_ADDR( 127,0,0,1 );
     594           0 :     return 1;
     595          30 :   } else if( netdev->dev_type==ARPHRD_IPGRE ) {
     596             :     /* skip MAC addrs lookup for GRE inner dst ip */
     597          12 :     if( netdev->gre_src_ip ) ctx->tx_op.gre_outer_src_ip = netdev->gre_src_ip;
     598          12 :     ctx->tx_op.gre_outer_dst_ip = netdev->gre_dst_ip;
     599          12 :     *is_gre_inf = 1;
     600          12 :     return 1;
     601          12 :   }
     602             : 
     603          18 :   if( FD_UNLIKELY( netdev->dev_type!=ARPHRD_ETHER ) ) return 0; // drop
     604             : 
     605          18 :   if( FD_UNLIKELY( if_idx!=ctx->if_virt ) ) {
     606           0 :     ctx->metrics.tx_no_xdp_cnt++;
     607           0 :     return 0;
     608           0 :   }
     609          18 :   ctx->tx_op.xsk_idx = XSK_IDX_MAIN;
     610             : 
     611             :   /* Neighbor resolve */
     612          18 :   uint neigh_ip = next_hop->ip4_gw;
     613          18 :   if( !neigh_ip ) neigh_ip = dst_ip;
     614             : 
     615          18 :   fd_neigh4_entry_t neigh[1];
     616          18 :   int neigh_res = fd_neigh4_hmap_query_entry( ctx->neigh4, neigh_ip, neigh );
     617          18 :   if( FD_UNLIKELY( neigh_res!=FD_MAP_SUCCESS ) ) {
     618             :     /* Neighbor not found */
     619           0 :     fd_netlink_neigh4_solicit( ctx->neigh4_solicit, neigh_ip, if_idx, fd_frag_meta_ts_comp( fd_tickcount() ) );
     620           0 :     ctx->metrics.tx_neigh_fail_cnt++;
     621           0 :     return 0;
     622           0 :   }
     623          18 :   if( FD_UNLIKELY( neigh->state != FD_NEIGH4_STATE_ACTIVE ) ) {
     624           0 :     ctx->metrics.tx_neigh_fail_cnt++;
     625           0 :     return 0;
     626           0 :   }
     627          18 :   ip4_src = fd_uint_if( !ip4_src, ctx->default_address, ip4_src );
     628          18 :   ctx->tx_op.src_ip = ip4_src;
     629          18 :   memcpy( ctx->tx_op.mac_addrs+0, neigh->mac_addr, 6 );
     630          18 :   memcpy( ctx->tx_op.mac_addrs+6, netdev->mac_addr,  6 );
     631             : 
     632          18 :   return 1;
     633          18 : }
     634             : 
     635             : /* before_frag is called when a new metadata descriptor for a TX job is
     636             :    found.  This callback determines whether this net tile is responsible
     637             :    for the TX job.  If so, it prepares the TX op for the during_frag and
     638             :    after_frag callbacks. */
     639             : 
     640             : static inline int
     641             : before_frag( fd_net_ctx_t * ctx,
     642             :              ulong          in_idx,
     643             :              ulong          seq,
     644          18 :              ulong          sig ) {
     645          18 :   (void)in_idx; (void)seq;
     646             : 
     647             :   /* Find interface index of next packet */
     648          18 :   ulong proto = fd_disco_netmux_sig_proto( sig );
     649          18 :   if( FD_UNLIKELY( proto!=DST_PROTO_OUTGOING ) ) return 1;
     650             : 
     651             :   /* Load balance TX */
     652          18 :   uint net_tile_cnt = ctx->net_tile_cnt;
     653          18 :   uint hash         = (uint)fd_disco_netmux_sig_hash( sig );
     654          18 :   uint target_idx   = hash % net_tile_cnt;
     655          18 :   uint net_tile_id  = ctx->net_tile_id;
     656          18 :   uint dst_ip       = fd_disco_netmux_sig_ip( sig );
     657             : 
     658             :   /* Skip if another net tile is responsible for this packet.
     659             :      Fast path for net tiles other than net_tile 0. */
     660             : 
     661          18 :   if( net_tile_id!=0 && net_tile_id!=target_idx ) return 1; /* ignore */
     662             : 
     663             : 
     664          18 :   ctx->tx_op.use_gre          = 0;
     665          18 :   ctx->tx_op.gre_outer_dst_ip = 0;
     666          18 :   ctx->tx_op.gre_outer_src_ip = 0;
     667          18 :   uint is_gre_inf             = 0;
     668             : 
     669          18 :   if( FD_UNLIKELY( !net_tx_route( ctx, dst_ip, &is_gre_inf ) ) ) {
     670           0 :     return 1; /* metrics incremented by net_tx_route */
     671           0 :   }
     672             : 
     673          18 :   uint xsk_idx     = ctx->tx_op.xsk_idx;
     674             : 
     675          18 :   if( is_gre_inf ) {
     676          12 :     uint inner_src_ip = ctx->tx_op.src_ip;
     677          12 :     if( FD_UNLIKELY( !inner_src_ip ) ) {
     678           0 :       ctx->metrics.tx_gre_route_fail_cnt++;
     679           0 :       return 1;
     680           0 :     }
     681             :     /* Find the MAC addrs for the eth hdr, and src ip for outer ip4 hdr if not found in netdev tbl */
     682          12 :     ctx->tx_op.src_ip  = 0;
     683          12 :     is_gre_inf         = 0;
     684          12 :     if( FD_UNLIKELY( !net_tx_route( ctx, ctx->tx_op.gre_outer_dst_ip, &is_gre_inf ) ) ) {
     685           0 :       ctx->metrics.tx_gre_route_fail_cnt++;
     686           0 :       return 1;
     687           0 :     }
     688          12 :     if( is_gre_inf ) {
     689             :       /* Only one layer of tunnelling supported */
     690           0 :       ctx->metrics.tx_gre_route_fail_cnt++;
     691           0 :       return 1;
     692           0 :     }
     693          12 :     if( !ctx->tx_op.gre_outer_src_ip ) {
     694           6 :       ctx->tx_op.gre_outer_src_ip = ctx->tx_op.src_ip;
     695           6 :     }
     696          12 :     ctx->tx_op.use_gre = 1; /* indicate to during_frag to use GRE header */
     697          12 :     ctx->tx_op.src_ip  = inner_src_ip;
     698          12 :     xsk_idx = XSK_IDX_MAIN;
     699          12 :   }
     700             : 
     701          18 :   if( FD_UNLIKELY( xsk_idx>=ctx->xsk_cnt ) ) {
     702             :     /* Packet does not route to an XDP interface */
     703           0 :     ctx->metrics.tx_no_xdp_cnt++;
     704           0 :     return 1;
     705           0 :   }
     706             : 
     707          18 :   if( xsk_idx==XSK_IDX_LO ) target_idx = 0; /* loopback always targets tile 0 */
     708             : 
     709             :   /* Skip if another net tile is responsible for this packet */
     710             : 
     711          18 :   if( net_tile_id!=target_idx ) return 1; /* ignore */
     712             : 
     713             :   /* Skip if TX is blocked */
     714             : 
     715          18 :   if( FD_UNLIKELY( !net_tx_ready( ctx, xsk_idx ) ) ) {
     716           0 :     ctx->metrics.tx_full_fail_cnt++;
     717           0 :     return 1;
     718           0 :   }
     719             : 
     720             :   /* Allocate a TX frame to copy the packet into */
     721             : 
     722          18 :   fd_net_free_ring_t * free      = &ctx->free_tx;
     723          18 :   ulong                alloc_seq = free->cons;
     724          18 :   void *               frame     = (void *)free->queue[ alloc_seq % free->depth ];
     725          18 :   free->cons = fd_seq_inc( alloc_seq, 1UL );
     726             : 
     727          18 :   ctx->tx_op.frame = frame;
     728             : 
     729          18 :   return 0; /* continue */
     730          18 : }
     731             : 
     732             : /* during_frag is called when before_frag has committed to transmit an
     733             :    outgoing packet. */
     734             : 
     735             : static inline void
     736             : during_frag( fd_net_ctx_t * ctx,
     737             :              ulong          in_idx,
     738             :              ulong          seq FD_PARAM_UNUSED,
     739             :              ulong          sig FD_PARAM_UNUSED,
     740             :              ulong          chunk,
     741             :              ulong          sz,
     742          18 :              ulong          ctl FD_PARAM_UNUSED ) {
     743          18 :   if( FD_UNLIKELY( chunk<ctx->in[ in_idx ].chunk0 || chunk>ctx->in[ in_idx ].wmark || sz>FD_NET_MTU ) )
     744           0 :     FD_LOG_ERR(( "chunk %lu %lu corrupt, not in range [%lu,%lu]", chunk, sz, ctx->in[ in_idx ].chunk0, ctx->in[ in_idx ].wmark ));
     745             : 
     746          18 :   if( FD_UNLIKELY( sz<( sizeof(fd_eth_hdr_t)+sizeof(fd_ip4_hdr_t) ) ) )
     747           0 :     FD_LOG_ERR(( "packet too small %lu (in_idx=%lu)", sz, in_idx ));
     748             : 
     749          18 :   if( FD_UNLIKELY( sz>FD_ETH_PAYLOAD_MAX ) )
     750           0 :     FD_LOG_ERR(( "packet too big %lu (in_idx=%lu)", sz, in_idx ));
     751             : 
     752          18 :   void * frame = ctx->tx_op.frame;
     753          18 :   if( FD_UNLIKELY( (ulong)frame < (ulong)ctx->umem ) )
     754           0 :     FD_LOG_ERR(( "frame %p out of bounds (below %p)", frame, (void *)ctx->umem ));
     755          18 :   ulong umem_off = (ulong)frame - (ulong)ctx->umem;
     756          18 :   if( FD_UNLIKELY( (ulong)umem_off > (ulong)ctx->umem_sz ) )
     757           0 :     FD_LOG_ERR(( "frame %p out of bounds (beyond %p)", frame, (void *)ctx->umem_sz ));
     758             : 
     759             :   /* Speculatively copy frame into XDP buffer */
     760          18 :   uchar const * src = fd_chunk_to_laddr_const( ctx->in[ in_idx ].mem, chunk );
     761             : 
     762          18 :   if( ctx->tx_op.use_gre ) {
     763             :     /* Discard the ethernet hdr from src. Copy the rest to where the inner ip4_hdr is.
     764             :        Safe from overflow: FD_ETH_PAYLOAD_MAX + header overhead < frame size (2048UL) */
     765          12 :     ulong overhead = sizeof(fd_eth_hdr_t) + sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t);
     766          12 :     fd_memcpy( (void *)( (ulong)ctx->tx_op.frame + overhead ), src + sizeof(fd_eth_hdr_t), sz - sizeof(fd_eth_hdr_t) );
     767          12 :   } else {
     768           6 :     fd_memcpy( ctx->tx_op.frame, src, sz );
     769           6 :   }
     770          18 : }
     771             : 
     772             : /* after_frag is called when the during_frag memcpy was _not_ overrun. */
     773             : 
     774             : static void
     775             : after_frag( fd_net_ctx_t *      ctx,
     776             :             ulong               in_idx,
     777             :             ulong               seq,
     778             :             ulong               sig,
     779             :             ulong               sz,
     780             :             ulong               tsorig,
     781             :             ulong               tspub,
     782          18 :             fd_stem_context_t * stem ) {
     783          18 :   (void)in_idx; (void)seq; (void)sig; (void)tsorig; (void)tspub; (void)stem;
     784             : 
     785             :   /* Current send operation */
     786             : 
     787          18 :   uchar *    frame   = ctx->tx_op.frame;
     788          18 :   uint       xsk_idx = ctx->tx_op.xsk_idx;
     789             : 
     790             :   /* Select Ethernet addresses */
     791          18 :   memcpy( frame, ctx->tx_op.mac_addrs, 12 );
     792             : 
     793          18 :   uchar * iphdr = frame + sizeof(fd_eth_hdr_t);
     794             : 
     795          18 :   if( ctx->tx_op.use_gre ) {
     796             : 
     797             :     /* For GRE packets, the ethertype will always be FD_ETH_HDR_TYPE_IP.  The outer source IP can't be 0. */
     798          12 :     if( FD_UNLIKELY( ctx->tx_op.gre_outer_src_ip==0 ) ) {
     799           0 :       ctx->metrics.tx_gre_route_fail_cnt++;
     800           0 :       return;
     801           0 :     }
     802             : 
     803             :     /* Write the last two bytes for eth_hdr */
     804          12 :     FD_STORE( ushort, frame+12, fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) );
     805             : 
     806          12 :     uchar * outer_iphdr       = frame + sizeof(fd_eth_hdr_t);
     807          12 :     uchar * gre_hdr           = outer_iphdr + sizeof(fd_ip4_hdr_t);
     808          12 :     uchar * inner_iphdr       = gre_hdr + sizeof(fd_gre_hdr_t);
     809             : 
     810             :     /* outer hdr + gre hdr + inner net_tot_len */
     811          12 :     ushort  outer_net_tot_len = (ushort)( sizeof(fd_ip4_hdr_t) + sizeof(fd_gre_hdr_t) + fd_ushort_bswap( ( (fd_ip4_hdr_t *)inner_iphdr )->net_tot_len ) );
     812             : 
     813             :     /* Construct outer ip header */
     814          12 :     fd_ip4_hdr_t ip4_outer = (fd_ip4_hdr_t) {
     815          12 :       .verihl       = FD_IP4_VERIHL( 4,5 ),
     816          12 :       .tos          = 0,
     817          12 :       .net_tot_len  = fd_ushort_bswap( outer_net_tot_len ),
     818          12 :       .net_id       = 0,
     819          12 :       .net_frag_off = fd_ushort_bswap( FD_IP4_HDR_FRAG_OFF_DF ),
     820          12 :       .ttl          = 64,
     821          12 :       .protocol     = FD_IP4_HDR_PROTOCOL_GRE,
     822          12 :       .check        = 0,
     823          12 :       .saddr        = ctx->tx_op.gre_outer_src_ip,
     824          12 :       .daddr        = ctx->tx_op.gre_outer_dst_ip,
     825          12 :     };
     826          12 :     ip4_outer.check = fd_ip4_hdr_check_fast( &ip4_outer );
     827          12 :     FD_STORE( fd_ip4_hdr_t, outer_iphdr, ip4_outer );
     828             : 
     829             :     /* Construct gre header */
     830          12 :     fd_gre_hdr_t gre_hdr_ = {
     831          12 :       .flags_version = FD_GRE_HDR_FLG_VER_BASIC,
     832          12 :       .protocol      = fd_ushort_bswap( FD_ETH_HDR_TYPE_IP )
     833          12 :     };
     834          12 :     FD_STORE( fd_gre_hdr_t, gre_hdr, gre_hdr_ );
     835             : 
     836          12 :     iphdr   = inner_iphdr;
     837          12 :     sz      = sizeof(fd_eth_hdr_t) + outer_net_tot_len;
     838          12 :     xsk_idx = XSK_IDX_MAIN;
     839          12 :   }
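  /* Worked example (illustration, not original source): for an inner
     IPv4 datagram with net_tot_len==1200, outer_net_tot_len = 20
     (outer IPv4 header) + 4 (basic GRE header) + 1200 = 1224 bytes,
     so the frame handed to the TX ring below has
     sz = 14 (Ethernet) + 1224 = 1238 bytes. */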
     840             : 
     841             :   /* Validate and patch the (inner) ip header */
     842          18 :   uint   ihl       = FD_IP4_GET_LEN( *(fd_ip4_hdr_t *)iphdr );
     843          18 :   uint   ver       = FD_IP4_GET_VERSION( *(fd_ip4_hdr_t *)iphdr );
     844          18 :   uint   ip4_saddr = FD_LOAD( uint, iphdr+12 );
     845          18 :   ushort ethertype = FD_LOAD( ushort, frame+12 );
     846             : 
     847          18 :   if( FD_UNLIKELY( ethertype!=fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) ) ) {
     848           0 :     FD_LOG_CRIT(( "in link %lu attempted to send packet with invalid ethertype %04x",
     849           0 :                   in_idx, fd_ushort_bswap( ethertype ) ));
     850           0 :   }
     851             : 
     852          18 :   if( ver!=0x4 ) {
     853           0 :     ctx->metrics.tx_route_fail_cnt++; // Not an IPv4 packet. drop
     854           0 :     return;
     855           0 :   }
     856             : 
     857          18 :   if( ip4_saddr==0 ) {
     858          18 :     if( FD_UNLIKELY( ctx->tx_op.src_ip==0 ||
     859          18 :                      ihl<sizeof(fd_ip4_hdr_t) ||
     860          18 :                      (sizeof(fd_eth_hdr_t)+ihl)>sz ) ) {
     861             :       /* Outgoing IPv4 packet with unknown src IP or invalid IHL */
     862             :       /* FIXME should select first IPv4 address of device table here */
     863           0 :       ctx->metrics.tx_route_fail_cnt++;
     864           0 :       return;
     865           0 :     }
     866             :     /* Recompute checksum after changing header */
     867          18 :     FD_STORE( uint,   iphdr+12, ctx->tx_op.src_ip );
     868          18 :     FD_STORE( ushort, iphdr+10, 0 );
     869          18 :     FD_STORE( ushort, iphdr+10, fd_ip4_hdr_check( iphdr ) );
     870          18 :   }
     871             : 
     872             :   /* Submit packet TX job
     873             : 
     874             :      Invariant for ring_tx: prod-cons<depth
     875             :      (This invariant breaks if any other packet is sent over this ring
     876             :      between before_frag and this point, e.g. send_arp_probe.) */
     877             : 
     878          18 :   fd_xsk_t      * xsk     = &ctx->xsk[ xsk_idx ];
     879          18 :   fd_xdp_ring_t * tx_ring = &xsk->ring_tx;
     880          18 :   uint            tx_seq  = tx_ring->cached_prod;
     881          18 :   uint            tx_mask = tx_ring->depth - 1U;
     882          18 :   xsk->ring_tx.packet_ring[ tx_seq&tx_mask ] = (struct xdp_desc) {
     883          18 :     .addr    = (ulong)frame - (ulong)ctx->umem,
     884          18 :     .len     = (uint)sz,
     885          18 :     .options = 0
     886          18 :   };
     887             : 
     888             :   /* Frame is now owned by kernel. Clear tx_op. */
     889          18 :   ctx->tx_op.frame = NULL;
     890             : 
     891             :   /* Register newly enqueued packet */
     892          18 :   tx_ring->cached_prod = tx_seq+1U;
     893          18 :   ctx->metrics.tx_submit_cnt++;
     894          18 :   ctx->metrics.tx_bytes_total += sz;
     895          18 :   if( ctx->tx_op.use_gre ) ctx->metrics.tx_gre_cnt++;
     896          18 :   fd_net_flusher_inc( ctx->tx_flusher+xsk_idx, fd_tickcount() );
     897          18 : }
     898             : 
     899             : /* net_rx_packet is called when a new Ethernet frame is available.
     900             :    Attempts to copy out the frame to a downstream tile. */
     901             : 
     902             : static void
     903             : net_rx_packet( fd_net_ctx_t * ctx,
     904             :                ulong          umem_off,
     905             :                ulong          sz,
     906          24 :                uint *         freed_chunk ) {
     907             : 
     908          24 :   if( FD_UNLIKELY( sz<sizeof(fd_eth_hdr_t)+sizeof(fd_ip4_hdr_t)+sizeof(fd_udp_hdr_t) ) ) {
     909           0 :     FD_DTRACE_PROBE( net_tile_err_rx_undersz );
     910           0 :     ctx->metrics.rx_undersz_cnt++;
     911           0 :     return;
     912           0 :   }
     913             : 
     914          24 :   uchar        * packet     = (uchar *)ctx->umem + umem_off;
     915          24 :   uchar const  * packet_end = packet + sz;
     916          24 :   fd_ip4_hdr_t * iphdr      = (fd_ip4_hdr_t *)(packet + sizeof(fd_eth_hdr_t));
     917             : 
     918          24 :   if( FD_UNLIKELY( ((fd_eth_hdr_t *)packet)->net_type!=fd_ushort_bswap( FD_ETH_HDR_TYPE_IP ) ) ) return;
     919             : 
     920          24 :   int is_packet_gre = 0;
     921             :   /* Discard the GRE overhead (outer iphdr and gre hdr) */
     922          24 :   if( iphdr->protocol == FD_IP4_HDR_PROTOCOL_GRE ) {
     923          12 :     if( FD_UNLIKELY( ctx->has_gre_interface==0 ) ) {
     924           0 :       ctx->metrics.rx_gre_ignored_cnt++; // drop. No gre interface in netdev table
     925           0 :       return;
     926           0 :     }
     927          12 :     ulong gre_ipver = FD_IP4_GET_VERSION( *iphdr );
     928          12 :     ulong gre_iplen = FD_IP4_GET_LEN( *iphdr );
     929          12 :     if( FD_UNLIKELY( gre_ipver!=0x4 || gre_iplen<20 ) ) {
     930           0 :       FD_DTRACE_PROBE( net_tile_err_rx_noip );
     931           0 :       ctx->metrics.rx_gre_inv_pkt_cnt++; /* outer header is not valid IPv4; drop */
     932           0 :       return;
     933           0 :     }
     934             : 
     935          12 :     ulong overhead = gre_iplen + sizeof(fd_gre_hdr_t);
     936          12 :     if( FD_UNLIKELY( (uchar *)iphdr+overhead+sizeof(fd_ip4_hdr_t)>packet_end ) ) {
     937           0 :       FD_DTRACE_PROBE( net_tile_err_rx_undersz );
     938           0 :       ctx->metrics.rx_undersz_cnt++;  // inner ip4 header invalid
     939           0 :       return;
     940           0 :     }
     941             : 
     942             :     /* The new iphdr is where the inner iphdr was. Copy over the eth_hdr */
     943          12 :     iphdr              = (fd_ip4_hdr_t *)((uchar *)iphdr + overhead);
     944          12 :     uchar * new_packet = (uchar *)iphdr - sizeof(fd_eth_hdr_t);
     945          12 :     fd_memcpy( new_packet, packet, sizeof(fd_eth_hdr_t) );
     946          12 :     sz                 -= overhead;
     947          12 :     packet             = new_packet;
     948          12 :     umem_off           = (ulong)( packet - (uchar *)ctx->umem );
     949          12 :     is_packet_gre      = 1;
     950          12 :   }
     951             : 
     952             :   /* Translate packet to UMEM frame index */
     953          24 :   ulong chunk       = ctx->umem_chunk0 + (umem_off>>FD_CHUNK_LG_SZ);
     954          24 :   ulong ctl         = umem_off & 0x3fUL;
     955             : 
     956             :   /* Filter for UDP/IPv4 packets. */
     957          24 :   ulong ipver = FD_IP4_GET_VERSION( *iphdr );
     958          24 :   ulong iplen = FD_IP4_GET_LEN    ( *iphdr );
     959          24 :   if( FD_UNLIKELY( ipver!=0x4 || iplen<20 ||
     960          24 :                    iphdr->protocol!=FD_IP4_HDR_PROTOCOL_UDP ) ) {
     961           0 :     FD_DTRACE_PROBE( net_tile_err_rx_noip );
     962           0 :     ctx->metrics.rx_undersz_cnt++; /* drop non-IPv4 / non-UDP packets */
     963           0 :     return;
     964           0 :   }
     965             : 
     966          24 :   uchar const * udp = (uchar *)iphdr + iplen;
     967          24 :   if( FD_UNLIKELY( udp+sizeof(fd_udp_hdr_t) > packet_end ) ) {
     968           0 :     FD_DTRACE_PROBE( net_tile_err_rx_undersz );
     969           0 :     ctx->metrics.rx_undersz_cnt++;
     970           0 :     return;
     971           0 :   }
     972             : 
     973          24 :   fd_udp_hdr_t const * udp_hdr = (fd_udp_hdr_t const *)udp;
     974          24 :   ulong        const   udp_sz  = fd_ushort_bswap( udp_hdr->net_len );
     975          24 :   if( FD_UNLIKELY( (udp_sz<sizeof(fd_udp_hdr_t)) | (udp+udp_sz>packet_end) ) ) {
     976           6 :     FD_DTRACE_PROBE( net_tile_err_rx_undersz );
     977           6 :     ctx->metrics.rx_undersz_cnt++;
     978           6 :     return;
     979           6 :   }
     980             : 
     981             :   /* Extract IP dest addr and UDP src/dest port */
     982          18 :   uint   ip_srcaddr   =  iphdr->saddr;
     983          18 :   ushort udp_srcport  =  fd_ushort_bswap( udp_hdr->net_sport );
     984          18 :   ushort udp_dstport  =  fd_ushort_bswap( udp_hdr->net_dport );
     985             : 
     986          18 :   FD_DTRACE_PROBE_4( net_tile_pkt_rx, ip_srcaddr, udp_srcport, udp_dstport, sz );
     987             : 
     988             :   /* Route packet to downstream tile */
     989          18 :   ushort proto;
     990          18 :   fd_net_out_ctx_t * out;
     991          18 :   if(      FD_UNLIKELY( udp_dstport==ctx->shred_listen_port ) ) {
     992          18 :     proto = DST_PROTO_SHRED;
     993          18 :     out = ctx->shred_out;
     994          18 :   } else if( FD_UNLIKELY( udp_dstport==ctx->quic_transaction_listen_port ) ) {
     995           0 :     proto = DST_PROTO_TPU_QUIC;
     996           0 :     out = ctx->quic_out;
     997           0 :   } else if( FD_UNLIKELY( udp_dstport==ctx->legacy_transaction_listen_port ) ) {
     998           0 :     proto = DST_PROTO_TPU_UDP;
     999           0 :     out = ctx->quic_out;
    1000           0 :   } else if( FD_UNLIKELY( udp_dstport==ctx->gossip_listen_port ) ) {
    1001           0 :     proto = DST_PROTO_GOSSIP;
    1002           0 :     out = ctx->gossvf_out;
    1003           0 :   } else if( FD_UNLIKELY( udp_dstport==ctx->repair_intake_listen_port ) ) {
    1004           0 :     proto = DST_PROTO_REPAIR;
    1005           0 :     if( FD_UNLIKELY( sz == REPAIR_PING_SZ ) ) out = ctx->repair_out; /* ping-pong */
    1006           0 :     else                                      out = ctx->shred_out;
    1007           0 :   } else if( FD_UNLIKELY( udp_dstport==ctx->repair_serve_listen_port ) ) {
    1008           0 :     proto = DST_PROTO_REPAIR;
    1009           0 :     out = ctx->repair_out;
    1010           0 :   } else if( FD_UNLIKELY( udp_dstport==ctx->send_src_port ) ) {
    1011           0 :     proto = DST_PROTO_SEND;
    1012           0 :     out = ctx->send_out;
    1013           0 :   } else {
    1014             : 
    1015           0 :     FD_LOG_ERR(( "Firedancer received a UDP packet on port %hu which was not expected. "
    1016           0 :                   "Only the following ports should be configured to forward packets: "
    1017           0 :                   "%hu, %hu, %hu, %hu, %hu, %hu, %hu (excluding any 0 ports, which can be ignored). "
    1018           0 :                   "Please report this error to Firedancer maintainers.",
    1019           0 :                   udp_dstport,
    1020           0 :                   ctx->shred_listen_port,
    1021           0 :                   ctx->quic_transaction_listen_port,
    1022           0 :                   ctx->legacy_transaction_listen_port,
    1023           0 :                   ctx->gossip_listen_port,
    1024           0 :                   ctx->repair_intake_listen_port,
    1025           0 :                   ctx->repair_serve_listen_port, ctx->send_src_port ));
    1026           0 :   }
    1027             : 
    1028             :   /* tile can decide how to partition based on src ip addr and src port */
    1029          18 :   ulong sig              = fd_disco_netmux_sig( ip_srcaddr, udp_srcport, ip_srcaddr, proto, 14UL+8UL+iplen );
    1030             : 
    1031             :   /* Peek the mline for an old frame */
    1032          18 :   fd_frag_meta_t * mline = out->mcache + fd_mcache_line_idx( out->seq, out->depth );
    1033          18 :   *freed_chunk           = mline->chunk;
    1034             : 
    1035             :   /* Overwrite the mline with the new frame */
    1036          18 :   ulong tspub            = (ulong)fd_frag_meta_ts_comp( fd_tickcount() );
    1037          18 :   fd_mcache_publish( out->mcache, out->depth, out->seq, sig, chunk, sz, ctl, 0, tspub );
    1038             : 
    1039             :   /* Wind up for the next iteration */
    1040          18 :   out->seq               = fd_seq_inc( out->seq, 1UL );
    1041             : 
    1042          18 :   if( is_packet_gre ) ctx->metrics.rx_gre_cnt++;
    1043          18 :   ctx->metrics.rx_pkt_cnt++;
    1044          18 :   ctx->metrics.rx_bytes_total += sz;
    1045          18 : }
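
For reference, a worked example of the offset-to-chunk translation above, assuming FD_CHUNK_LG_SZ==6 (64-byte tango chunks, consistent with the 0x3f mask) and an illustrative umem_chunk0 of 0x100; the sub-chunk offset rides along in the ctl argument to fd_mcache_publish:

    ulong umem_off = 0x12345UL;                /* frame's byte offset in UMEM       */
    ulong chunk    = 0x100UL + (umem_off>>6);  /* 0x100 + 0x48d = 0x58d             */
    ulong ctl      = umem_off & 0x3fUL;        /* 0x05: offset within the 64B chunk */
    /* A consumer recovers the frame address roughly as
       (uchar *)wksp_base + (chunk<<6) + ctl. */
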
    1046             : 
    1047             : /* net_comp_event is called when an XDP TX frame is free again. */
    1048             : 
    1049             : static void
    1050             : net_comp_event( fd_net_ctx_t * ctx,
    1051             :                 fd_xsk_t *     xsk,
    1052           0 :                 uint           comp_seq ) {
    1053             : 
    1054             :   /* Locate the incoming frame */
    1055             : 
    1056           0 :   fd_xdp_ring_t * comp_ring  = &xsk->ring_cr;
    1057           0 :   uint            comp_mask  = comp_ring->depth - 1U;
    1058           0 :   ulong           frame      = FD_VOLATILE_CONST( comp_ring->frame_ring[ comp_seq&comp_mask ] );
    1059           0 :   ulong const     frame_mask = FD_NET_MTU - 1UL;
    1060           0 :   if( FD_UNLIKELY( frame+FD_NET_MTU > ctx->umem_sz ) ) {
    1061           0 :     FD_LOG_ERR(( "Bounds check failed: frame=0x%lx umem_sz=0x%lx",
    1062           0 :                  frame, (ulong)ctx->umem_sz ));
    1063           0 :   }
    1064             : 
    1065             :   /* Check if we have space to return the freed frame */
    1066             : 
    1067           0 :   fd_net_free_ring_t * free      = &ctx->free_tx;
    1068           0 :   ulong                free_prod = free->prod;
    1069           0 :   ulong                free_mask = free->depth - 1UL;
    1070           0 :   ulong                free_cons = free->cons;
    1071           0 :   long                 free_cnt  = fd_seq_diff( free_prod, free_cons );
    1072           0 :   FD_TEST( free_prod >= free_cons );
    1073           0 :   if( FD_UNLIKELY( free_cnt>=(long)free->depth ) ) return; /* blocked */
    1074             : 
    1075           0 :   free->queue[ free_prod&free_mask ] = (ulong)ctx->umem + (frame & (~frame_mask));
    1076           0 :   free->prod = fd_seq_inc( free_prod, 1UL );
    1077             : 
    1078             :   /* Wind up for next iteration */
    1079             : 
    1080           0 :   comp_ring->cached_cons = comp_seq+1U;
    1081           0 :   ctx->metrics.tx_complete_cnt++;
    1082           0 : }
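
The free_tx structure used above behaves as a single-producer single-consumer ring indexed by unwrapped sequence numbers, so fullness is simply prod-cons>=depth.  A self-contained sketch of that discipline (field names follow fd_net_free_ring_t; depth is assumed to be a power of two; ulong as typedef'd by fd_util):

    typedef struct {
      ulong * queue;   /* frame pointers (laddrs into UMEM) */
      ulong   depth;   /* power of two                      */
      ulong   prod;    /* unwrapped producer sequence       */
      ulong   cons;    /* unwrapped consumer sequence       */
    } free_ring_t;

    static int
    free_ring_push( free_ring_t * r, ulong frame ) {
      if( r->prod - r->cons >= r->depth ) return 0;  /* full: caller skips, as above */
      r->queue[ r->prod & (r->depth-1UL) ] = frame;
      r->prod++;
      return 1;
    }

    static int
    free_ring_pop( free_ring_t * r, ulong * out ) {
      if( r->prod==r->cons ) return 0;               /* empty */
      *out = r->queue[ r->cons & (r->depth-1UL) ];
      r->cons++;
      return 1;
    }
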
    1083             : 
    1084             : /* net_rx_event is called when a new XDP RX frame is available.  Calls
    1085             :    net_rx_packet, then returns the packet back to the kernel via the fill
    1086             :    ring.  */
    1087             : 
    1088             : static void
    1089             : net_rx_event( fd_net_ctx_t * ctx,
    1090             :               fd_xsk_t *     xsk,
    1091          24 :               uint           rx_seq ) {
    1092             :   /* Locate the incoming frame */
    1093             : 
    1094          24 :   fd_xdp_ring_t * rx_ring = &xsk->ring_rx;
    1095          24 :   uint            rx_mask = rx_ring->depth - 1U;
    1096          24 :   struct xdp_desc frame   = FD_VOLATILE_CONST( rx_ring->packet_ring[ rx_seq&rx_mask ] );
    1097             : 
    1098          24 :   if( FD_UNLIKELY( frame.len>FD_NET_MTU ) )
    1099           0 :     FD_LOG_ERR(( "received a packet with an oversized payload (%u bytes)", frame.len ));
    1100             : 
    1101             :   /* Check if we have space in the fill ring to free the frame */
    1102             : 
    1103          24 :   fd_xdp_ring_t * fill_ring  = &xsk->ring_fr;
    1104          24 :   uint            fill_depth = fill_ring->depth;
    1105          24 :   uint            fill_mask  = fill_depth-1U;
    1106          24 :   ulong           frame_mask = FD_NET_MTU - 1UL;
    1107          24 :   uint            fill_prod  = fill_ring->cached_prod;
    1108          24 :   uint            fill_cons  = fill_ring->cached_cons;
    1109             : 
    1110             :   /* If cached_cons suggests there may not be space in the fill ring,
    1111             :      refresh it from the ring's shared consumer index and check again.  Else, skip the shared read */
    1112             : 
    1113          24 :   if( FD_UNLIKELY( fill_prod-fill_cons >= fill_depth ) ) {
    1114           0 :     fill_cons = fill_ring->cached_cons = FD_VOLATILE_CONST( *fill_ring->cons );
    1115           0 :     if( FD_UNLIKELY( fill_prod-fill_cons >= fill_depth ) ) {
    1116           0 :       ctx->metrics.rx_fill_blocked_cnt++;
    1117           0 :       return; /* blocked */
    1118           0 :     }
    1119           0 :   }
    1120             : 
    1121             :   /* Pass it to the receive handler */
    1122             : 
    1123          24 :   uint freed_chunk = (uint)( ctx->umem_chunk0 + (frame.addr>>FD_CHUNK_LG_SZ) );
    1124          24 :   net_rx_packet( ctx, frame.addr, frame.len, &freed_chunk );
    1125             : 
    1126          24 :   FD_COMPILER_MFENCE();
    1127          24 :   rx_ring->cached_cons = rx_seq+1U;
    1128             : 
    1129             :   /* Every RX operation returns one frame to the FILL ring.  If the
    1130             :      packet was forwarded to a downstream ring, the newly shadowed frame
    1131             :      is returned.  Otherwise, the frame just received is returned. */
    1132             : 
    1133          24 :   if( FD_UNLIKELY( ( freed_chunk < ctx->umem_chunk0 ) |
    1134          24 :                     ( freed_chunk > ctx->umem_wmark ) ) ) {
    1135           0 :     FD_LOG_CRIT(( "mcache corruption detected: chunk=%u chunk0=%u wmark=%u",
    1136           0 :                   freed_chunk, ctx->umem_chunk0, ctx->umem_wmark ));
    1137           0 :   }
    1138          24 :   ulong freed_off = (freed_chunk - ctx->umem_chunk0)<<FD_CHUNK_LG_SZ;
    1139          24 :   fill_ring->frame_ring[ fill_prod&fill_mask ] = freed_off & (~frame_mask);
    1140          24 :   fill_ring->cached_prod = fill_prod+1U;
    1141          24 : }
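
The space check above is a two-level scheme: compare against the locally cached consumer index first, and only fall back to the shared (kernel-written) index when the cheap check fails.  The same pattern in isolation:

    /* Returns nonzero if at least one slot is free.  cons_shared stands in
       for the kernel-updated consumer pointer (fill_ring->cons above). */
    static inline int
    ring_has_space( unsigned prod, unsigned * cached_cons,
                    unsigned const volatile * cons_shared, unsigned depth ) {
      if( prod - *cached_cons < depth ) return 1;  /* fast path: no shared read */
      *cached_cons = *cons_shared;                 /* refresh the cache once    */
      return (prod - *cached_cons) < depth;        /* re-check                  */
    }
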
    1142             : 
    1143             : /* before_credit is called every loop iteration. */
    1144             : 
    1145             : static void
    1146             : before_credit( fd_net_ctx_t *      ctx,
    1147             :                fd_stem_context_t * stem,
    1148          24 :                int *               charge_busy ) {
    1149          24 :   (void)stem;
    1150             :   /* A previous send attempt was overrun.  A corrupt copy of the packet was
    1151             :      placed into an XDP frame, but the frame was not yet submitted to the
    1152             :      TX ring.  Return the tx buffer to the free list. */
    1153             : 
    1154          24 :   if( ctx->tx_op.frame ) {
    1155           0 :     *charge_busy = 1;
    1156           0 :     fd_net_free_ring_t * free      = &ctx->free_tx;
    1157           0 :     ulong                alloc_seq = free->prod;
    1158           0 :     free->queue[ alloc_seq % free->depth ] = (ulong)ctx->tx_op.frame;
    1159           0 :     free->prod = fd_seq_inc( alloc_seq, 1UL );
    1160           0 :     ctx->tx_op.frame = NULL;
    1161           0 :   }
    1162             : 
    1163             :   /* Check if new packets are available or if TX frames are free again
    1164             :      (Round-robin through sockets) */
    1165             : 
    1166          24 :   uint       rr_idx = ctx->rr_idx;
    1167          24 :   fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ];
    1168             : 
    1169          24 :   net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy );
    1170             : 
    1171          24 :   uint rx_cons = rr_xsk->ring_rx.cached_cons;
    1172          24 :   uint rx_prod = rr_xsk->ring_rx.cached_prod; /* might be stale */
    1173          24 :   if( FD_UNLIKELY( rx_cons==rx_prod ) ) {
    1174          24 :     rx_prod = rr_xsk->ring_rx.cached_prod = FD_VOLATILE_CONST( *rr_xsk->ring_rx.prod );
    1175          24 :   }
    1176             : 
    1177          24 :   if( rx_cons!=rx_prod ) {
    1178          24 :     *charge_busy = 1;
    1179          24 :     net_rx_event( ctx, rr_xsk, rx_cons );
    1180          24 :   } else {
    1181           0 :     net_rx_wakeup( ctx, rr_xsk, charge_busy );
    1182           0 :     ctx->rr_idx++;
    1183           0 :     ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx );
    1184           0 :   }
    1185             : 
    1186          24 :   uint comp_cons = rr_xsk->ring_cr.cached_cons;
    1187          24 :   uint comp_prod = rr_xsk->ring_cr.cached_prod; /* might be stale */
    1188          24 :   if( FD_UNLIKELY( comp_cons==comp_prod ) ) {
    1189          24 :     comp_prod = rr_xsk->ring_cr.cached_prod = FD_VOLATILE_CONST( *rr_xsk->ring_cr.prod );
    1190          24 :   }
    1191             : 
    1192          24 :   if( comp_cons!=comp_prod ) {
    1193           0 :     *charge_busy = 1;
    1194           0 :     rr_xsk->ring_cr.cached_prod = comp_prod;
    1195           0 :     net_comp_event( ctx, rr_xsk, comp_cons );
    1196           0 :   }
    1197          24 : }
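
fd_uint_if( c, t, f ) selects t when c is nonzero and f otherwise, so the round-robin advance above is equivalent to:

    static inline unsigned
    rr_next( unsigned idx, unsigned cnt ) { /* cnt>=1 */
      idx++;
      return idx>=cnt ? 0U : idx; /* == fd_uint_if( idx>=cnt, 0U, idx ) */
    }
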
    1198             : 
    1199             : /* net_xsk_bootstrap assigns UMEM frames to the FILL ring. */
    1200             : 
    1201             : static ulong
    1202             : net_xsk_bootstrap( fd_net_ctx_t * ctx,
    1203             :                    uint           xsk_idx,
    1204           0 :                    ulong          frame_off ) {
    1205           0 :   fd_xsk_t * xsk = &ctx->xsk[ xsk_idx ];
    1206             : 
    1207           0 :   ulong const frame_sz  = FD_NET_MTU;
    1208           0 :   ulong const fr_depth  = xsk->ring_fr.depth/2UL;
    1209             : 
    1210           0 :   fd_xdp_ring_t * fill      = &xsk->ring_fr;
    1211           0 :   uint            fill_prod = fill->cached_prod;
    1212           0 :   for( ulong j=0UL; j<fr_depth; j++ ) {
    1213           0 :     fill->frame_ring[ j ] = frame_off;
    1214           0 :     frame_off += frame_sz;
    1215           0 :   }
    1216           0 :   FD_VOLATILE( *fill->prod ) = fill->cached_prod = fill_prod + (uint)fr_depth;
    1217             : 
    1218           0 :   return frame_off;
    1219           0 : }
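
A worked example of the bootstrap arithmetic, assuming FD_NET_MTU is 2048 (matching the 2048-byte UMEM frame size used elsewhere in this file) and a hypothetical xdp_rx_queue_size of 1024:

    params0.fr_depth = 1024*2        = 2048 entries
    fr_depth (here)  = 2048/2        = 1024 frames seeded
    UMEM consumed    = 1024 * 2048 B = 2 MiB per XSK

Each call therefore advances frame_off by 2 MiB, leaving the other half of the FILL ring's capacity free for frames recycled by net_rx_event.
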
    1220             : 
    1221             : /* FIXME source MAC address from netlnk tile instead */
    1222             : 
    1223             : static void
    1224             : interface_addrs( const char * interface,
    1225             :                  uchar *      mac,
    1226           0 :                  uint *       ip4_addr ) {
    1227           0 :   int fd = socket( AF_INET, SOCK_DGRAM, 0 );
    1228           0 :   struct ifreq ifr;
    1229           0 :   ifr.ifr_addr.sa_family = AF_INET;
    1230             : 
    1231           0 :   strncpy( ifr.ifr_name, interface, IFNAMSIZ );
    1232           0 :   if( FD_UNLIKELY( ioctl( fd, SIOCGIFHWADDR, &ifr ) ) )
    1233           0 :     FD_LOG_ERR(( "could not get MAC address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) ));
    1234           0 :   fd_memcpy( mac, ifr.ifr_hwaddr.sa_data, 6 );
    1235             : 
    1236           0 :   if( FD_UNLIKELY( ioctl( fd, SIOCGIFADDR, &ifr ) ) )
    1237           0 :     FD_LOG_ERR(( "could not get IP address of interface `%s`: (%i-%s)", interface, errno, fd_io_strerror( errno ) ));
    1238           0 :   *ip4_addr = ((struct sockaddr_in *)fd_type_pun( &ifr.ifr_addr ))->sin_addr.s_addr;
    1239             : 
    1240           0 :   if( FD_UNLIKELY( close(fd) ) )
    1241           0 :     FD_LOG_ERR(( "could not close socket (%i-%s)", errno, fd_io_strerror( errno ) ));
    1242           0 : }
    1243             : 
    1244             : /* privileged_init does the following initialization steps:
    1245             : 
    1246             :    - Create an AF_XDP socket
    1247             :    - Map XDP metadata rings
    1248             :    - Register UMEM data region with socket
    1249             :    - Insert AF_XDP socket into xsk_map
    1250             : 
    1251             :    Net tile 0 also runs fd_xdp_install and repeats the above step for
    1252             :    the loopback device.  (Unless the main interface is already loopback)
    1253             : 
    1254             :    Kernel object references:
    1255             : 
    1256             :      BPF_LINK file descriptor
    1257             :       |
    1258             :       +-> XDP program installation on NIC
    1259             :       |    |
    1260             :       |    +-> XDP program <-- BPF_PROG file descriptor (prog_fd)
    1261             :       |
    1262             :       +-> XSKMAP object <-- BPF_MAP file descriptor (xsk_map) */
    1263             : 
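For orientation, inserting an AF_XDP socket into an XSKMAP at the raw syscall level looks roughly like the sketch below; fd_xsk_activate is assumed to wrap something equivalent.  The map key is the RX queue index, the value is the XSK file descriptor:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int
    xsk_map_insert( int xsk_map_fd, unsigned queue_id, int xsk_fd ) {
      union bpf_attr attr;
      memset( &attr, 0, sizeof(attr) );
      attr.map_fd = (unsigned)xsk_map_fd;
      attr.key    = (unsigned long)&queue_id; /* key:   RX queue index */
      attr.value  = (unsigned long)&xsk_fd;   /* value: AF_XDP socket  */
      attr.flags  = BPF_ANY;
      return (int)syscall( SYS_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr) );
    }
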
    1264             : FD_FN_UNUSED static void
    1265             : privileged_init( fd_topo_t *      topo,
    1266           0 :                  fd_topo_tile_t * tile ) {
    1267           0 :   void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
    1268             : 
    1269           0 :   FD_SCRATCH_ALLOC_INIT( l, scratch );
    1270           0 :   fd_net_ctx_t * ctx     = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
    1271           0 :   ulong *        free_tx = FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );
    1272             : 
    1273           0 :   fd_memset( ctx, 0, sizeof(fd_net_ctx_t) );
    1274             : 
    1275           0 :   interface_addrs( tile->xdp.if_virt, ctx->src_mac_addr, &ctx->default_address );
    1276           0 :   ctx->if_virt = if_nametoindex( tile->xdp.if_virt ); FD_TEST( ctx->if_virt );
    1277             : 
    1278             :   /* Load up dcache containing UMEM */
    1279             : 
    1280           0 :   void * const dcache_mem          = fd_topo_obj_laddr( topo, tile->net.umem_dcache_obj_id );
    1281           0 :   void * const umem                = fd_dcache_join( dcache_mem );
    1282           0 :   ulong  const umem_dcache_data_sz = fd_dcache_data_sz( umem );
    1283           0 :   ulong  const umem_frame_sz       = 2048UL;
    1284           0 :   ulong  const umem_sz             = fd_ulong_align_dn( umem_dcache_data_sz, umem_frame_sz );
    1285             : 
    1286             :   /* Derive chunk bounds */
    1287             : 
    1288           0 :   void * const umem_base   = fd_wksp_containing( dcache_mem );
    1289           0 :   if( FD_UNLIKELY( !umem_base ) ) FD_LOG_ERR(( "UMEM dcache is not in a workspace" ));
    1290             : 
    1291           0 :   ulong  const umem_chunk0 = ( (ulong)umem - (ulong)umem_base )>>FD_CHUNK_LG_SZ;
    1292           0 :   ulong  const umem_wmark  = umem_chunk0 + ( ( umem_sz-umem_frame_sz )>>FD_CHUNK_LG_SZ );
    1293             : 
    1294           0 :   if( FD_UNLIKELY( umem_chunk0>UINT_MAX || umem_wmark>UINT_MAX || umem_chunk0>umem_wmark ) ) {
    1295           0 :     FD_LOG_ERR(( "Calculated invalid UMEM bounds [%lu,%lu]", umem_chunk0, umem_wmark ));
    1296           0 :   }
    1297             : 
    1298           0 :   ctx->umem        = umem;
    1299           0 :   ctx->umem_sz     = umem_sz;
    1300           0 :   ctx->umem_chunk0 = (uint)umem_chunk0;
    1301           0 :   ctx->umem_wmark  = (uint)umem_wmark;
    1302             : 
    1303           0 :   ctx->free_tx.queue = free_tx;
    1304           0 :   ctx->free_tx.depth = tile->xdp.xdp_tx_queue_size;
    1305             : 
    1306             :   /* Create and install XSKs */
    1307             : 
    1308           0 :   uint if_phys_if_idx = if_nametoindex( tile->xdp.if_phys );
    1309           0 :   if( FD_UNLIKELY( !if_phys_if_idx ) ) FD_LOG_ERR(( "if_nametoindex(%s) failed", tile->xdp.if_phys ));
    1310             : 
    1311           0 :   fd_xsk_params_t params0 = {
    1312           0 :     .if_idx      = if_phys_if_idx,
    1313           0 :     .if_queue_id = tile->xdp.if_queue,
    1314             : 
    1315             :     /* Some kernels produce EOPNOTSUP errors on sendto calls when
    1316             :        starting up without either XDP_ZEROCOPY or XDP_COPY
    1317             :        (e.g. 5.14.0-503.23.1.el9_5 with i40e) */
    1318           0 :     .bind_flags  = tile->xdp.zero_copy ? XDP_ZEROCOPY : XDP_COPY,
    1319             : 
    1320           0 :     .fr_depth  = tile->xdp.xdp_rx_queue_size*2,
    1321           0 :     .rx_depth  = tile->xdp.xdp_rx_queue_size,
    1322           0 :     .cr_depth  = tile->xdp.xdp_tx_queue_size,
    1323           0 :     .tx_depth  = tile->xdp.xdp_tx_queue_size,
    1324             : 
    1325           0 :     .umem_addr = umem,
    1326           0 :     .frame_sz  = umem_frame_sz,
    1327           0 :     .umem_sz   = umem_sz
    1328           0 :   };
    1329             : 
    1330             :   /* Re-derive XDP file descriptors */
    1331             : 
    1332           0 :   fd_xdp_fds_t xdp_fds[ FD_TOPO_XDP_FDS_MAX ];
    1333           0 :   uint         xdp_fds_cnt = FD_TOPO_XDP_FDS_MAX;
    1334           0 :   fd_topo_install_xdp( topo, xdp_fds, &xdp_fds_cnt, 0U, /* dry_run */ 1 );
    1335             : 
    1336           0 :   int xsk_map_fd = -1;
    1337           0 :   for( uint i=0U; i<xdp_fds_cnt; i++ ) {
    1338           0 :     if( xdp_fds[ i ].if_idx==if_phys_if_idx ) {
    1339           0 :       xsk_map_fd              = xdp_fds[ i ].xsk_map_fd;
    1340           0 :       ctx->prog_link_fds[ 0 ] = xdp_fds[ i ].prog_link_fd;
    1341           0 :       xdp_fds[ i ].prog_link_fd = -1; /* mark as used */
    1342           0 :       break;
    1343           0 :     }
    1344           0 :   }
    1345           0 :   FD_TEST( xsk_map_fd>=0 );
    1346             : 
    1347             :   /* Init XSK */
    1348           0 :   if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 0 ], &params0 ) ) )       FD_LOG_ERR(( "failed to bind xsk for net tile %lu", tile->kind_id ));
    1349           0 :   if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 0 ], xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate xsk for net tile %lu", tile->kind_id ));
    1350           0 :   ctx->xsk_cnt = 1;
    1351             : 
    1352             :   /* Networking tile at index 0 also binds to loopback (only queue 0 available on lo) */
    1353             : 
    1354           0 :   if( FD_UNLIKELY( strcmp( tile->xdp.if_virt, "lo" ) && !tile->kind_id ) ) {
    1355           0 :     ctx->xsk_cnt = 2;
    1356             : 
    1357           0 :     uint lo_idx = if_nametoindex( "lo" );
    1358           0 :     if( FD_UNLIKELY( !lo_idx ) ) FD_LOG_ERR(( "if_nametoindex(lo) failed" ));
    1359             : 
    1360           0 :     int lo_xsk_map_fd = -1;
    1361           0 :     for( uint i=0U; i<xdp_fds_cnt; i++ ) {
    1362           0 :       if( xdp_fds[ i ].if_idx==lo_idx ) {
    1363           0 :         lo_xsk_map_fd           = xdp_fds[ i ].xsk_map_fd;
    1364           0 :         ctx->prog_link_fds[ 1 ] = xdp_fds[ i ].prog_link_fd;
    1365           0 :         xdp_fds[ i ].prog_link_fd = -1; /* mark as used */
    1366           0 :         break;
    1367           0 :       }
    1368           0 :     }
    1369           0 :     FD_TEST( lo_xsk_map_fd>=0 );
    1370             : 
    1371             :     /* init xsk 1 */
    1372           0 :     fd_xsk_params_t params1 = params0;
    1373           0 :     params1.if_idx      = lo_idx; /* probably always 1 */
    1374           0 :     params1.if_queue_id = 0;
    1375           0 :     params1.bind_flags  = 0;
    1376           0 :     if( FD_UNLIKELY( !fd_xsk_init( &ctx->xsk[ 1 ], &params1 ) ) )          FD_LOG_ERR(( "failed to bind lo_xsk" ));
    1377           0 :     if( FD_UNLIKELY( !fd_xsk_activate( &ctx->xsk[ 1 ], lo_xsk_map_fd ) ) ) FD_LOG_ERR(( "failed to activate lo_xsk" ));
    1378           0 :   }
    1379             : 
    1380             :   /* Close unused XDP fds */
    1381             : 
    1382           0 :   if( FD_UNLIKELY( fd_sandbox_gettid()==fd_sandbox_getpid() ) ) {
    1383             :     /* Kind of gross: in single-threaded mode we don't want to close the xsk_map_fd
    1384             :        since it's shared with other net tiles.  Just check for that by seeing if we
    1385             :        are the only thread in the process. */
    1386           0 :     for( uint i=0U; i<xdp_fds_cnt; i++ ) {
    1387           0 :       if( -1==close( xdp_fds[ i ].xsk_map_fd ) ) {
    1388           0 :         FD_LOG_ERR(( "close(%d) failed (%d-%s)", xdp_fds[ i ].xsk_map_fd, errno, fd_io_strerror( errno ) ));
    1389           0 :       }
    1390           0 :       if( xdp_fds[ i ].prog_link_fd>0 &&
    1391           0 :           -1==close( xdp_fds[ i ].prog_link_fd ) ) {
    1392           0 :         FD_LOG_ERR(( "close(%d) failed (%d-%s)", xdp_fds[ i ].prog_link_fd, errno, fd_io_strerror( errno ) ));
    1393           0 :       }
    1394           0 :     }
    1395           0 :   }
    1396             : 
    1397           0 :   double tick_per_ns = fd_tempo_tick_per_ns( NULL );
    1398           0 :   ctx->xdp_stats_interval_ticks = (long)( FD_XDP_STATS_INTERVAL_NS * tick_per_ns );
    1399             : 
    1400           0 :   ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL );
    1401           0 :   if( FD_UNLIKELY( scratch_top > (ulong)scratch + scratch_footprint( tile ) ) )
    1402           0 :     FD_LOG_ERR(( "scratch overflow %lu %lu %lu", scratch_top - (ulong)scratch - scratch_footprint( tile ), scratch_top, (ulong)scratch + scratch_footprint( tile ) ));
    1403           0 : }
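
As a reference point for what fd_xsk_init is presumably doing with params0, registering a UMEM region against an AF_XDP socket uses the standard XDP_UMEM_REG socket option from <linux/if_xdp.h>.  A minimal sketch:

    #include <linux/if_xdp.h>
    #include <sys/socket.h>

    static int
    umem_register( int xsk_fd, void * umem, unsigned long umem_sz, unsigned frame_sz ) {
      struct xdp_umem_reg reg = {
        .addr       = (unsigned long)umem, /* page-aligned UMEM base       */
        .len        = umem_sz,             /* umem_sz above, frame aligned */
        .chunk_size = frame_sz,            /* 2048, matching umem_frame_sz */
        .headroom   = 0
      };
      return setsockopt( xsk_fd, SOL_XDP, XDP_UMEM_REG, &reg, sizeof(reg) );
    }
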
    1404             : 
    1405             : /* init_device_table joins the net tile to the netlink tile's device
    1406             :    table.  The device table is very frequently read, and rarely updated.
    1407             :    Therefore, the net tile keeps a local copy of the device table in
    1408             :    scratch memory.  This table is periodically copied over from the
    1409             :    netlink tile via a double buffer (netdev_dbl_buf).
    1410             : 
    1411             :    On startup, the netlink tile might not have produced its initial
    1412             :    device table.  Therefore, initialize the local copy to an empty
    1413             :    table. */
    1414             : 
    1415             : static void
    1416             : init_device_table( fd_net_ctx_t * ctx,
    1417           3 :                    void *         netdev_dbl_buf ) {
    1418             : 
    1419             :   /* Join remote double buffer containing device table updates */
    1420           3 :   ctx->netdev_dbl_buf = fd_dbl_buf_join( netdev_dbl_buf );
    1421           3 :   if( FD_UNLIKELY( !ctx->netdev_dbl_buf ) ) FD_LOG_ERR(( "fd_dbl_buf_join failed" ));
    1422           3 :   ctx->netdev_buf_sz  = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX );
    1423             : 
    1424             :   /* Create temporary empty device table during startup */
    1425           3 :   FD_TEST( fd_netdev_tbl_join( &ctx->netdev_tbl, fd_netdev_tbl_new( ctx->netdev_buf, 1, 1 ) ) );
    1426           3 : }
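
The periodic copy described above follows the usual seqlock-style double-buffer read: snapshot a sequence number, copy, and retry if a writer raced the copy.  The real fd_dbl_buf API is not reproduced here; the structure and function below are hypothetical stand-ins:

    #include <string.h>

    typedef struct {
      unsigned long volatile seq;        /* odd while the writer is mid-update */
      unsigned long          sz;         /* valid payload size                 */
      unsigned char          buf[ 2048 ];
    } dbl_buf_sketch_t;

    static unsigned long
    dbl_buf_read_sketch( dbl_buf_sketch_t const * db, unsigned char * dst ) {
      for(;;) {
        unsigned long seq0 = db->seq;
        if( seq0 & 1UL ) continue;             /* writer active: retry */
        unsigned long sz = db->sz;
        memcpy( dst, (void const *)db->buf, sz );
        if( db->seq==seq0 ) return sz;         /* copy was not torn    */
      }
    }

Production code additionally needs compiler/memory fences around the copy (e.g. FD_COMPILER_MFENCE) to pin down the ordering of the seq loads relative to the copy.
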
    1427             : 
    1428             : FD_FN_UNUSED static void
    1429             : unprivileged_init( fd_topo_t *      topo,
    1430           0 :                    fd_topo_tile_t * tile ) {
    1431           0 :   void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
    1432             : 
    1433           0 :   FD_SCRATCH_ALLOC_INIT( l, scratch );
    1434           0 :   fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof(fd_net_ctx_t), sizeof(fd_net_ctx_t) );
    1435           0 :   FD_TEST( ctx->xsk_cnt!=0 );
    1436           0 :   FD_TEST( ctx->free_tx.queue!=NULL );
    1437           0 :   (void)FD_SCRATCH_ALLOC_APPEND( l, alignof(ulong), tile->xdp.free_ring_depth * sizeof(ulong) );
    1438           0 :   ctx->netdev_buf_sz           = fd_netdev_tbl_footprint( NETDEV_MAX, BOND_MASTER_MAX ); /* set before the alloc below */
    1438           0 :   ctx->netdev_buf              = FD_SCRATCH_ALLOC_APPEND( l, fd_netdev_tbl_align(), ctx->netdev_buf_sz );
    1439             : 
    1440           0 :   ctx->net_tile_id  = (uint)tile->kind_id;
    1441           0 :   ctx->net_tile_cnt = (uint)fd_topo_tile_name_cnt( topo, tile->name );
    1442             : 
    1443           0 :   ctx->bind_address                   = tile->net.bind_address;
    1444           0 :   ctx->shred_listen_port              = tile->net.shred_listen_port;
    1445           0 :   ctx->quic_transaction_listen_port   = tile->net.quic_transaction_listen_port;
    1446           0 :   ctx->legacy_transaction_listen_port = tile->net.legacy_transaction_listen_port;
    1447           0 :   ctx->gossip_listen_port             = tile->net.gossip_listen_port;
    1448           0 :   ctx->repair_intake_listen_port      = tile->net.repair_intake_listen_port;
    1449           0 :   ctx->repair_serve_listen_port       = tile->net.repair_serve_listen_port;
    1450           0 :   ctx->send_src_port                  = tile->net.send_src_port;
    1451             : 
    1452             :   /* Put a bound on chunks we read from the input, to make sure they
    1453             :      are within in the data region of the workspace. */
    1454             : 
    1455           0 :   if( FD_UNLIKELY( !tile->in_cnt ) ) FD_LOG_ERR(( "net tile in link cnt is zero" ));
    1456           0 :   if( FD_UNLIKELY( tile->in_cnt>MAX_NET_INS ) ) FD_LOG_ERR(( "net tile in link cnt %lu exceeds MAX_NET_INS %lu", tile->in_cnt, MAX_NET_INS ));
    1457           0 :   FD_TEST( tile->in_cnt<=MAX_NET_INS );
    1458           0 :   for( ulong i=0UL; i<tile->in_cnt; i++ ) {
    1459           0 :     fd_topo_link_t * link = &topo->links[ tile->in_link_id[ i ] ];
    1460           0 :     if( FD_UNLIKELY( link->mtu!=FD_NET_MTU ) ) FD_LOG_ERR(( "net tile in link %s does not have a normal MTU", link->name ));
    1461             : 
    1462           0 :     ctx->in[ i ].mem    = topo->workspaces[ topo->objs[ link->dcache_obj_id ].wksp_id ].wksp;
    1463           0 :     ctx->in[ i ].chunk0 = fd_dcache_compact_chunk0( ctx->in[ i ].mem, link->dcache );
    1464           0 :     ctx->in[ i ].wmark  = fd_dcache_compact_wmark( ctx->in[ i ].mem, link->dcache, link->mtu );
    1465           0 :   }
    1466             : 
    1467           0 :   for( ulong i=0UL; i<tile->out_cnt; i++ ) {
    1468           0 :     fd_topo_link_t * out_link = &topo->links[ tile->out_link_id[ i ] ];
    1469           0 :     if( strcmp( out_link->name, "net_quic" ) == 0 ) {
    1470           0 :       fd_topo_link_t * quic_out = out_link;
    1471           0 :       ctx->quic_out->mcache = quic_out->mcache;
    1472           0 :       ctx->quic_out->sync   = fd_mcache_seq_laddr( ctx->quic_out->mcache );
    1473           0 :       ctx->quic_out->depth  = fd_mcache_depth( ctx->quic_out->mcache );
    1474           0 :       ctx->quic_out->seq    = fd_mcache_seq_query( ctx->quic_out->sync );
    1475           0 :     } else if( strcmp( out_link->name, "net_shred" ) == 0 ) {
    1476           0 :       fd_topo_link_t * shred_out = out_link;
    1477           0 :       ctx->shred_out->mcache = shred_out->mcache;
    1478           0 :       ctx->shred_out->sync   = fd_mcache_seq_laddr( ctx->shred_out->mcache );
    1479           0 :       ctx->shred_out->depth  = fd_mcache_depth( ctx->shred_out->mcache );
    1480           0 :       ctx->shred_out->seq    = fd_mcache_seq_query( ctx->shred_out->sync );
    1481           0 :     } else if( strcmp( out_link->name, "net_gossvf" ) == 0 ) {
    1482           0 :       fd_topo_link_t * gossip_out = out_link;
    1483           0 :       ctx->gossvf_out->mcache = gossip_out->mcache;
    1484           0 :       ctx->gossvf_out->sync   = fd_mcache_seq_laddr( ctx->gossvf_out->mcache );
    1485           0 :       ctx->gossvf_out->depth  = fd_mcache_depth( ctx->gossvf_out->mcache );
    1486           0 :       ctx->gossvf_out->seq    = fd_mcache_seq_query( ctx->gossvf_out->sync );
    1487           0 :     } else if( strcmp( out_link->name, "net_repair" ) == 0 ) {
    1488           0 :       fd_topo_link_t * repair_out = out_link;
    1489           0 :       ctx->repair_out->mcache = repair_out->mcache;
    1490           0 :       ctx->repair_out->sync   = fd_mcache_seq_laddr( ctx->repair_out->mcache );
    1491           0 :       ctx->repair_out->depth  = fd_mcache_depth( ctx->repair_out->mcache );
    1492           0 :       ctx->repair_out->seq    = fd_mcache_seq_query( ctx->repair_out->sync );
    1493           0 :     } else if( strcmp( out_link->name, "net_netlnk" ) == 0 ) {
    1494           0 :       fd_topo_link_t * netlink_out = out_link;
    1495           0 :       ctx->neigh4_solicit->mcache = netlink_out->mcache;
    1496           0 :       ctx->neigh4_solicit->depth  = fd_mcache_depth( ctx->neigh4_solicit->mcache );
    1497           0 :       ctx->neigh4_solicit->seq    = fd_mcache_seq_query( fd_mcache_seq_laddr( ctx->neigh4_solicit->mcache ) );
    1498           0 :     } else if( strcmp( out_link->name, "net_send" ) == 0 ) {
    1499           0 :       fd_topo_link_t * send_out = out_link;
    1500           0 :       ctx->send_out->mcache = send_out->mcache;
    1501           0 :       ctx->send_out->sync   = fd_mcache_seq_laddr( ctx->send_out->mcache );
    1502           0 :       ctx->send_out->depth  = fd_mcache_depth( ctx->send_out->mcache );
    1503           0 :       ctx->send_out->seq    = fd_mcache_seq_query( ctx->send_out->sync );
    1504           0 :     } else {
    1505           0 :       FD_LOG_ERR(( "unrecognized out link `%s`", out_link->name ));
    1506           0 :     }
    1507           0 :   }
    1508             : 
    1509             :   /* Check if any of the tiles we set a listen port for do not have an outlink. */
    1510           0 :   if( FD_UNLIKELY( ctx->shred_listen_port!=0 && ctx->shred_out->mcache==NULL ) ) {
    1511           0 :     FD_LOG_ERR(( "shred listen port set but no out link was found" ));
    1512           0 :   } else if( FD_UNLIKELY( ctx->quic_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) {
    1513           0 :     FD_LOG_ERR(( "quic transaction listen port set but no out link was found" ));
    1514           0 :   } else if( FD_UNLIKELY( ctx->legacy_transaction_listen_port!=0 && ctx->quic_out->mcache==NULL ) ) {
    1515           0 :     FD_LOG_ERR(( "legacy transaction listen port set but no out link was found" ));
    1516           0 :   } else if( FD_UNLIKELY( ctx->gossip_listen_port!=0 && ctx->gossvf_out->mcache==NULL ) ) {
    1517           0 :     FD_LOG_ERR(( "gossip listen port set but no out link was found" ));
    1518           0 :   } else if( FD_UNLIKELY( ctx->repair_intake_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) {
    1519           0 :     FD_LOG_ERR(( "repair intake port set but no out link was found" ));
    1520           0 :   } else if( FD_UNLIKELY( ctx->repair_serve_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) {
    1521           0 :     FD_LOG_ERR(( "repair serve listen port set but no out link was found" ));
    1522           0 :   } else if( FD_UNLIKELY( ctx->neigh4_solicit->mcache==NULL ) ) {
    1523           0 :     FD_LOG_ERR(( "netlink request link not found" ));
    1524           0 :   } else if( FD_UNLIKELY( ctx->send_src_port!=0 && ctx->send_out->mcache==NULL ) ) {
    1525           0 :     FD_LOG_ERR(( "send listen port set but no out link was found" ));
    1526           0 :   }
    1527             : 
    1528           0 :   for( uint j=0U; j<2U; j++ ) {
    1529           0 :     ctx->tx_flusher[ j ].pending_wmark         = (ulong)( (double)tile->xdp.xdp_tx_queue_size * 0.7 );
    1530           0 :     ctx->tx_flusher[ j ].tail_flush_backoff    = (long)( (double)tile->xdp.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) );
    1531           0 :     ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX;
    1532           0 :   }
    1533             : 
    1534             :   /* Join netbase objects */
    1535           0 :   FD_TEST( fd_fib4_join( ctx->fib_local, fd_topo_obj_laddr( topo, tile->xdp.fib4_local_obj_id ) ) );
    1536           0 :   FD_TEST( fd_fib4_join( ctx->fib_main, fd_topo_obj_laddr( topo, tile->xdp.fib4_main_obj_id  ) ) );
    1537             : 
    1538           0 :   ulong neigh4_obj_id = tile->xdp.neigh4_obj_id;
    1539           0 :   ulong ele_max   = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.ele_max",   neigh4_obj_id );
    1540           0 :   ulong probe_max = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.probe_max", neigh4_obj_id );
    1541           0 :   ulong seed      = fd_pod_queryf_ulong( topo->props, ULONG_MAX, "obj.%lu.seed",      neigh4_obj_id );
    1542           0 :   if( FD_UNLIKELY( (ele_max==ULONG_MAX) | (probe_max==ULONG_MAX) | (seed==ULONG_MAX) ) )
    1543           0 :     FD_LOG_ERR(( "neigh4 hmap properties not set" ));
    1544           0 :   if( FD_UNLIKELY( !fd_neigh4_hmap_join(
    1545           0 :       ctx->neigh4,
    1546           0 :       fd_topo_obj_laddr( topo, neigh4_obj_id ),
    1547           0 :       ele_max,
    1548           0 :       probe_max,
    1549           0 :       seed ) ) ) {
    1550           0 :     FD_LOG_ERR(( "fd_neigh4_hmap_join failed" ));
    1551           0 :   }
    1552             : 
    1553           0 :   init_device_table( ctx, fd_topo_obj_laddr( topo, tile->xdp.netdev_dbl_buf_obj_id ) );
    1554             : 
    1555             :   /* Initialize TX free ring */
    1556             : 
    1557           0 :   ulong const frame_sz  = 2048UL;
    1558           0 :   ulong       frame_off = 0UL;
    1559           0 :   ulong const tx_depth  = ctx->free_tx.depth;
    1560           0 :   for( ulong j=0; j<tx_depth; j++ ) {
    1561           0 :     ctx->free_tx.queue[ j ] = (ulong)ctx->umem + frame_off;
    1562           0 :     frame_off += frame_sz;
    1563           0 :   }
    1564           0 :   ctx->free_tx.prod = tx_depth;
    1565             : 
    1566             :   /* Initialize RX mcache chunks */
    1567             : 
    1568           0 :   for( ulong i=0UL; i<(tile->out_cnt); i++ ) {
    1569           0 :     fd_topo_link_t * out_link = &topo->links[ tile->out_link_id[ i  ] ];
    1570           0 :     fd_frag_meta_t * mcache   = out_link->mcache;
    1571           0 :     for( ulong j=0UL; j<fd_mcache_depth( mcache ); j++ ) {
    1572           0 :       mcache[ j ].chunk = (uint)( ctx->umem_chunk0 + (frame_off>>FD_CHUNK_LG_SZ) );
    1573           0 :       frame_off += frame_sz;
    1574           0 :     }
    1575           0 :   }
    1576             : 
    1577             :   /* Initialize FILL ring */
    1578             : 
    1579           0 :   int _charge_busy = 0;
    1580           0 :   for( uint j=0U; j<ctx->xsk_cnt; j++ ) {
    1581           0 :     frame_off = net_xsk_bootstrap( ctx, j, frame_off );
    1582           0 :     net_rx_wakeup( ctx, &ctx->xsk[ j ], &_charge_busy );
    1583           0 :     net_tx_wakeup( ctx, &ctx->xsk[ j ], &_charge_busy );
    1584           0 :   }
    1585             : 
    1586           0 :   if( FD_UNLIKELY( frame_off > ctx->umem_sz ) ) {
    1587           0 :     FD_LOG_ERR(( "UMEM is too small" ));
    1588           0 :   }
    1589           0 : }
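
Putting the three initialization loops above together, the UMEM is carved into disjoint frame regions.  With illustrative (hypothetical) sizes of xdp_tx_queue_size=1024, a single out link of depth 16384, one XSK seeded with 1024 FILL frames, and frame_sz=2048:

    [ 0 B   ,  2 MiB)  1024 TX frames          -> free_tx queue
    [ 2 MiB , 34 MiB)  16384 RX shadow frames  -> initial mcache chunks
    [34 MiB , 36 MiB)  1024 FILL frames        -> net_xsk_bootstrap

The final frame_off <= umem_sz check then guarantees the regions fit within the dcache-backed UMEM.
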
    1590             : 
    1591             : FD_FN_UNUSED static ulong
    1592             : populate_allowed_seccomp( fd_topo_t const *      topo,
    1593             :                           fd_topo_tile_t const * tile,
    1594             :                           ulong                  out_cnt,
    1595           0 :                           struct sock_filter *   out ) {
    1596           0 :   void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
    1597           0 :   FD_SCRATCH_ALLOC_INIT( l, scratch );
    1598           0 :   fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) );
    1599             : 
    1600             :   /* A bit of a hack: if there is no loopback XSK for this tile, we still need to pass
    1601             :      two "allow" FD arguments to the net policy, so we just make them both the same. */
    1602           0 :   int allow_fd2 = ctx->xsk_cnt>1UL ? ctx->xsk[ 1 ].xsk_fd : ctx->xsk[ 0 ].xsk_fd;
    1603           0 :   FD_TEST( ctx->xsk[ 0 ].xsk_fd >= 0 && allow_fd2 >= 0 );
    1604             : 
    1605           0 :   populate_sock_filter_policy_fd_xdp_tile( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->xsk[ 0 ].xsk_fd, (uint)allow_fd2 );
    1606           0 :   return sock_filter_policy_fd_xdp_tile_instr_cnt;
    1607           0 : }
    1608             : 
    1609             : FD_FN_UNUSED static ulong
    1610             : populate_allowed_fds( fd_topo_t const *      topo,
    1611             :                       fd_topo_tile_t const * tile,
    1612             :                       ulong                  out_fds_cnt,
    1613           0 :                       int *                  out_fds ) {
    1614           0 :   void * scratch = fd_topo_obj_laddr( topo, tile->tile_obj_id );
    1615           0 :   FD_SCRATCH_ALLOC_INIT( l, scratch );
    1616           0 :   fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) );
    1617             : 
    1618           0 :   if( FD_UNLIKELY( out_fds_cnt<6UL ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt ));
    1619             : 
    1620           0 :   ulong out_cnt = 0UL;
    1621             : 
    1622           0 :   out_fds[ out_cnt++ ] = 2; /* stderr */
    1623           0 :   if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) )
    1624           0 :     out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */
    1625             : 
    1626           0 :                                       out_fds[ out_cnt++ ] = ctx->xsk[ 0 ].xsk_fd;
    1627           0 :                                       out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 0 ];
    1628           0 :   if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->xsk[ 1 ].xsk_fd;
    1629           0 :   if( FD_LIKELY( ctx->xsk_cnt>1UL ) ) out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 1 ];
    1630           0 :   return out_cnt;
    1631           0 : }
    1632             : 
    1633           0 : #define STEM_BURST (1UL)
    1634           0 : #define STEM_LAZY ((ulong)30e3) /* 30 us */
    1635             : 
    1636           0 : #define STEM_CALLBACK_CONTEXT_TYPE  fd_net_ctx_t
    1637           0 : #define STEM_CALLBACK_CONTEXT_ALIGN alignof(fd_net_ctx_t)
    1638             : 
    1639           0 : #define STEM_CALLBACK_METRICS_WRITE       metrics_write
    1640           0 : #define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
    1641           0 : #define STEM_CALLBACK_BEFORE_CREDIT       before_credit
    1642           0 : #define STEM_CALLBACK_BEFORE_FRAG         before_frag
    1643           0 : #define STEM_CALLBACK_DURING_FRAG         during_frag
    1644           0 : #define STEM_CALLBACK_AFTER_FRAG          after_frag
    1645             : 
    1646             : #include "../../stem/fd_stem.c"
    1647             : 
    1648             : #ifndef FD_TILE_TEST
    1649             : fd_topo_run_tile_t fd_tile_net = {
    1650             :   .name                     = "net",
    1651             :   .populate_allowed_seccomp = populate_allowed_seccomp,
    1652             :   .populate_allowed_fds     = populate_allowed_fds,
    1653             :   .scratch_align            = scratch_align,
    1654             :   .scratch_footprint        = scratch_footprint,
    1655             :   .privileged_init          = privileged_init,
    1656             :   .unprivileged_init        = unprivileged_init,
    1657             :   .run                      = stem_run,
    1658             : };
    1659             : #endif

Generated by: LCOV version 1.14