Line data Source code
1 : #ifndef HEADER_fd_src_waltz_xdp_fd_xsk_h 2 : #define HEADER_fd_src_waltz_xdp_fd_xsk_h 3 : 4 : #if defined(__linux__) 5 : 6 : /* fd_xsk manages an XSK file descriptor and provides RX/TX buffers. 7 : 8 : ### Background 9 : 10 : AF_XDP is a Linux API providing kernel-bypass networking in the form 11 : of shared memory ring buffers accessible from userspace. The kernel 12 : redirects packets from/to these buffers with the appropriate XDP 13 : configuration (XDP_REDIRECT). AF_XDP is hardware-agnostic and allows 14 : sharing a NIC with the Linux networking stack (unlike e.g. DPDK). 15 : This allows for deployment in existing, heterogeneous networks. An 16 : AF_XDP socket is called "XSK". The shared memory region storing the 17 : packet data flowing through an XSK is called "UMEM". 18 : 19 : XDP (eXpress Data Path) is a framework for installing hooks in the 20 : form of eBPF programs at an early stage of packet processing (i.e. 21 : before tc and netfilter). eBPF is user-deployable JIT-compiled 22 : bytecode that usually runs inside the kernel. Some hardware/driver 23 : combinations optionally allow offloading eBPF processing to NICs. 24 : This is not to be confused with other BPF-derived ISAs such as sBPF 25 : (Solana BPF). 26 : 27 : +--- Figure 1: AF_XDP RX Block Diagram -----------------+ 28 : | | 29 : | ┌─────┐ ┌────────┐ ┌─────┐ XDP_PASS ┌─────────┐ | 30 : | │ NIC ├──> Driver ├──> XDP ├──────────> sk_buff │ | 31 : | └─────┘ └────────┘ └─┬───┘ └─────────┘ | 32 : | │ | 33 : | │ XDP_REDIRECT | 34 : | │ | 35 : | ┌──▼───────┐ ┌─────────┐ | 36 : | │ XSK/UMEM ├──────> fd_aio │ | 37 : | └──────────┘ └─────────┘ | 38 : | | 39 : +-------------------------------------------------------+ 40 : 41 : Figure 1 shows a simplified block diagram of RX packet flow within 42 : the kernel in `XDP_FLAGS_DRV_MODE` mode. Notably, the chain of eBPF 43 : programs installed in the XDP facility get invoked for every incoming 44 : packet. If all programs return the `XDP_PASS` action, the packet 45 : continues its usual path to the Linux networking stack, where it will 46 : be allocated in sk_buff, and eventually flow through ip_rcv(), tc, 47 : and netfilter before reaching downstream sockets. 48 : If the `XDP_REDIRECT` action is taken however, the packet is copied 49 : to the UMEM of an XSK, and a RX queue entry is allocated. An fd_aio 50 : backend is provided by fd_xdp_aio. 51 : The more generic `XDP_FLAGS_SKB_MODE` XDP mode falls back to sk_buff- 52 : based memory mgmt (still skipping the rest of the generic path), but 53 : is more widely available. 54 : 55 : +--- Figure 2: AF_XDP TX Block Diagram -------------+ 56 : | | 57 : | ┌────────┐ ┌──────────┐ ┌────────┐ ┌─────┐ | 58 : | │ fd_aio ├──> XSK/UMEM ├──> Driver ├──> NIC │ | 59 : | └────────┘ └──────────┘ └────────┘ └─────┘ | 60 : | | 61 : +---------------------------------------------------+ 62 : 63 : Figure 2 shows a simplified block diagram of the TX packet flow. 64 : Userspace applications deliver packets to the XSK/UMEM buffers. The 65 : kernel then forwards these packets to the NIC. This also means that 66 : the application is responsible for maintaining a routing table to 67 : resolve layer-3 dest addrs to NICs and layer-2 addrs. As in the RX 68 : flow, netfilter (iptables, nftables) is not available. 69 : 70 : ### Memory Management 71 : 72 : The UMEM area is allocated from userspace. It is recommended to use 73 : the fd_util shmem/wksp APIs to obtain large page-backed memory. UMEM 74 : is divided into equally sized frames. At any point in time, each 75 : frame is either owned by userspace or the kernel. On initialization, 76 : all frames are owned by userspace. 77 : 78 : Changes in UMEM frame ownership and packet RX/TX events are 79 : transmitted via four rings allocated by the kernel (mmap()ed in by 80 : the user). This allows for out-of-order processing of packets. 81 : 82 : Data flow: 83 : (U->K) is userspace-to-kernel communication, and 84 : (K->U) is kernel-to-userspace. 85 : 86 : FILL Free frames are provided to the kernel using the FILL 87 : (U->K) ring. The kernel may populate these frames with RX 88 : packet data. 89 : 90 : RX Once the kernel has populated a FILL frame with RX 91 : (K->U) packet data, it passes back the frame to userspace 92 : via the RX queue. 93 : 94 : TX TX frames sent by userspace are provided to the 95 : (U->K) kernel using the TX ring. 96 : 97 : COMPLETION Once the kernel has processed a TX frame, it passes 98 : (K->U) back the frame to the userspace via the COMPLETION 99 : queue. 100 : 101 : Combined, the FILL-RX and TX-COMPLETION rings form two pairs. The 102 : kernel will not move frames between the pairs. */ 103 : 104 : #include <linux/if_link.h> 105 : #include <linux/if_xdp.h> 106 : #include <net/if.h> 107 : 108 : #include "../../util/fd_util_base.h" 109 : 110 : /* FD_XSK_UMEM_ALIGN: byte alignment of UMEM area within fd_xsk_t. 111 : This requirement is set by the kernel as of Linux 4.18. */ 112 : #define FD_XSK_UMEM_ALIGN (4096UL) 113 : 114 : /* fd_xdp_ring_t describes an XSK descriptor ring in the thread group's 115 : local address space. All pointers fall into kernel-managed XSK 116 : descriptor buffer at [mem;mem+mem_sz) that are valid during the 117 : lifetime of an fd_xsk_t join. The ring producer and consumer are 118 : synchronized via incrementing sequence numbers that wrap at 2^64. */ 119 : 120 : struct __attribute__((aligned(64UL))) fd_xdp_ring { 121 : /* This point is 64-byte aligned */ 122 : 123 : /* mmap() params, only used during join/leave for munmap() */ 124 : 125 : void * mem; /* Points to start of shared descriptor ring mmap region */ 126 : ulong map_sz; /* Size of shared descriptor ring mmap region */ 127 : ulong _pad_0x10; 128 : ulong _pad_0x18; 129 : 130 : /* This point is 64-byte aligned */ 131 : 132 : /* Pointers to fields opaque XSK ring structure. 133 : This indirection is required because the memory layout of the 134 : kernel-provided descriptor rings is unstable. The field offsets 135 : can be queried using getsockopt(SOL_XDP, XDP_MMAP_OFFSETS). */ 136 : 137 : union { 138 : void * ptr; /* Opaque pointer */ 139 : struct xdp_desc * packet_ring; /* For RX, TX rings */ 140 : ulong * frame_ring; /* For FILL, COMPLETION rings */ 141 : }; 142 : uint * flags; /* Points to flags in shared descriptor ring */ 143 : uint * prod; /* Points to producer seq in shared descriptor ring */ 144 : uint * cons; /* Points to consumer seq in shared descriptor ring */ 145 : 146 : /* This point is 64-byte aligned */ 147 : 148 : /* Managed by fd_xsk_t */ 149 : 150 : uint depth; /* Capacity of ring in no of entries */ 151 : uint cached_prod; /* Cached value of *prod */ 152 : uint cached_cons; /* Cached value of *cons */ 153 : }; 154 : typedef struct fd_xdp_ring fd_xdp_ring_t; 155 : 156 : /* fd_xsk_params_t: Memory layout parameters of XSK. 157 : Can be retrieved using fd_xsk_get_params() */ 158 : 159 : struct fd_xsk_params { 160 : /* {fr,rx,tx,cr}_depth: Number of frames allocated for the Fill, RX, 161 : TX, Completion XSK rings respectively. */ 162 : ulong fr_depth; 163 : ulong rx_depth; 164 : ulong tx_depth; 165 : ulong cr_depth; 166 : 167 : /* umem_addr: Pointer to UMEM in local address space */ 168 : void * umem_addr; 169 : 170 : /* frame_sz: Controls the frame size used in the UMEM ring buffers. */ 171 : ulong frame_sz; 172 : 173 : /* umem_sz: Total size of XSK ring shared memory area (contiguous). 174 : Aligned by FD_XSK_ALIGN. */ 175 : ulong umem_sz; 176 : 177 : /* Linux interface index */ 178 : uint if_idx; 179 : 180 : /* Interface queue index */ 181 : uint if_queue_id; 182 : 183 : /* sockaddr_xdp.sxdp_flags additional params, e.g. XDP_ZEROCOPY */ 184 : uint bind_flags; 185 : }; 186 : 187 : typedef struct fd_xsk_params fd_xsk_params_t; 188 : 189 : struct fd_xsk { 190 : /* Informational */ 191 : uint if_idx; /* index of net device */ 192 : uint if_queue_id; /* net device combined queue index */ 193 : long log_suppress_until_ns; /* suppress log messages until this time */ 194 : 195 : /* Kernel descriptor of XSK rings in local address space 196 : returned by getsockopt(SOL_XDP, XDP_MMAP_OFFSETS) */ 197 : struct xdp_mmap_offsets offsets; 198 : 199 : /* AF_XDP socket file descriptor */ 200 : int xsk_fd; 201 : 202 : /* ring_{rx,tx,fr,cr}: XSK ring descriptors */ 203 : 204 : fd_xdp_ring_t ring_rx; 205 : fd_xdp_ring_t ring_tx; 206 : fd_xdp_ring_t ring_fr; 207 : fd_xdp_ring_t ring_cr; 208 : }; 209 : 210 : typedef struct fd_xsk fd_xsk_t; 211 : 212 : FD_PROTOTYPES_BEGIN 213 : 214 : /* fd_xsk_init creates an XSK, registers UMEM, maps rings, and binds the 215 : socket to the given interface queue. This is a potentially 216 : destructive operation. As of 2024-Jun, AF_XDP zero copy support is 217 : still buggy in some device drivers. 218 : 219 : Assume that all traffic sent to this interface is compromised. On 220 : some devices, the NIC is instructed to DMA all incoming packets into 221 : UMEM, even ones not belonging to Firedancer. Those are then later 222 : on software-copied out to skbs again. This further implies that 223 : enabling AF_XDP can slow down the regular kernel receive path. 224 : 225 : Requires CAP_SYS_ADMIN. May issue the following syscalls: 226 : 227 : - socket( AF_XDP, SOCK_RAW, 0 ) = fd 228 : - setsockopt( fd, SOL_XDP, ... ) 229 : - getsockopt( fd, SOL_XDP, ... ) 230 : - mmap( ..., fd, ... ) 231 : - bind( fd, ... ) 232 : - munmap ; on fail 233 : - close ; on fail */ 234 : 235 : fd_xsk_t * 236 : fd_xsk_init( fd_xsk_t * xsk, 237 : fd_xsk_params_t const * params ); 238 : 239 : void * 240 : fd_xsk_delete( void * shxsk ); 241 : 242 : /* fd_xsk_rx_need_wakeup: returns whether a wakeup is required to 243 : complete a rx operation */ 244 : 245 : static inline int 246 0 : fd_xsk_rx_need_wakeup( fd_xsk_t * xsk ) { 247 0 : return !!( *xsk->ring_fr.flags & XDP_RING_NEED_WAKEUP ); 248 0 : } 249 : 250 : /* fd_xsk_tx_need_wakeup: returns whether a wakeup is required to 251 : complete a tx operation */ 252 : 253 : static inline int 254 0 : fd_xsk_tx_need_wakeup( fd_xsk_t * xsk ) { 255 0 : return !!( *xsk->ring_tx.flags & XDP_RING_NEED_WAKEUP ); 256 0 : } 257 : 258 : 259 : FD_PROTOTYPES_END 260 : 261 : #endif /* defined(__linux__) */ 262 : #endif /* HEADER_fd_src_waltz_xdp_fd_xsk_h */