#ifndef HEADER_fd_src_waltz_xdp_fd_xsk_h
#define HEADER_fd_src_waltz_xdp_fd_xsk_h

#if defined(__linux__)

/* fd_xsk manages an XSK file descriptor and provides RX/TX buffers.

   ### Background

   AF_XDP is a Linux API providing kernel-bypass networking in the form
   of shared memory ring buffers accessible from userspace. The kernel
   redirects packets from/to these buffers with the appropriate XDP
   configuration (XDP_REDIRECT). AF_XDP is hardware-agnostic and allows
   sharing a NIC with the Linux networking stack (unlike e.g. DPDK).
   This allows for deployment in existing, heterogeneous networks. An
   AF_XDP socket is called "XSK". The shared memory region storing the
   packet data flowing through an XSK is called "UMEM".

   XDP (eXpress Data Path) is a framework for installing hooks in the
   form of eBPF programs at an early stage of packet processing (i.e.
   before tc and netfilter). eBPF is user-deployable JIT-compiled
   bytecode that usually runs inside the kernel. Some hardware/driver
   combinations optionally allow offloading eBPF processing to NICs.
   This is not to be confused with other BPF-derived ISAs such as sBPF
   (Solana BPF).
   +--- Figure 1: AF_XDP RX Block Diagram -----------------+
   |                                                       |
   |   ┌─────┐   ┌────────┐   ┌─────┐ XDP_PASS ┌─────────┐ |
   |   │ NIC ├──>│ Driver ├──>│ XDP ├─────────>│ sk_buff │ |
   |   └─────┘   └────────┘   └─┬───┘          └─────────┘ |
   |                            │                          |
   |                            │ XDP_REDIRECT             |
   |                            │                          |
   |                    ┌───────▼──┐       ┌─────────┐     |
   |                    │ XSK/UMEM ├──────>│ fd_aio  │     |
   |                    └──────────┘       └─────────┘     |
   |                                                       |
   +-------------------------------------------------------+

   Figure 1 shows a simplified block diagram of RX packet flow within
   the kernel in `XDP_FLAGS_DRV_MODE` mode. Notably, the chain of eBPF
   programs installed in the XDP facility gets invoked for every
   incoming packet. If all programs return the `XDP_PASS` action, the
   packet continues its usual path to the Linux networking stack, where
   it is allocated an sk_buff and eventually flows through ip_rcv(), tc,
   and netfilter before reaching downstream sockets.
   If the `XDP_REDIRECT` action is taken, however, the packet is copied
   to the UMEM of an XSK and an RX queue entry is allocated. An fd_aio
   backend is provided by fd_xdp_aio.
   The more generic `XDP_FLAGS_SKB_MODE` XDP mode falls back to
   sk_buff-based memory management (still skipping the rest of the
   generic path), but is more widely available.

   +--- Figure 2: AF_XDP TX Block Diagram -------------+
   |                                                   |
   |  ┌────────┐   ┌──────────┐   ┌────────┐   ┌─────┐ |
   |  │ fd_aio ├──>│ XSK/UMEM ├──>│ Driver ├──>│ NIC │ |
   |  └────────┘   └──────────┘   └────────┘   └─────┘ |
   |                                                   |
   +---------------------------------------------------+

   Figure 2 shows a simplified block diagram of the TX packet flow.
   Userspace applications deliver packets to the XSK/UMEM buffers. The
   kernel then forwards these packets to the NIC. Because the regular
   kernel networking stack is bypassed, the application is responsible
   for maintaining a routing table to resolve layer-3 destination
   addresses to NICs and layer-2 addresses. As in the RX flow,
   netfilter (iptables, nftables) is not available.

   ### Memory Management

   The UMEM area is allocated from userspace. It is recommended to use
   the fd_util shmem/wksp APIs to obtain large page-backed memory. UMEM
   is divided into equally sized frames. At any point in time, each
   frame is either owned by userspace or the kernel. On initialization,
   all frames are owned by userspace.

   Changes in UMEM frame ownership and packet RX/TX events are
   transmitted via four rings allocated by the kernel (mmap()ed in by
   the user). This allows for out-of-order processing of packets.

   Data flow:
     (U->K) is userspace-to-kernel communication, and
     (K->U) is kernel-to-userspace.

     FILL         Free frames are provided to the kernel using the FILL
     (U->K)       ring. The kernel may populate these frames with RX
                  packet data.

     RX           Once the kernel has populated a FILL frame with RX
     (K->U)       packet data, it passes back the frame to userspace
                  via the RX queue.

     TX           TX frames sent by userspace are provided to the
     (U->K)       kernel using the TX ring.

     COMPLETION   Once the kernel has processed a TX frame, it passes
     (K->U)       back the frame to userspace via the COMPLETION
                  queue.

   Combined, the FILL-RX and TX-COMPLETION rings form two pairs. The
   kernel will not move frames between the pairs. */

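/* As an illustration of the frame life cycle above: a minimal RX
   round trip, sketched with hypothetical helpers (fill_ring_push,
   rx_ring_pop, handle_packet are for exposition only and are not part
   of this API):

     ulong frame = 0UL;                  // byte offset of a free UMEM frame
     fill_ring_push( xsk, frame );       // (U->K) hand the frame to the kernel
     // ... kernel receives a packet and copies it into the frame ...
     struct xdp_desc desc;
     if( rx_ring_pop( xsk, &desc ) ) {   // (K->U) harvest the RX event
       uchar * pkt = (uchar *)umem_addr + desc.addr;
       handle_packet( pkt, desc.len );
       fill_ring_push( xsk, desc.addr ); // recycle the frame
     }
*/
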
#include <linux/if_link.h>
#include <linux/if_xdp.h>
#include <net/if.h>

#include "../../util/fd_util_base.h"

/* FD_XSK_UMEM_ALIGN: byte alignment of UMEM area within fd_xsk_t.
   This requirement is set by the kernel as of Linux 4.18. */
#define FD_XSK_UMEM_ALIGN (4096UL)

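/* For example, a suitably aligned UMEM region could be obtained as in
   the sketch below (sizes are arbitrary placeholders, error handling
   omitted; production deployments would typically prefer the fd_util
   shmem/wksp APIs with huge page-backed memory):

     #include <stdlib.h>

     ulong frame_cnt = 2048UL;  // placeholder
     ulong frame_sz  = 2048UL;  // placeholder
     void * umem = aligned_alloc( FD_XSK_UMEM_ALIGN, frame_cnt*frame_sz );
*/
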
/* fd_xdp_ring_t describes an XSK descriptor ring in the thread group's
   local address space. All pointers point into the kernel-managed XSK
   descriptor buffer at [mem,mem+map_sz) and are valid during the
   lifetime of an fd_xsk_t join. The ring producer and consumer are
   synchronized via incrementing sequence numbers that wrap at 2^32. */

struct __attribute__((aligned(64UL))) fd_xdp_ring {
  /* This point is 64-byte aligned */

  /* mmap() params, only used during join/leave for munmap() */

  void * mem;    /* Points to start of shared descriptor ring mmap region */
  ulong  map_sz; /* Size of shared descriptor ring mmap region */
  ulong  _pad_0x10;
  ulong  _pad_0x18;

  /* This point is 64-byte aligned */

  /* Pointers to fields of the opaque XSK ring structure.
     This indirection is required because the memory layout of the
     kernel-provided descriptor rings is unstable. The field offsets
     can be queried using getsockopt(SOL_XDP, XDP_MMAP_OFFSETS). */

  union {
    void *            ptr;         /* Opaque pointer */
    struct xdp_desc * packet_ring; /* For RX, TX rings */
    ulong *           frame_ring;  /* For FILL, COMPLETION rings */
  };
  uint * flags; /* Points to flags in shared descriptor ring */
  uint * prod;  /* Points to producer seq in shared descriptor ring */
  uint * cons;  /* Points to consumer seq in shared descriptor ring */

  /* This point is 64-byte aligned */

  /* Managed by fd_xsk_t */

  uint depth;       /* Capacity of ring in number of entries */
  uint cached_prod; /* Cached value of *prod */
  uint cached_cons; /* Cached value of *cons */
};
typedef struct fd_xdp_ring fd_xdp_ring_t;

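/* As a sketch of how the indirection is set up (this mirrors the
   standard AF_XDP mapping sequence; the exact validation and error
   paths live in fd_xsk.c): query the field offsets, mmap() the ring,
   then resolve the field pointers. Shown here for the FILL ring with
   a hypothetical fr_depth:

     #include <sys/mman.h>
     #include <sys/socket.h>

     struct xdp_mmap_offsets off;
     socklen_t optlen = sizeof(off);
     getsockopt( xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen );

     ulong  map_sz = off.fr.desc + fr_depth*sizeof(ulong);
     void * mem    = mmap( NULL, map_sz, PROT_READ|PROT_WRITE,
                           MAP_SHARED|MAP_POPULATE, xsk_fd,
                           XDP_UMEM_PGOFF_FILL_RING );

     ring->mem        = mem;
     ring->map_sz     = map_sz;
     ring->frame_ring = (ulong *)( (ulong)mem + off.fr.desc     );
     ring->flags      = (uint  *)( (ulong)mem + off.fr.flags    );
     ring->prod       = (uint  *)( (ulong)mem + off.fr.producer );
     ring->cons       = (uint  *)( (ulong)mem + off.fr.consumer );
     ring->depth      = (uint)fr_depth;
*/
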
#define FD_XDP_RING_ROLE_PROD 0
#define FD_XDP_RING_ROLE_CONS 1

/* fd_xdp_ring_empty returns 1 if the ring is empty, 0 otherwise.
   'role' is FD_XDP_RING_ROLE_PROD if userspace is the producer (fill, tx),
   and FD_XDP_RING_ROLE_CONS if userspace is the consumer (rx, completion). */

static inline int
fd_xdp_ring_empty( fd_xdp_ring_t * ring, uint role ) {
  if( role == FD_XDP_RING_ROLE_PROD ) {
    /* If potentially stale cached_cons says everything consumed,
       it's definitely empty. Else, refresh cached seq. */
    if( FD_UNLIKELY( ring->cached_prod == ring->cached_cons ) ) return 1;
    ring->cached_cons = FD_VOLATILE_CONST( *ring->cons );
  } else {
    /* If potentially stale cached_prod says we have more to read,
       it's definitely non-empty. Else, refresh cached seq. */
    if( FD_LIKELY( ring->cached_cons < ring->cached_prod ) ) return 0;
    ring->cached_prod = FD_VOLATILE_CONST( *ring->prod );
  }
  return ring->cached_prod == ring->cached_cons;
}
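
/* For example, a consumer could drain the RX ring as in the following
   sketch (assumes depth is a power of two, as the kernel requires;
   batching and the exact publication discipline are simplified):

     fd_xdp_ring_t * rx = &xsk->ring_rx;
     while( !fd_xdp_ring_empty( rx, FD_XDP_RING_ROLE_CONS ) ) {
       struct xdp_desc desc = rx->packet_ring[ rx->cached_cons & (rx->depth-1U) ];
       // ... process packet at umem_addr + desc.addr, length desc.len ...
       rx->cached_cons++;
       FD_VOLATILE( *rx->cons ) = rx->cached_cons;  // return frame to kernel
     }
*/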

/* fd_xdp_ring_full returns 1 if the ring is full, 0 otherwise.
   Assumes caller is the producer to this ring (fill, tx). */
static inline int
fd_xdp_ring_full( fd_xdp_ring_t * ring ) {
  /* If potentially stale cached_cons says we have more space,
     it's definitely not full. Else, refresh cached seq. */
  if( FD_LIKELY( ring->cached_prod - ring->cached_cons < ring->depth ) ) return 0;
  ring->cached_cons = FD_VOLATILE_CONST( *ring->cons );
  return ring->cached_prod - ring->cached_cons >= ring->depth;
}
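
/* Conversely, a producer could hand a free frame back to the kernel
   via the FILL ring as in this sketch ('frame' is a byte offset into
   UMEM; same simplifications as above):

     fd_xdp_ring_t * fr = &xsk->ring_fr;
     if( !fd_xdp_ring_full( fr ) ) {
       fr->frame_ring[ fr->cached_prod & (fr->depth-1U) ] = frame;
       fr->cached_prod++;
       FD_VOLATILE( *fr->prod ) = fr->cached_prod;  // publish to kernel
     }
*/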

/* fd_xsk_params_t: Memory layout parameters of XSK.
   Can be retrieved using fd_xsk_get_params() */

struct fd_xsk_params {
  /* {fr,rx,tx,cr}_depth: Number of frames allocated for the FILL, RX,
     TX, COMPLETION XSK rings respectively. */
  ulong fr_depth;
  ulong rx_depth;
  ulong tx_depth;
  ulong cr_depth;

  /* umem_addr: Pointer to UMEM in local address space */
  void * umem_addr;

  /* frame_sz: Controls the frame size used in the UMEM ring buffers. */
  ulong frame_sz;

  /* umem_sz: Total size of the contiguous UMEM area.
     Aligned by FD_XSK_UMEM_ALIGN. */
  ulong umem_sz;

  /* Linux interface index */
  uint if_idx;

  /* Interface queue index */
  uint if_queue_id;

  /* sockaddr_xdp.sxdp_flags additional params, e.g. XDP_ZEROCOPY */
  uint bind_flags;

  /* Whether the XSK memory should be included in core dumps */
  int core_dump;
};

typedef struct fd_xsk_params fd_xsk_params_t;

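/* A plausible way to populate these parameters (all values are
   placeholders for illustration; fd_xsk_init() checks the actual
   constraints):

     fd_xsk_params_t params = {
       .fr_depth    = 1024UL,
       .rx_depth    = 1024UL,
       .tx_depth    = 1024UL,
       .cr_depth    = 1024UL,
       .umem_addr   = umem,    // FD_XSK_UMEM_ALIGN aligned, as above
       .frame_sz    = 2048UL,
       .umem_sz     = (1024UL+1024UL+1024UL+1024UL)*2048UL,
       .if_idx      = if_nametoindex( "eth0" ),
       .if_queue_id = 0U,
       .bind_flags  = 0U,      // or e.g. XDP_ZEROCOPY where supported
       .core_dump   = 0
     };
*/
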
struct fd_xsk {
  /* Informational */
  uint if_idx;                /* index of net device */
  uint if_queue_id;           /* net device combined queue index */
  long log_suppress_until_ns; /* suppress log messages until this time */

  /* Kernel descriptor of XSK rings in local address space
     returned by getsockopt(SOL_XDP, XDP_MMAP_OFFSETS) */
  struct xdp_mmap_offsets offsets;

  /* AF_XDP socket file descriptor */
  int xsk_fd;

  /* ring_{rx,tx,fr,cr}: XSK ring descriptors */

  fd_xdp_ring_t ring_rx;
  fd_xdp_ring_t ring_tx;
  fd_xdp_ring_t ring_fr;
  fd_xdp_ring_t ring_cr;
};

typedef struct fd_xsk fd_xsk_t;

FD_PROTOTYPES_BEGIN

/* fd_xsk_init creates an XSK, registers UMEM, maps rings, and binds the
   socket to the given interface queue. This is a potentially
   destructive operation. As of 2024-Jun, AF_XDP zero copy support is
   still buggy in some device drivers.

   Assume that all traffic sent to this interface is compromised. On
   some devices, the NIC is instructed to DMA all incoming packets into
   UMEM, even ones not belonging to Firedancer. Those are later copied
   back out to skbs in software. This further implies that enabling
   AF_XDP can slow down the regular kernel receive path.

   Requires CAP_SYS_ADMIN. May issue the following syscalls:

   - socket( AF_XDP, SOCK_RAW, 0 ) = fd
   - setsockopt( fd, SOL_XDP, ... )
   - getsockopt( fd, SOL_XDP, ... )
   - mmap( ..., fd, ... )
   - bind( fd, ... )
   - munmap ; on fail
   - close  ; on fail */

fd_xsk_t *
fd_xsk_init( fd_xsk_t *              xsk,
             fd_xsk_params_t const * params );

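/* Typical usage might look like the following sketch (error handling
   shortened; 'params' as set up above):

     fd_xsk_t   xsk_mem;
     fd_xsk_t * xsk = fd_xsk_init( &xsk_mem, &params );
     if( !xsk ) FD_LOG_ERR(( "fd_xsk_init failed" ));
     // ... run RX/TX against xsk->ring_rx / xsk->ring_tx ...
     fd_xsk_delete( xsk );
*/
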
/* fd_xsk_delete unmaps the XSK descriptor rings and closes the AF_XDP
   socket file descriptor. Returns shxsk. */

void *
fd_xsk_delete( void * shxsk );

/* fd_xsk_rx_need_wakeup: returns whether a wakeup is required to
   complete an rx operation */

static inline int
fd_xsk_rx_need_wakeup( fd_xsk_t * xsk ) {
  return !!( *xsk->ring_fr.flags & XDP_RING_NEED_WAKEUP );
}

/* fd_xsk_tx_need_wakeup: returns whether a wakeup is required to
   complete a tx operation */

static inline int
fd_xsk_tx_need_wakeup( fd_xsk_t * xsk ) {
  return !!( *xsk->ring_tx.flags & XDP_RING_NEED_WAKEUP );
}

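/* When a ring reports XDP_RING_NEED_WAKEUP, the kernel expects a
   syscall kick before it resumes processing. A common pattern (a
   sketch of the standard AF_XDP idiom, not necessarily the exact loop
   used by callers of this header):

     #include <poll.h>
     #include <sys/socket.h>

     if( fd_xsk_tx_need_wakeup( xsk ) )
       sendto( xsk->xsk_fd, NULL, 0UL, MSG_DONTWAIT, NULL, 0U );

     if( fd_xsk_rx_need_wakeup( xsk ) ) {
       struct pollfd pfd = { .fd = xsk->xsk_fd, .events = POLLIN };
       poll( &pfd, 1U, 0 );  // kick the kernel to service the FILL ring
     }
*/
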
FD_PROTOTYPES_END

#endif /* defined(__linux__) */
#endif /* HEADER_fd_src_waltz_xdp_fd_xsk_h */