Line data Source code
1 : #ifndef HEADER_fd_src_waltz_xdp_fd_xsk_h
2 : #define HEADER_fd_src_waltz_xdp_fd_xsk_h
3 :
4 : #if defined(__linux__)
5 :
6 : /* fd_xsk manages an XSK file descriptor and provides RX/TX buffers.
7 :
8 : ### Background
9 :
10 : AF_XDP is a Linux API providing kernel-bypass networking in the form
11 : of shared memory ring buffers accessible from userspace. The kernel
12 : redirects packets from/to these buffers with the appropriate XDP
13 : configuration (XDP_REDIRECT). AF_XDP is hardware-agnostic and allows
14 : sharing a NIC with the Linux networking stack (unlike e.g. DPDK).
15 : This allows for deployment in existing, heterogeneous networks. An
16 : AF_XDP socket is called "XSK". The shared memory region storing the
17 : packet data flowing through an XSK is called "UMEM".
18 :
19 : XDP (eXpress Data Path) is a framework for installing hooks in the
20 : form of eBPF programs at an early stage of packet processing (i.e.
21 : before tc and netfilter). eBPF is user-deployable JIT-compiled
22 : bytecode that usually runs inside the kernel. Some hardware/driver
23 : combinations optionally allow offloading eBPF processing to NICs.
24 : This is not to be confused with other BPF-derived ISAs such as sBPF
25 : (Solana BPF).
26 :
27 : +--- Figure 1: AF_XDP RX Block Diagram -----------------+
28 : | |
29 : | ┌─────┐ ┌────────┐ ┌─────┐ XDP_PASS ┌─────────┐ |
30 : | │ NIC ├──> Driver ├──> XDP ├──────────> sk_buff │ |
31 : | └─────┘ └────────┘ └─┬───┘ └─────────┘ |
32 : | │ |
33 : | │ XDP_REDIRECT |
34 : | │ |
35 : | ┌──▼───────┐ ┌─────────┐ |
36 : | │ XSK/UMEM ├──────> fd_aio │ |
37 : | └──────────┘ └─────────┘ |
38 : | |
39 : +-------------------------------------------------------+
40 :
41 : Figure 1 shows a simplified block diagram of RX packet flow within
42 : the kernel in `XDP_FLAGS_DRV_MODE` mode. Notably, the chain of eBPF
43 : programs installed in the XDP facility gets invoked for every incoming
44 : packet. If all programs return the `XDP_PASS` action, the packet
45 : continues its usual path to the Linux networking stack, where it will
46 : be allocated in sk_buff, and eventually flow through ip_rcv(), tc,
47 : and netfilter before reaching downstream sockets.
48 : If the `XDP_REDIRECT` action is taken however, the packet is copied
49 : to the UMEM of an XSK, and a RX queue entry is allocated. An fd_aio
50 : backend is provided by fd_xdp_aio.
51 : The more generic `XDP_FLAGS_SKB_MODE` XDP mode falls back to sk_buff-
52 : based memory mgmt (still skipping the rest of the generic path), but
53 : is more widely available.
54 :
55 : +--- Figure 2: AF_XDP TX Block Diagram -------------+
56 : | |
57 : | ┌────────┐ ┌──────────┐ ┌────────┐ ┌─────┐ |
58 : | │ fd_aio ├──> XSK/UMEM ├──> Driver ├──> NIC │ |
59 : | └────────┘ └──────────┘ └────────┘ └─────┘ |
60 : | |
61 : +---------------------------------------------------+
62 :
63 : Figure 2 shows a simplified block diagram of the TX packet flow.
64 : Userspace applications deliver packets to the XSK/UMEM buffers. The
65 : kernel then forwards these packets to the NIC. This also means that
66 : the application is responsible for maintaining a routing table to
67 : resolve layer-3 dest addrs to NICs and layer-2 addrs. As in the RX
68 : flow, netfilter (iptables, nftables) is not available.
69 :
70 : ### Memory Management
71 :
72 : The UMEM area is allocated from userspace. It is recommended to use
73 : the fd_util shmem/wksp APIs to obtain large page-backed memory. UMEM
74 : is divided into equally sized frames. At any point in time, each
75 : frame is either owned by userspace or the kernel. On initialization,
76 : all frames are owned by userspace.
77 :
78 : Changes in UMEM frame ownership and packet RX/TX events are
79 : transmitted via four rings allocated by the kernel (mmap()ed in by
80 : the user). This allows for out-of-order processing of packets.
81 :
82 : Data flow:
83 : (U->K) is userspace-to-kernel communication, and
84 : (K->U) is kernel-to-userspace.
85 :
86 : FILL Free frames are provided to the kernel using the FILL
87 : (U->K) ring. The kernel may populate these frames with RX
88 : packet data.
89 :
90 : RX Once the kernel has populated a FILL frame with RX
91 : (K->U) packet data, it passes back the frame to userspace
92 : via the RX queue.
93 :
94 : TX TX frames sent by userspace are provided to the
95 : (U->K) kernel using the TX ring.
96 :
97 : COMPLETION Once the kernel has processed a TX frame, it passes
98 : (K->U) back the frame to the userspace via the COMPLETION
99 : queue.
100 :
101 : Combined, the FILL-RX and TX-COMPLETION rings form two pairs. The
102 : kernel will not move frames between the pairs. */
103 :
104 : #include <linux/if_link.h>
105 : #include <net/if.h>
106 :
107 : #include "../../util/fd_util_base.h"
108 :
109 : /* FD_XSK_ALIGN: alignment of fd_xsk_t. */
110 66 : #define FD_XSK_ALIGN (4096UL)
111 :
112 : /* FD_XSK_UMEM_ALIGN: byte alignment of UMEM area within fd_xsk_t.
113 : This requirement is set by the kernel as of Linux 4.18. */
114 21 : #define FD_XSK_UMEM_ALIGN (4096UL)
115 :
116 : /* Forward declarations */
117 : struct fd_xsk_private;
118 : typedef struct fd_xsk_private fd_xsk_t;
119 :
120 : /* fd_xsk_frame_meta_t: Frame metadata used to identify packet */
121 :
122 : #define FD_XDP_FRAME_META_ALIGN (16UL)
123 :
124 : struct __attribute__((aligned(FD_XDP_FRAME_META_ALIGN))) fd_xsk_frame_meta {
125 : ulong off; /* Byte offset from UMEM start to start of packet data */
126 : uint sz; /* Size of the packet data that starts at `off` */
127 : uint flags; /* Reserved: no meaning is defined for this field yet */
128 : };
129 : typedef struct fd_xsk_frame_meta fd_xsk_frame_meta_t;
130 :
131 : /* fd_xsk_params_t: Memory layout parameters of XSK.
132 : Can be retrieved using fd_xsk_get_params() */
133 :
134 : struct fd_xsk_params {
135 : /* {fr,rx,tx,cr}_depth: Number of frames allocated for the Fill, RX,
136 : TX, Completion XSK rings respectively. */
137 : ulong fr_depth; /* Fill ring */
138 : ulong rx_depth; /* RX ring */
139 : ulong tx_depth; /* TX ring */
140 : ulong cr_depth; /* Completion ring */
141 :
142 : /* frame_sz: Size of one UMEM frame (UMEM is split into equal frames). */
143 : ulong frame_sz;
144 :
145 : /* umem_sz: Total size of XSK ring shared memory area (contiguous).
146 : Aligned by FD_XSK_ALIGN. */
147 : ulong umem_sz;
148 : };
149 : typedef struct fd_xsk_params fd_xsk_params_t;
150 :
151 : FD_PROTOTYPES_BEGIN
152 :
153 : /* Setup API **********************************************************/
154 :
155 : /* fd_xsk_{align,footprint} return the required alignment and
156 : footprint of a memory region suitable for use as an fd_xsk_t.
157 : See fd_xsk_new for explanations on parameters. */
158 :
159 : FD_FN_CONST ulong
160 : fd_xsk_align( void );
161 :
162 : FD_FN_CONST ulong
163 : fd_xsk_footprint( ulong frame_sz,
164 : ulong fr_depth,
165 : ulong rx_depth,
166 : ulong tx_depth,
167 : ulong cr_depth );
168 :
169 : /* fd_xsk_new formats an unused memory region for use as an fd_xsk_t.
170 : shmem must point to a memory region that matches fd_xsk_align() and
171 : fd_xsk_footprint(). frame_sz controls the frame size used in the
172 : UMEM ring buffers and should be either 2048 or 4096.
173 : {fr,rx,tx,cr}_depth control the number of frames allocated for the
174 : Fill, RX, TX, Completion rings respectively. Zero-copy mode is not
175 : selected here; pass it via bind_flags to fd_xsk_init. Returns a
176 : handle suitable for fd_xsk_join() on success. */
177 :
178 : void *
179 : fd_xsk_new( void * shmem,
180 : ulong frame_sz,
181 : ulong fr_depth,
182 : ulong rx_depth,
183 : ulong tx_depth,
184 : ulong cr_depth );
185 :
186 : /* fd_xsk_join joins the caller to the fd_xsk_t */
187 :
188 : fd_xsk_t *
189 : fd_xsk_join( void * shxsk );
190 :
191 : /* fd_xsk_init creates an XSK, registers UMEM, maps rings, and binds the
192 : socket to the given interface queue. This is a potentially
193 : destructive operation. As of 2024-Jun, AF_XDP zero copy support is
194 : still buggy in some device drivers.
195 :
196 : Assume that all traffic sent to this interface is compromised. On
197 : some devices, the NIC is instructed to DMA all incoming packets into
198 : UMEM, even ones not belonging to Firedancer. Those are then later
199 : on software-copied out to skbs again. This further implies that
200 : enabling AF_XDP can slow down the regular kernel receive path.
201 :
202 : Requires CAP_SYS_ADMIN. May issue the following syscalls:
203 :
204 : - socket( AF_XDP, SOCK_RAW, 0 ) = fd
205 : - setsockopt( fd, SOL_XDP, ... )
206 : - getsockopt( fd, SOL_XDP, ... )
207 : - mmap( ..., fd, ... )
208 : - bind( fd, ... )
209 : - munmap ; on fail
210 : - close ; on fail */
211 :
212 : fd_xsk_t *
213 : fd_xsk_init( fd_xsk_t * xsk,
214 : uint if_idx, /* see if_nametoindex(3) */
215 : uint if_queue, /* queue index (type combined) */
216 : uint bind_flags ); /* e.g. XDP_ZEROCOPY */
217 :
218 : /* fd_xsk_fini unmaps XSK rings and closes the XSK file descriptor.
219 : This effectively returns the interface to the state before
220 : fd_xsk_init.
221 :
222 : May issue the following syscalls:
223 :
224 : - munmap
225 : - close */
226 :
227 : fd_xsk_t *
228 : fd_xsk_fini( fd_xsk_t * xsk );
229 :
230 : /* fd_xsk_leave leaves a current local join and releases all kernel
231 : resources. Returns a pointer to the underlying shared memory region
232 : on success and NULL on failure (logs details). Reasons for failure
233 : include xsk is NULL. */
234 :
235 : void *
236 : fd_xsk_leave( fd_xsk_t * xsk );
237 :
238 : /* fd_xsk_delete unformats a memory region used as an fd_xsk_t. Assumes
239 : nobody is joined to the region. Returns a pointer to the underlying
240 : shared memory region or NULL if used obviously in error (e.g. shxsk
241 : does not point to an fd_xsk_t ... logs details). The ownership of
242 : the memory region is transferred to the caller on success. */
243 :
244 : void *
245 : fd_xsk_delete( void * shxsk );
246 :
247 : /* I/O API ************************************************************/
248 :
249 : /* fd_xsk_rx_enqueue: Enqueues a batch of frames for RX.
250 :
251 : An RX enqueue transfers ownership of frames to the kernel using the
252 : fill ring, providing it space for incoming packet data. Successful
253 : enqueue does not imply that packets have actually been received, but
254 : rather just indicates that the frame memory is registered with the
255 : AF_XDP socket.
256 :
257 : offsets points to an array containing offsets_cnt items.
258 : Each offsets[k] for k in [0;offsets_cnt-1] is the frame's byte offset
259 : relative to the start of the UMEM region. Returns the number of
260 : frames n enqueued where n<=offsets_cnt. Each frame (identified by
261 : its offset) may not be reused in another enqueue until it is returned
262 : in fd_xsk_rx_complete. The frames that failed to enqueue are in
263 : [n;offsets_cnt-1] and may be retried in a later call. */
264 :
265 : ulong
266 : fd_xsk_rx_enqueue( fd_xsk_t * xsk,
267 : ulong * offsets,
268 : ulong offsets_cnt );
269 :
270 : /* fd_xsk_rx_enqueue2: See fd_xsk_rx_enqueue.
271 :
272 : meta points to an array containing meta_cnt items. For each k in
273 : [0;meta_cnt-1], meta[k].off is the frame's byte offset relative to
274 : the start of the UMEM region. meta[k].{sz,flags} are ignored. */
275 :
276 : ulong
277 : fd_xsk_rx_enqueue2( fd_xsk_t * xsk,
278 : fd_xsk_frame_meta_t * meta,
279 : ulong meta_cnt );
280 :
281 : /* fd_xsk_rx_complete: Receives RX completions for a batch of frames.
282 :
283 : An RX completion means that a packet has been received and transfers
284 : ownership of the frame holding the packet over to userspace.
285 : meta_cnt is the number of packets that the caller is able to receive.
286 : meta points to an array containing meta_cnt records where each k in
287 : [0,meta_cnt-1] may fill a packet meta at meta[k]. Returns the number
288 : of packets actually received, which may be less than meta_cnt. */
289 :
290 : ulong
291 : fd_xsk_rx_complete( fd_xsk_t * xsk,
292 : fd_xsk_frame_meta_t * meta,
293 : ulong meta_cnt );
294 :
295 :
296 : /* fd_xsk_tx_enqueue: Enqueues a batch of frames for TX.
297 :
298 : meta_cnt is the number of packets to attempt to enqueue for transmit.
299 : meta points to an array containing meta_cnt records where each k in
300 : [0,meta_cnt-1] enqueues frame at meta[k]. Returns the number of
301 : frames n actually enqueued, where n<=meta_cnt. Successful enqueue
302 : does not imply that packets have actually been sent out to the
303 : network, but rather just indicates that the frame memory is
304 : registered with the AF_XDP socket. The frames that failed to
305 : enqueue are meta[k] for k in [n;meta_cnt-1] and may be retried later.
306 : flush presumably requests a kernel TX wakeup (TODO confirm). */
307 :
308 : ulong
309 : fd_xsk_tx_enqueue( fd_xsk_t * xsk,
310 : fd_xsk_frame_meta_t * meta,
311 : ulong meta_cnt,
312 : int flush );
313 :
314 :
315 : /* fd_xsk_tx_complete: Check for TX completions and reclaim frames.
316 :
317 : A TX completion occurs when a previously enqueued TX packet has been
318 : fully handed off to the NIC or dropped. This transfers the ownership
319 : of the corresponding frame back to the XSK, where the caller can
320 : retrieve it for future writes using this function. Note that this
321 : does not guarantee successful delivery to the network destination.
322 :
323 : offsets points to an array containing offsets_cnt items.
324 : Returns the number of frames n completed where n<=offsets_cnt.
325 : Each k in [0;n-1] writes a completion at offsets[k] where offsets[k]
326 : is the frame byte offset relative to the start of the UMEM region. */
327 :
328 : ulong
329 : fd_xsk_tx_complete( fd_xsk_t * xsk,
330 : ulong * offsets,
331 : ulong offsets_cnt );
332 :
333 : /* fd_xsk_tx_complete2: See fd_xsk_tx_complete.
334 :
335 : fd_xsk_tx_complete2 behaves similarly to fd_xsk_tx_complete, except
336 : that it takes a pointer to an array of fd_xsk_frame_meta_t instead
337 : of ulong. meta points to an array containing meta_cnt items.
338 : Each k in [0;n-1] (n being the return value) writes a frame meta at
339 : meta[k] where meta[k].off is the frame's byte offset relative to the
340 : UMEM region's start and `meta[k].{sz,flags}` are undefined. */
341 :
342 : ulong
343 : fd_xsk_tx_complete2( fd_xsk_t * xsk,
344 : fd_xsk_frame_meta_t * meta,
345 : ulong meta_cnt );
346 :
347 : /* fd_xsk_fd: Returns the XSK file descriptor. */
348 :
349 : FD_FN_PURE int
350 : fd_xsk_fd( fd_xsk_t * const xsk );
351 :
352 : /* fd_xsk_ifidx: Returns the index of the network interface that the
353 : XSK is currently bound to. May return zero if the XSK is not bound. */
354 :
355 : FD_FN_PURE uint
356 : fd_xsk_ifidx( fd_xsk_t * const xsk );
357 :
358 : /* fd_xsk_ifqueue: Returns the queue index that the XSK is currently
359 : bound to (a network interface can have multiple queues). U.B. if the
360 : XSK is not bound to an interface (see fd_xsk_ifidx). */
361 :
362 : FD_FN_PURE uint
363 : fd_xsk_ifqueue( fd_xsk_t * const xsk );
364 :
365 : /* fd_xsk_umem_laddr returns a pointer to the XSK frame memory region in
366 : the caller's local address space. */
367 :
368 : FD_FN_CONST void *
369 : fd_xsk_umem_laddr( fd_xsk_t * xsk );
370 :
371 : /* fd_xsk_get_params returns a pointer to the memory layout params from
372 : xsk. No caller-provided params buffer is needed; the result is a
373 : read-only pointer returned by the function itself. xsk must be a
374 : valid join to fd_xsk_t. The returned params struct is valid during
375 : the lifetime of the xsk. */
376 :
377 : FD_FN_CONST fd_xsk_params_t const *
378 : fd_xsk_get_params( fd_xsk_t const * xsk );
379 :
380 : FD_PROTOTYPES_END
381 :
382 : #endif /* defined(__linux__) */
383 : #endif /* HEADER_fd_src_waltz_xdp_fd_xsk_h */
|