Line data Source code
1 : #if !defined(__linux__)
2 : #error "fd_xsk requires Linux operating system with XDP support"
3 : #endif
4 :
5 : #define _GNU_SOURCE /* MADV_DONTDUMP */
6 :
7 : #include <errno.h>
8 : #include <stdio.h> /* snprintf */
9 : #include <unistd.h>
10 : #include <sys/mman.h> /* mmap */
11 : #include <sys/types.h>
12 : #include <sys/socket.h> /* sendto */
13 : #include <sys/syscall.h> /* SYS_mlock */
14 :
15 : #include "../../util/log/fd_log.h"
16 : #include "fd_xsk.h"
17 :
18 : /* Join/leave *********************************************************/
19 :
20 : /* fd_xsk_mmap_offset_cstr: Returns a cstr describing the given offset
21 : param (6th argument of mmap(2)) assuming fd (5th param of mmap(2)) is
22 : an XSK file descriptor. Returned cstr is valid until next call. */
23 : static char const *
24 0 : fd_xsk_mmap_offset_cstr( long mmap_off ) {
25 0 : switch( mmap_off ) {
26 0 : case XDP_PGOFF_RX_RING: return "XDP_PGOFF_RX_RING";
27 0 : case XDP_PGOFF_TX_RING: return "XDP_PGOFF_TX_RING";
28 0 : case XDP_UMEM_PGOFF_FILL_RING: return "XDP_UMEM_PGOFF_FILL_RING";
29 0 : case XDP_UMEM_PGOFF_COMPLETION_RING: return "XDP_UMEM_PGOFF_COMPLETION_RING";
30 0 : default: {
31 0 : static char buf[ 19UL ];
32 0 : snprintf( buf, 19UL, "0x%lx", (ulong)mmap_off );
33 0 : return buf;
34 0 : }
35 0 : }
36 0 : }
37 :
38 : /* fd_xsk_mmap_ring maps the given XSK ring into the local address space
39 : and populates fd_ring_desc_t. Every successful call to this function
40 : should eventually be paired with a call to fd_xsk_munmap_ring(). */
41 : static int
42 : fd_xsk_mmap_ring( fd_xdp_ring_t * ring,
43 : int xsk_fd,
44 : long map_off,
45 : ulong elem_sz,
46 : ulong depth,
47 0 : struct xdp_ring_offset const * ring_offset ) {
48 : /* TODO what is ring_offset->desc ? */
49 :
50 : /* sanity check */
51 0 : if( depth > (ulong)UINT_MAX ) {
52 0 : return -1;
53 0 : }
54 :
55 0 : ulong map_sz = ring_offset->desc + depth*elem_sz;
56 :
57 0 : void * res = mmap( NULL, map_sz, PROT_READ|PROT_WRITE, MAP_SHARED, xsk_fd, map_off );
58 0 : if( FD_UNLIKELY( res==MAP_FAILED ) ) {
59 0 : FD_LOG_WARNING(( "mmap(NULL, %lu, PROT_READ|PROT_WRITE, MAP_SHARED, xsk_fd, %s) failed (%i-%s)",
60 0 : map_sz, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
61 0 : return -1;
62 0 : }
63 :
64 : /* Lock descriptor rings to prevent swapping. Also advise the
65 : kernel to exclude this region from core dumps for consistency
66 : with fd_shmem. Reimplements syscall logic of fd_numa_mlock()
67 : from fd_shmem_private.h to circumvent the ASan interceptor
68 : and avoid private header dependencies. */
69 :
70 0 : if( FD_UNLIKELY( (int)syscall( SYS_mlock, res, map_sz ) ) )
71 0 : FD_LOG_WARNING(( "syscall(SYS_mlock, %p, %lu KiB) on %s ring failed (%i-%s); attempting to continue",
72 0 : res, map_sz>>10, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
73 :
74 0 : if( FD_UNLIKELY( madvise( res, map_sz, MADV_DONTDUMP ) ) )
75 0 : FD_LOG_WARNING(( "madvise(%p, %lu KiB) on %s ring failed (%i-%s); attempting to continue",
76 0 : res, map_sz>>10, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
77 :
78 : /* TODO add unit test asserting that cached prod/cons seq gets
79 : cleared on join */
80 0 : fd_memset( ring, 0, sizeof(fd_xdp_ring_t) );
81 :
82 0 : ring->mem = res;
83 0 : ring->map_sz = map_sz;
84 0 : ring->depth = (uint)depth;
85 0 : ring->ptr = (void *)( (ulong)res + ring_offset->desc );
86 0 : ring->flags = (uint *)( (ulong)res + ring_offset->flags );
87 0 : ring->prod = (uint *)( (ulong)res + ring_offset->producer );
88 0 : ring->cons = (uint *)( (ulong)res + ring_offset->consumer );
89 :
90 0 : return 0;
91 0 : }
92 :
93 : /* fd_xsk_munmap_ring unmaps the given XSK ring from the local address
94 : space and zeroes fd_ring_desc_t. */
95 : static void
96 : fd_xsk_munmap_ring( fd_xdp_ring_t * ring,
97 0 : long map_off ) {
98 0 : if( FD_UNLIKELY( !ring->mem ) ) return;
99 :
100 0 : void * mem = ring->mem;
101 0 : ulong sz = ring->map_sz;
102 :
103 0 : fd_memset( ring, 0, sizeof(fd_xdp_ring_t) );
104 :
105 0 : if( FD_UNLIKELY( 0!=munmap( mem, sz ) ) )
106 0 : FD_LOG_WARNING(( "munmap(%p, %lu) on %s ring failed (%i-%s)",
107 0 : mem, sz, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
108 0 : }
109 :
110 : /* fd_xsk_cleanup undoes a (partial) join by releasing all active kernel
111 : objects, such as mapped memory regions and file descriptors. Assumes
112 : that no join to `xsk` is currently being used. */
113 :
114 : fd_xsk_t *
115 0 : fd_xsk_fini( fd_xsk_t * xsk ) {
116 : /* Undo memory mappings */
117 :
118 0 : fd_xsk_munmap_ring( &xsk->ring_rx, XDP_PGOFF_RX_RING );
119 0 : fd_xsk_munmap_ring( &xsk->ring_tx, XDP_PGOFF_TX_RING );
120 0 : fd_xsk_munmap_ring( &xsk->ring_fr, XDP_UMEM_PGOFF_FILL_RING );
121 0 : fd_xsk_munmap_ring( &xsk->ring_cr, XDP_UMEM_PGOFF_COMPLETION_RING );
122 :
123 : /* Release XSK */
124 :
125 0 : if( FD_LIKELY( xsk->xsk_fd>=0 ) ) {
126 : /* Clear XSK descriptors */
127 0 : fd_memset( &xsk->offsets, 0, sizeof(struct xdp_mmap_offsets) );
128 : /* Close XSK */
129 0 : close( xsk->xsk_fd );
130 0 : xsk->xsk_fd = -1;
131 0 : }
132 :
133 0 : return xsk;
134 0 : }
135 :
136 : /* fd_xsk_setup_umem: Initializes xdp_umem_reg and hooks up XSK with
137 : UMEM rings via setsockopt(). Retrieves xdp_mmap_offsets via
138 : getsockopt(). Returns 0 on success, -1 on failure. */
139 : static int
140 : fd_xsk_setup_umem( fd_xsk_t * xsk,
141 0 : fd_xsk_params_t const * params ) {
142 :
143 : /* Initialize xdp_umem_reg */
144 0 : struct xdp_umem_reg umem_reg = {
145 0 : .addr = (ulong)params->umem_addr,
146 0 : .len = params->umem_sz,
147 0 : .chunk_size = (uint)params->frame_sz,
148 0 : };
149 :
150 : /* Register UMEM region */
151 0 : int res;
152 0 : res = setsockopt( xsk->xsk_fd, SOL_XDP, XDP_UMEM_REG,
153 0 : &umem_reg, sizeof(struct xdp_umem_reg) );
154 0 : if( FD_UNLIKELY( res!=0 ) ) {
155 0 : FD_LOG_WARNING(( "setsockopt(SOL_XDP,XDP_UMEM_REG(addr=%p,len=%lu,chunk_size=%lu)) failed (%i-%s)",
156 0 : (void *)umem_reg.addr, (ulong)umem_reg.len, (ulong)umem_reg.chunk_size,
157 0 : errno, fd_io_strerror( errno ) ));
158 0 : return -1;
159 0 : }
160 :
161 : /* Set ring frame counts */
162 0 : # define FD_SET_XSK_RING_DEPTH(name, var) \
163 0 : do { \
164 0 : res = setsockopt( xsk->xsk_fd, SOL_XDP, name, &(var), 8UL ); \
165 0 : if( FD_UNLIKELY( res!=0 ) ) { \
166 0 : FD_LOG_WARNING(( "setsockopt(SOL_XDP," #name ",%lu) failed (%i-%s)", \
167 0 : var, errno, fd_io_strerror( errno ) )); \
168 0 : return -1; \
169 0 : } \
170 0 : } while(0)
171 0 : FD_SET_XSK_RING_DEPTH( XDP_UMEM_FILL_RING, params->fr_depth );
172 0 : FD_SET_XSK_RING_DEPTH( XDP_RX_RING, params->rx_depth );
173 0 : FD_SET_XSK_RING_DEPTH( XDP_TX_RING, params->tx_depth );
174 0 : FD_SET_XSK_RING_DEPTH( XDP_UMEM_COMPLETION_RING, params->cr_depth );
175 0 : # undef FD_SET_XSK_RING_DEPTH
176 :
177 : /* Request ring offsets */
178 0 : socklen_t offsets_sz = sizeof(struct xdp_mmap_offsets);
179 0 : res = getsockopt( xsk->xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS,
180 0 : &xsk->offsets, &offsets_sz );
181 0 : if( FD_UNLIKELY( res!=0 ) ) {
182 0 : FD_LOG_WARNING(( "getsockopt(SOL_XDP, XDP_MMAP_OFFSETS) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
183 0 : return -1;
184 0 : }
185 :
186 : /* OK */
187 0 : return 0;
188 0 : }
189 :
190 : /* fd_xsk_init: Creates and configures an XSK socket object, and
191 : attaches to a preinstalled XDP program. The various steps are
192 : implemented in fd_xsk_setup_{...}. */
193 :
194 : fd_xsk_t *
195 : fd_xsk_init( fd_xsk_t * xsk,
196 0 : fd_xsk_params_t const * params ) {
197 :
198 0 : if( FD_UNLIKELY( !xsk ) ) { FD_LOG_WARNING(( "NULL xsk" )); return NULL; }
199 0 : memset( xsk, 0, sizeof(fd_xsk_t) );
200 :
201 0 : if( FD_UNLIKELY( !params->if_idx ) ) { FD_LOG_WARNING(( "zero if_idx" )); return NULL; }
202 0 : if( FD_UNLIKELY( (!params->fr_depth) | (!params->rx_depth) |
203 0 : (!params->tx_depth) | (!params->cr_depth) ) ) {
204 0 : FD_LOG_WARNING(( "invalid {fr,rx,tx,cr}_depth" ));
205 0 : return NULL;
206 0 : }
207 0 : if( FD_UNLIKELY( !params->umem_addr ) ) {
208 0 : FD_LOG_WARNING(( "NULL umem_addr" ));
209 0 : return NULL;
210 0 : }
211 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)params->umem_addr, 4096UL ) ) ) {
212 0 : FD_LOG_WARNING(( "misaligned params->umem_addr" ));
213 0 : return NULL;
214 0 : }
215 0 : if( FD_UNLIKELY( !params->frame_sz || !fd_ulong_is_pow2( params->frame_sz ) ) ) {
216 0 : FD_LOG_WARNING(( "invalid frame_sz" ));
217 0 : return NULL;
218 0 : }
219 :
220 0 : xsk->if_idx = params->if_idx;
221 0 : xsk->if_queue_id = params->if_queue_id;
222 :
223 : /* Create XDP socket (XSK) */
224 :
225 0 : xsk->xsk_fd = socket( AF_XDP, SOCK_RAW, 0 );
226 0 : if( FD_UNLIKELY( xsk->xsk_fd<0 ) ) {
227 0 : FD_LOG_WARNING(( "Failed to create XSK (%i-%s)", errno, fd_io_strerror( errno ) ));
228 0 : return NULL;
229 0 : }
230 :
231 : /* Associate UMEM region of fd_xsk_t with XSK via setsockopt() */
232 :
233 0 : if( FD_UNLIKELY( 0!=fd_xsk_setup_umem( xsk, params ) ) ) goto fail;
234 :
235 : /* Map XSK rings into local address space */
236 :
237 0 : if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_rx, xsk->xsk_fd, XDP_PGOFF_RX_RING, sizeof(struct xdp_desc), params->rx_depth, &xsk->offsets.rx ) ) ) goto fail;
238 0 : if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_tx, xsk->xsk_fd, XDP_PGOFF_TX_RING, sizeof(struct xdp_desc), params->tx_depth, &xsk->offsets.tx ) ) ) goto fail;
239 0 : if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_fr, xsk->xsk_fd, XDP_UMEM_PGOFF_FILL_RING, sizeof(ulong), params->fr_depth, &xsk->offsets.fr ) ) ) goto fail;
240 0 : if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_cr, xsk->xsk_fd, XDP_UMEM_PGOFF_COMPLETION_RING, sizeof(ulong), params->cr_depth, &xsk->offsets.cr ) ) ) goto fail;
241 :
242 : /* Bind XSK to queue on network interface */
243 :
244 0 : uint flags = XDP_USE_NEED_WAKEUP | params->bind_flags;
245 0 : struct sockaddr_xdp sa = {
246 0 : .sxdp_family = PF_XDP,
247 0 : .sxdp_ifindex = xsk->if_idx,
248 0 : .sxdp_queue_id = xsk->if_queue_id,
249 : /* See extended commentary below for details on XDP_USE_NEED_WAKEUP
250 : flag. */
251 0 : .sxdp_flags = (ushort)flags
252 0 : };
253 :
254 0 : char if_name[ IF_NAMESIZE ] = {0};
255 :
256 0 : if( FD_UNLIKELY( 0!=bind( xsk->xsk_fd, (void *)&sa, sizeof(struct sockaddr_xdp) ) ) ) {
257 0 : FD_LOG_WARNING(( "bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) failed (%i-%s)",
258 0 : xsk->if_idx, if_indextoname( xsk->if_idx, if_name ),
259 0 : xsk->if_queue_id, flags,
260 0 : errno, fd_io_strerror( errno ) ));
261 0 : goto fail;
262 0 : }
263 :
264 : /* We've seen that some popular Intel NICs seem to have a bug that
265 : prevents them from working in SKB mode with certain kernel
266 : versions. We can identify them by sendto returning ENXIO or EINVAL
267 : in newer versions. The core of the problem is that the kernel
268 : calls the generic ndo_bpf pointer instead of the driver-specific
269 : version. This means that the driver's pointer to the BPF program
270 : never gets set, yet the driver's wakeup function gets called. */
271 0 : if( FD_UNLIKELY( -1==sendto( xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) {
272 0 : if( FD_LIKELY( errno==ENXIO || errno==EINVAL ) ) {
273 0 : FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s). This likely indicates "
274 0 : "a bug with your NIC driver. Try switching XDP mode using "
275 0 : "net.xdp.xdp_mode in the configuration TOML.\n"
276 0 : "Certain Intel NICs with certain driver/kernel combinations "
277 0 : "are known to exhibit this issue in skb mode but work in drv "
278 0 : "mode.", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
279 0 : } else {
280 0 : FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
281 0 : }
282 0 : }
283 :
284 : /* XSK successfully configured. Traffic will arrive in XSK after
285 : configuring an XDP program to forward packets via XDP_REDIRECT.
286 : This requires providing the XSK file descriptor to the program via
287 : XSKMAP and is done in a separate step. */
288 :
289 0 : FD_LOG_INFO(( "AF_XDP socket initialized: bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) success",
290 0 : xsk->if_idx, if_indextoname( xsk->if_idx, if_name ), xsk->if_queue_id, flags ));
291 :
292 0 : return xsk;
293 :
294 0 : fail:
295 0 : fd_xsk_fini( xsk );
296 : return NULL;
297 0 : }
|