Line data Source code
1 : #if !defined(__linux__)
2 : #error "fd_xsk requires Linux operating system with XDP support"
3 : #endif
4 :
5 : #include <errno.h>
6 : #include <stdio.h> /* snprintf */
7 : #include <unistd.h>
8 : #include <sys/mman.h> /* mmap */
9 : #include <sys/types.h>
10 : #include <sys/socket.h> /* sendto */
11 :
12 : #include "../../util/log/fd_log.h"
13 : #include "fd_xsk.h"
14 :
15 : /* Join/leave *********************************************************/
16 :
17 : /* fd_xsk_mmap_offset_cstr: Returns a cstr describing the given offset
18 : param (6th argument of mmap(2)) assuming fd (5th param of mmap(2)) is
19 : an XSK file descriptor. Returned cstr is valid until next call. */
20 : static char const *
21 0 : fd_xsk_mmap_offset_cstr( long mmap_off ) {
22 0 : switch( mmap_off ) {
23 0 : case XDP_PGOFF_RX_RING: return "XDP_PGOFF_RX_RING";
24 0 : case XDP_PGOFF_TX_RING: return "XDP_PGOFF_TX_RING";
25 0 : case XDP_UMEM_PGOFF_FILL_RING: return "XDP_UMEM_PGOFF_FILL_RING";
26 0 : case XDP_UMEM_PGOFF_COMPLETION_RING: return "XDP_UMEM_PGOFF_COMPLETION_RING";
27 0 : default: {
28 0 : static char buf[ 19UL ];
29 0 : snprintf( buf, 19UL, "0x%lx", (ulong)mmap_off );
30 0 : return buf;
31 0 : }
32 0 : }
33 0 : }
34 :
35 : /* fd_xsk_mmap_ring maps the given XSK ring into the local address space
36 : and populates fd_ring_desc_t. Every successful call to this function
37 : should eventually be paired with a call to fd_xsk_munmap_ring(). */
38 : static int
39 : fd_xsk_mmap_ring( fd_xdp_ring_t * ring,
40 : int xsk_fd,
41 : long map_off,
42 : ulong elem_sz,
43 : ulong depth,
44 0 : struct xdp_ring_offset const * ring_offset ) {
45 : /* TODO what is ring_offset->desc ? */
46 : /* TODO: mmap was originally called with MAP_POPULATE,
47 : but this symbol isn't available with this build */
48 :
49 : /* sanity check */
50 0 : if( depth > (ulong)UINT_MAX ) {
51 0 : return -1;
52 0 : }
53 :
54 0 : ulong map_sz = ring_offset->desc + depth*elem_sz;
55 :
56 0 : void * res = mmap( NULL, map_sz, PROT_READ|PROT_WRITE, MAP_SHARED, xsk_fd, map_off );
57 0 : if( FD_UNLIKELY( res==MAP_FAILED ) ) {
58 0 : FD_LOG_WARNING(( "mmap(NULL, %lu, PROT_READ|PROT_WRITE, MAP_SHARED, xsk_fd, %s) failed (%i-%s)",
59 0 : map_sz, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
60 0 : return -1;
61 0 : }
62 :
63 : /* TODO add unit test asserting that cached prod/cons seq gets
64 : cleared on join */
65 0 : fd_memset( ring, 0, sizeof(fd_xdp_ring_t) );
66 :
67 0 : ring->mem = res;
68 0 : ring->map_sz = map_sz;
69 0 : ring->depth = (uint)depth;
70 0 : ring->ptr = (void *)( (ulong)res + ring_offset->desc );
71 0 : ring->flags = (uint *)( (ulong)res + ring_offset->flags );
72 0 : ring->prod = (uint *)( (ulong)res + ring_offset->producer );
73 0 : ring->cons = (uint *)( (ulong)res + ring_offset->consumer );
74 :
75 0 : return 0;
76 0 : }
77 :
78 : /* fd_xsk_munmap_ring unmaps the given XSK ring from the local address
79 : space and zeroes fd_ring_desc_t. */
80 : static void
81 : fd_xsk_munmap_ring( fd_xdp_ring_t * ring,
82 0 : long map_off ) {
83 0 : if( FD_UNLIKELY( !ring->mem ) ) return;
84 :
85 0 : void * mem = ring->mem;
86 0 : ulong sz = ring->map_sz;
87 :
88 0 : fd_memset( ring, 0, sizeof(fd_xdp_ring_t) );
89 :
90 0 : if( FD_UNLIKELY( 0!=munmap( mem, sz ) ) )
91 0 : FD_LOG_WARNING(( "munmap(%p, %lu) on %s ring failed (%i-%s)",
92 0 : mem, sz, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
93 0 : }
94 :
95 : /* fd_xsk_cleanup undoes a (partial) join by releasing all active kernel
96 : objects, such as mapped memory regions and file descriptors. Assumes
97 : that no join to `xsk` is currently being used. */
98 :
99 : fd_xsk_t *
100 0 : fd_xsk_fini( fd_xsk_t * xsk ) {
101 : /* Undo memory mappings */
102 :
103 0 : fd_xsk_munmap_ring( &xsk->ring_rx, XDP_PGOFF_RX_RING );
104 0 : fd_xsk_munmap_ring( &xsk->ring_tx, XDP_PGOFF_TX_RING );
105 0 : fd_xsk_munmap_ring( &xsk->ring_fr, XDP_UMEM_PGOFF_FILL_RING );
106 0 : fd_xsk_munmap_ring( &xsk->ring_cr, XDP_UMEM_PGOFF_COMPLETION_RING );
107 :
108 : /* Release XSK */
109 :
110 0 : if( FD_LIKELY( xsk->xsk_fd>=0 ) ) {
111 : /* Clear XSK descriptors */
112 0 : fd_memset( &xsk->offsets, 0, sizeof(struct xdp_mmap_offsets) );
113 : /* Close XSK */
114 0 : close( xsk->xsk_fd );
115 0 : xsk->xsk_fd = -1;
116 0 : }
117 :
118 0 : return xsk;
119 0 : }
120 :
121 : /* fd_xsk_setup_umem: Initializes xdp_umem_reg and hooks up XSK with
122 : UMEM rings via setsockopt(). Retrieves xdp_mmap_offsets via
123 : getsockopt(). Returns 0 on success, -1 on failure. */
124 : static int
125 : fd_xsk_setup_umem( fd_xsk_t * xsk,
126 0 : fd_xsk_params_t const * params ) {
127 :
128 : /* Initialize xdp_umem_reg */
129 0 : struct xdp_umem_reg umem_reg = {
130 0 : .addr = (ulong)params->umem_addr,
131 0 : .len = params->umem_sz,
132 0 : .chunk_size = (uint)params->frame_sz,
133 0 : };
134 :
135 : /* Register UMEM region */
136 0 : int res;
137 0 : res = setsockopt( xsk->xsk_fd, SOL_XDP, XDP_UMEM_REG,
138 0 : &umem_reg, sizeof(struct xdp_umem_reg) );
139 0 : if( FD_UNLIKELY( res!=0 ) ) {
140 0 : FD_LOG_WARNING(( "setsockopt(SOL_XDP,XDP_UMEM_REG(addr=%p,len=%lu,chunk_size=%lu)) failed (%i-%s)",
141 0 : (void *)umem_reg.addr, (ulong)umem_reg.len, (ulong)umem_reg.chunk_size,
142 0 : errno, fd_io_strerror( errno ) ));
143 0 : return -1;
144 0 : }
145 :
146 : /* Set ring frame counts */
147 0 : # define FD_SET_XSK_RING_DEPTH(name, var) \
148 0 : do { \
149 0 : res = setsockopt( xsk->xsk_fd, SOL_XDP, name, &(var), 8UL ); \
150 0 : if( FD_UNLIKELY( res!=0 ) ) { \
151 0 : FD_LOG_WARNING(( "setsockopt(SOL_XDP," #name ",%lu) failed (%i-%s)", \
152 0 : var, errno, fd_io_strerror( errno ) )); \
153 0 : return -1; \
154 0 : } \
155 0 : } while(0)
156 0 : FD_SET_XSK_RING_DEPTH( XDP_UMEM_FILL_RING, params->fr_depth );
157 0 : FD_SET_XSK_RING_DEPTH( XDP_RX_RING, params->rx_depth );
158 0 : FD_SET_XSK_RING_DEPTH( XDP_TX_RING, params->tx_depth );
159 0 : FD_SET_XSK_RING_DEPTH( XDP_UMEM_COMPLETION_RING, params->cr_depth );
160 0 : # undef FD_SET_XSK_RING_DEPTH
161 :
162 : /* Request ring offsets */
163 0 : socklen_t offsets_sz = sizeof(struct xdp_mmap_offsets);
164 0 : res = getsockopt( xsk->xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS,
165 0 : &xsk->offsets, &offsets_sz );
166 0 : if( FD_UNLIKELY( res!=0 ) ) {
167 0 : FD_LOG_WARNING(( "getsockopt(SOL_XDP, XDP_MMAP_OFFSETS) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
168 0 : return -1;
169 0 : }
170 :
171 : /* OK */
172 0 : return 0;
173 0 : }
174 :
175 : /* fd_xsk_init: Creates and configures an XSK socket object, and
176 : attaches to a preinstalled XDP program. The various steps are
177 : implemented in fd_xsk_setup_{...}. */
178 :
179 : fd_xsk_t *
180 : fd_xsk_init( fd_xsk_t * xsk,
181 0 : fd_xsk_params_t const * params ) {
182 :
183 0 : if( FD_UNLIKELY( !xsk ) ) { FD_LOG_WARNING(( "NULL xsk" )); return NULL; }
184 0 : memset( xsk, 0, sizeof(fd_xsk_t) );
185 :
186 0 : if( FD_UNLIKELY( !params->if_idx ) ) { FD_LOG_WARNING(( "zero if_idx" )); return NULL; }
187 0 : if( FD_UNLIKELY( (!params->fr_depth) | (!params->rx_depth) |
188 0 : (!params->tx_depth) | (!params->cr_depth) ) ) {
189 0 : FD_LOG_WARNING(( "invalid {fr,rx,tx,cr}_depth" ));
190 0 : return NULL;
191 0 : }
192 0 : if( FD_UNLIKELY( !params->umem_addr ) ) {
193 0 : FD_LOG_WARNING(( "NULL umem_addr" ));
194 0 : return NULL;
195 0 : }
196 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)params->umem_addr, 4096UL ) ) ) {
197 0 : FD_LOG_WARNING(( "misaligned params->umem_addr" ));
198 0 : return NULL;
199 0 : }
200 0 : if( FD_UNLIKELY( !params->frame_sz || !fd_ulong_is_pow2( params->frame_sz ) ) ) {
201 0 : FD_LOG_WARNING(( "invalid frame_sz" ));
202 0 : return NULL;
203 0 : }
204 :
205 0 : xsk->if_idx = params->if_idx;
206 0 : xsk->if_queue_id = params->if_queue_id;
207 :
208 : /* Create XDP socket (XSK) */
209 :
210 0 : xsk->xsk_fd = socket( AF_XDP, SOCK_RAW, 0 );
211 0 : if( FD_UNLIKELY( xsk->xsk_fd<0 ) ) {
212 0 : FD_LOG_WARNING(( "Failed to create XSK (%i-%s)", errno, fd_io_strerror( errno ) ));
213 0 : return NULL;
214 0 : }
215 :
216 : /* Associate UMEM region of fd_xsk_t with XSK via setsockopt() */
217 :
218 0 : if( FD_UNLIKELY( 0!=fd_xsk_setup_umem( xsk, params ) ) ) goto fail;
219 :
220 : /* Map XSK rings into local address space */
221 :
222 0 : if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_rx, xsk->xsk_fd, XDP_PGOFF_RX_RING, sizeof(struct xdp_desc), params->rx_depth, &xsk->offsets.rx ) ) ) goto fail;
223 0 : if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_tx, xsk->xsk_fd, XDP_PGOFF_TX_RING, sizeof(struct xdp_desc), params->tx_depth, &xsk->offsets.tx ) ) ) goto fail;
224 0 : if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_fr, xsk->xsk_fd, XDP_UMEM_PGOFF_FILL_RING, sizeof(ulong), params->fr_depth, &xsk->offsets.fr ) ) ) goto fail;
225 0 : if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_cr, xsk->xsk_fd, XDP_UMEM_PGOFF_COMPLETION_RING, sizeof(ulong), params->cr_depth, &xsk->offsets.cr ) ) ) goto fail;
226 :
227 : /* Bind XSK to queue on network interface */
228 :
229 0 : uint flags = XDP_USE_NEED_WAKEUP | params->bind_flags;
230 0 : struct sockaddr_xdp sa = {
231 0 : .sxdp_family = PF_XDP,
232 0 : .sxdp_ifindex = xsk->if_idx,
233 0 : .sxdp_queue_id = xsk->if_queue_id,
234 : /* See extended commentary below for details on XDP_USE_NEED_WAKEUP
235 : flag. */
236 0 : .sxdp_flags = (ushort)flags
237 0 : };
238 :
239 0 : char if_name[ IF_NAMESIZE ] = {0};
240 :
241 0 : if( FD_UNLIKELY( 0!=bind( xsk->xsk_fd, (void *)&sa, sizeof(struct sockaddr_xdp) ) ) ) {
242 0 : FD_LOG_WARNING(( "bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) failed (%i-%s)",
243 0 : xsk->if_idx, if_indextoname( xsk->if_idx, if_name ),
244 0 : xsk->if_queue_id, flags,
245 0 : errno, fd_io_strerror( errno ) ));
246 0 : goto fail;
247 0 : }
248 :
249 : /* We've seen that some popular Intel NICs seem to have a bug that
250 : prevents them from working in SKB mode with certain kernel
251 : versions. We can identify them by sendto returning ENXIO or EINVAL
252 : in newer versions. The core of the problem is that the kernel
253 : calls the generic ndo_bpf pointer instead of the driver-specific
254 : version. This means that the driver's pointer to the BPF program
255 : never gets set, yet the driver's wakeup function gets called. */
256 0 : if( FD_UNLIKELY( -1==sendto( xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) {
257 0 : if( FD_LIKELY( errno==ENXIO || errno==EINVAL ) ) {
258 0 : FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s). This likely indicates "
259 0 : "a bug with your NIC driver. Try switching XDP mode using "
260 0 : "tiles.net.xdp_mode in the configuration TOML.\n"
261 0 : "Certain Intel NICs with certain driver/kernel combinations "
262 0 : "are known to exhibit this issue in skb mode but work in drv "
263 0 : "mode.", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
264 0 : } else {
265 0 : FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
266 0 : }
267 0 : }
268 :
269 : /* XSK successfully configured. Traffic will arrive in XSK after
270 : configuring an XDP program to forward packets via XDP_REDIRECT.
271 : This requires providing the XSK file descriptor to the program via
272 : XSKMAP and is done in a separate step. */
273 :
274 0 : FD_LOG_INFO(( "AF_XDP socket initialized: bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) success",
275 0 : xsk->if_idx, if_indextoname( xsk->if_idx, if_name ), xsk->if_queue_id, flags ));
276 :
277 0 : return xsk;
278 :
279 0 : fail:
280 0 : fd_xsk_fini( xsk );
281 0 : return NULL;
282 0 : }
|