LCOV - code coverage report
Current view: top level - waltz/xdp - fd_xsk.c (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 0 165 0.0 %
Date: 2026-01-20 05:22:11 Functions: 0 6 0.0 %

          Line data    Source code
       1             : #if !defined(__linux__)
       2             : #error "fd_xsk requires Linux operating system with XDP support"
       3             : #endif
       4             : 
       5             : #define _GNU_SOURCE /* MADV_DONTDUMP */
       6             : 
       7             : #include <errno.h>
       8             : #include <stdio.h> /* snprintf */
       9             : #include <unistd.h>
      10             : #include <sys/mman.h> /* mmap */
      11             : #include <sys/types.h>
      12             : #include <sys/socket.h> /* sendto */
      13             : #include <sys/syscall.h> /* SYS_mlock */
      14             : 
      15             : #include "../../util/log/fd_log.h"
      16             : #include "fd_xsk.h"
      17             : 
      18             : /* Join/leave *********************************************************/
      19             : 
      20             : /* fd_xsk_mmap_offset_cstr: Returns a cstr describing the given offset
      21             :    param (6th argument of mmap(2)) assuming fd (5th param of mmap(2)) is
      22             :    an XSK file descriptor.  Returned cstr is valid until next call. */
      23             : static char const *
      24           0 : fd_xsk_mmap_offset_cstr( long mmap_off ) {
      25           0 :   switch( mmap_off ) {
      26           0 :   case XDP_PGOFF_RX_RING:              return "XDP_PGOFF_RX_RING";
      27           0 :   case XDP_PGOFF_TX_RING:              return "XDP_PGOFF_TX_RING";
      28           0 :   case XDP_UMEM_PGOFF_FILL_RING:       return "XDP_UMEM_PGOFF_FILL_RING";
      29           0 :   case XDP_UMEM_PGOFF_COMPLETION_RING: return "XDP_UMEM_PGOFF_COMPLETION_RING";
      30           0 :   default: {
      31           0 :     static char buf[ 19UL ];
      32           0 :     snprintf( buf, 19UL, "0x%lx", (ulong)mmap_off );
      33           0 :     return buf;
      34           0 :   }
      35           0 :   }
      36           0 : }
      37             : 
      38             : /* fd_xsk_mmap_ring maps the given XSK ring into the local address space
      39             :    and populates fd_ring_desc_t.  Every successful call to this function
      40             :    should eventually be paired with a call to fd_xsk_munmap_ring(). */
      41             : static int
      42             : fd_xsk_mmap_ring( fd_xdp_ring_t * ring,
      43             :                   int             xsk_fd,
      44             :                   long            map_off,
      45             :                   ulong           elem_sz,
      46             :                   ulong           depth,
      47           0 :                   struct xdp_ring_offset const * ring_offset ) {
      48             :   /* TODO what is ring_offset->desc ? */
      49             : 
      50             :   /* sanity check */
      51           0 :   if( depth > (ulong)UINT_MAX ) {
      52           0 :     return -1;
      53           0 :   }
      54             : 
      55           0 :   ulong map_sz = ring_offset->desc + depth*elem_sz;
      56             : 
      57           0 :   void * res = mmap( NULL, map_sz, PROT_READ|PROT_WRITE, MAP_SHARED, xsk_fd, map_off );
      58           0 :   if( FD_UNLIKELY( res==MAP_FAILED ) ) {
      59           0 :     FD_LOG_WARNING(( "mmap(NULL, %lu, PROT_READ|PROT_WRITE, MAP_SHARED, xsk_fd, %s) failed (%i-%s)",
      60           0 :                      map_sz, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
      61           0 :     return -1;
      62           0 :   }
      63             : 
      64             :   /* Lock descriptor rings to prevent swapping. Also advise the
      65             :      kernel to exclude this region from core dumps for consistency
      66             :      with fd_shmem. Reimplements syscall logic of fd_numa_mlock()
      67             :      from fd_shmem_private.h to circumvent the ASan interceptor
      68             :      and avoid private header dependencies. */
      69             : 
      70           0 :   if( FD_UNLIKELY( (int)syscall( SYS_mlock, res, map_sz ) ) )
      71           0 :     FD_LOG_WARNING(( "syscall(SYS_mlock, %p, %lu KiB) on %s ring failed (%i-%s); attempting to continue",
      72           0 :                      res, map_sz>>10, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
      73             : 
      74           0 :   if( FD_UNLIKELY( madvise( res, map_sz, MADV_DONTDUMP ) ) )
      75           0 :     FD_LOG_WARNING(( "madvise(%p, %lu KiB) on %s ring failed (%i-%s); attempting to continue",
      76           0 :                      res, map_sz>>10, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
      77             : 
      78             :   /* TODO add unit test asserting that cached prod/cons seq gets
      79             :           cleared on join */
      80           0 :   fd_memset( ring, 0, sizeof(fd_xdp_ring_t) );
      81             : 
      82           0 :   ring->mem    = res;
      83           0 :   ring->map_sz = map_sz;
      84           0 :   ring->depth  = (uint)depth;
      85           0 :   ring->ptr    = (void *)( (ulong)res + ring_offset->desc     );
      86           0 :   ring->flags  = (uint *)( (ulong)res + ring_offset->flags    );
      87           0 :   ring->prod   = (uint *)( (ulong)res + ring_offset->producer );
      88           0 :   ring->cons   = (uint *)( (ulong)res + ring_offset->consumer );
      89             : 
      90           0 :   return 0;
      91           0 : }
      92             : 
      93             : /* fd_xsk_munmap_ring unmaps the given XSK ring from the local address
      94             :    space and zeroes fd_ring_desc_t. */
      95             : static void
      96             : fd_xsk_munmap_ring( fd_xdp_ring_t * ring,
      97           0 :                     long             map_off ) {
      98           0 :   if( FD_UNLIKELY( !ring->mem ) ) return;
      99             : 
     100           0 :   void * mem = ring->mem;
     101           0 :   ulong  sz  = ring->map_sz;
     102             : 
     103           0 :   fd_memset( ring, 0, sizeof(fd_xdp_ring_t) );
     104             : 
     105           0 :   if( FD_UNLIKELY( 0!=munmap( mem, sz ) ) )
     106           0 :     FD_LOG_WARNING(( "munmap(%p, %lu) on %s ring failed (%i-%s)",
     107           0 :                      mem, sz, fd_xsk_mmap_offset_cstr( map_off ), errno, fd_io_strerror( errno ) ));
     108           0 : }
     109             : 
     110             : /* fd_xsk_cleanup undoes a (partial) join by releasing all active kernel
     111             :    objects, such as mapped memory regions and file descriptors.  Assumes
     112             :    that no join to `xsk` is currently being used. */
     113             : 
     114             : fd_xsk_t *
     115           0 : fd_xsk_fini( fd_xsk_t * xsk ) {
     116             :   /* Undo memory mappings */
     117             : 
     118           0 :   fd_xsk_munmap_ring( &xsk->ring_rx, XDP_PGOFF_RX_RING              );
     119           0 :   fd_xsk_munmap_ring( &xsk->ring_tx, XDP_PGOFF_TX_RING              );
     120           0 :   fd_xsk_munmap_ring( &xsk->ring_fr, XDP_UMEM_PGOFF_FILL_RING       );
     121           0 :   fd_xsk_munmap_ring( &xsk->ring_cr, XDP_UMEM_PGOFF_COMPLETION_RING );
     122             : 
     123             :   /* Release XSK */
     124             : 
     125           0 :   if( FD_LIKELY( xsk->xsk_fd>=0 ) ) {
     126             :     /* Clear XSK descriptors */
     127           0 :     fd_memset( &xsk->offsets, 0, sizeof(struct xdp_mmap_offsets) );
     128             :     /* Close XSK */
     129           0 :     close( xsk->xsk_fd );
     130           0 :     xsk->xsk_fd = -1;
     131           0 :   }
     132             : 
     133           0 :   return xsk;
     134           0 : }
     135             : 
     136             : /* fd_xsk_setup_umem: Initializes xdp_umem_reg and hooks up XSK with
     137             :    UMEM rings via setsockopt(). Retrieves xdp_mmap_offsets via
     138             :    getsockopt().  Returns 0 on success, -1 on failure. */
     139             : static int
     140             : fd_xsk_setup_umem( fd_xsk_t *              xsk,
     141           0 :                    fd_xsk_params_t const * params ) {
     142             : 
     143             :   /* Initialize xdp_umem_reg */
     144           0 :   struct xdp_umem_reg umem_reg = {
     145           0 :     .addr       = (ulong)params->umem_addr,
     146           0 :     .len        = params->umem_sz,
     147           0 :     .chunk_size = (uint)params->frame_sz,
     148           0 :   };
     149             : 
     150             :   /* Register UMEM region */
     151           0 :   int res;
     152           0 :   res = setsockopt( xsk->xsk_fd, SOL_XDP, XDP_UMEM_REG,
     153           0 :                     &umem_reg, sizeof(struct xdp_umem_reg) );
     154           0 :   if( FD_UNLIKELY( res!=0 ) ) {
     155           0 :     FD_LOG_WARNING(( "setsockopt(SOL_XDP,XDP_UMEM_REG(addr=%p,len=%lu,chunk_size=%lu)) failed (%i-%s)",
     156           0 :                      (void *)umem_reg.addr, (ulong)umem_reg.len, (ulong)umem_reg.chunk_size,
     157           0 :                      errno, fd_io_strerror( errno ) ));
     158           0 :     return -1;
     159           0 :   }
     160             : 
     161             :   /* Set ring frame counts */
     162           0 : # define FD_SET_XSK_RING_DEPTH(name, var)                                 \
     163           0 :     do {                                                                  \
     164           0 :       res = setsockopt( xsk->xsk_fd, SOL_XDP, name, &(var), 8UL );        \
     165           0 :       if( FD_UNLIKELY( res!=0 ) ) {                                       \
     166           0 :         FD_LOG_WARNING(( "setsockopt(SOL_XDP," #name ",%lu) failed (%i-%s)", \
     167           0 :                          var, errno, fd_io_strerror( errno ) ));          \
     168           0 :         return -1;                                                        \
     169           0 :       }                                                                   \
     170           0 :     } while(0)
     171           0 :   FD_SET_XSK_RING_DEPTH( XDP_UMEM_FILL_RING,       params->fr_depth );
     172           0 :   FD_SET_XSK_RING_DEPTH( XDP_RX_RING,              params->rx_depth );
     173           0 :   FD_SET_XSK_RING_DEPTH( XDP_TX_RING,              params->tx_depth );
     174           0 :   FD_SET_XSK_RING_DEPTH( XDP_UMEM_COMPLETION_RING, params->cr_depth );
     175           0 : # undef FD_SET_XSK_RING_DEPTH
     176             : 
     177             :   /* Request ring offsets */
     178           0 :   socklen_t offsets_sz = sizeof(struct xdp_mmap_offsets);
     179           0 :   res = getsockopt( xsk->xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS,
     180           0 :                     &xsk->offsets, &offsets_sz );
     181           0 :   if( FD_UNLIKELY( res!=0 ) ) {
     182           0 :     FD_LOG_WARNING(( "getsockopt(SOL_XDP, XDP_MMAP_OFFSETS) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
     183           0 :     return -1;
     184           0 :   }
     185             : 
     186             :   /* OK */
     187           0 :   return 0;
     188           0 : }
     189             : 
     190             : /* fd_xsk_init: Creates and configures an XSK socket object, and
     191             :    attaches to a preinstalled XDP program.  The various steps are
     192             :    implemented in fd_xsk_setup_{...}. */
     193             : 
     194             : fd_xsk_t *
     195             : fd_xsk_init( fd_xsk_t *              xsk,
     196           0 :              fd_xsk_params_t const * params ) {
     197             : 
     198           0 :   if( FD_UNLIKELY( !xsk ) ) { FD_LOG_WARNING(( "NULL xsk" )); return NULL; }
     199           0 :   memset( xsk, 0, sizeof(fd_xsk_t) );
     200             : 
     201           0 :   if( FD_UNLIKELY( !params->if_idx ) ) { FD_LOG_WARNING(( "zero if_idx" )); return NULL; }
     202           0 :   if( FD_UNLIKELY( (!params->fr_depth) | (!params->rx_depth) |
     203           0 :                    (!params->tx_depth) | (!params->cr_depth) ) ) {
     204           0 :     FD_LOG_WARNING(( "invalid {fr,rx,tx,cr}_depth" ));
     205           0 :     return NULL;
     206           0 :   }
     207           0 :   if( FD_UNLIKELY( !params->umem_addr ) ) {
     208           0 :     FD_LOG_WARNING(( "NULL umem_addr" ));
     209           0 :     return NULL;
     210           0 :   }
     211           0 :   if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)params->umem_addr, 4096UL ) ) ) {
     212           0 :     FD_LOG_WARNING(( "misaligned params->umem_addr" ));
     213           0 :     return NULL;
     214           0 :   }
     215           0 :   if( FD_UNLIKELY( !params->frame_sz || !fd_ulong_is_pow2( params->frame_sz ) ) ) {
     216           0 :     FD_LOG_WARNING(( "invalid frame_sz" ));
     217           0 :     return NULL;
     218           0 :   }
     219             : 
     220           0 :   xsk->if_idx      = params->if_idx;
     221           0 :   xsk->if_queue_id = params->if_queue_id;
     222             : 
     223             :   /* Create XDP socket (XSK) */
     224             : 
     225           0 :   xsk->xsk_fd = socket( AF_XDP, SOCK_RAW, 0 );
     226           0 :   if( FD_UNLIKELY( xsk->xsk_fd<0 ) ) {
     227           0 :     FD_LOG_WARNING(( "Failed to create XSK (%i-%s)", errno, fd_io_strerror( errno ) ));
     228           0 :     return NULL;
     229           0 :   }
     230             : 
     231             :   /* Associate UMEM region of fd_xsk_t with XSK via setsockopt() */
     232             : 
     233           0 :   if( FD_UNLIKELY( 0!=fd_xsk_setup_umem( xsk, params ) ) ) goto fail;
     234             : 
     235             :   /* Map XSK rings into local address space */
     236             : 
     237           0 :   if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_rx, xsk->xsk_fd, XDP_PGOFF_RX_RING,              sizeof(struct xdp_desc), params->rx_depth, &xsk->offsets.rx ) ) ) goto fail;
     238           0 :   if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_tx, xsk->xsk_fd, XDP_PGOFF_TX_RING,              sizeof(struct xdp_desc), params->tx_depth, &xsk->offsets.tx ) ) ) goto fail;
     239           0 :   if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_fr, xsk->xsk_fd, XDP_UMEM_PGOFF_FILL_RING,       sizeof(ulong),           params->fr_depth, &xsk->offsets.fr ) ) ) goto fail;
     240           0 :   if( FD_UNLIKELY( 0!=fd_xsk_mmap_ring( &xsk->ring_cr, xsk->xsk_fd, XDP_UMEM_PGOFF_COMPLETION_RING, sizeof(ulong),           params->cr_depth, &xsk->offsets.cr ) ) ) goto fail;
     241             : 
     242             :   /* Bind XSK to queue on network interface */
     243             : 
     244           0 :   uint flags = XDP_USE_NEED_WAKEUP | params->bind_flags;
     245           0 :   struct sockaddr_xdp sa = {
     246           0 :     .sxdp_family   = PF_XDP,
     247           0 :     .sxdp_ifindex  = xsk->if_idx,
     248           0 :     .sxdp_queue_id = xsk->if_queue_id,
     249             :     /* See extended commentary below for details on XDP_USE_NEED_WAKEUP
     250             :        flag. */
     251           0 :     .sxdp_flags    = (ushort)flags
     252           0 :   };
     253             : 
     254           0 :   char if_name[ IF_NAMESIZE ] = {0};
     255             : 
     256           0 :   if( FD_UNLIKELY( 0!=bind( xsk->xsk_fd, (void *)&sa, sizeof(struct sockaddr_xdp) ) ) ) {
     257           0 :     FD_LOG_WARNING(( "bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) failed (%i-%s)",
     258           0 :                      xsk->if_idx, if_indextoname( xsk->if_idx, if_name ),
     259           0 :                      xsk->if_queue_id, flags,
     260           0 :                      errno, fd_io_strerror( errno ) ));
     261           0 :     goto fail;
     262           0 :   }
     263             : 
     264             :   /* We've seen that some popular Intel NICs seem to have a bug that
     265             :      prevents them from working in SKB mode with certain kernel
     266             :      versions.  We can identify them by sendto returning ENXIO or EINVAL
     267             :      in newer versions.  The core of the problem is that the kernel
     268             :      calls the generic ndo_bpf pointer instead of the driver-specific
     269             :      version.  This means that the driver's pointer to the BPF program
     270             :      never gets set, yet the driver's wakeup function gets called. */
     271           0 :   if( FD_UNLIKELY( -1==sendto( xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) {
     272           0 :     if( FD_LIKELY( errno==ENXIO || errno==EINVAL ) ) {
     273           0 :       FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s).  This likely indicates "
     274           0 :                    "a bug with your NIC driver.  Try switching XDP mode using "
     275           0 :                    "net.xdp.xdp_mode in the configuration TOML.\n"
     276           0 :                    "Certain Intel NICs with certain driver/kernel combinations "
     277           0 :                    "are known to exhibit this issue in skb mode but work in drv "
     278           0 :                    "mode.", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
     279           0 :     } else {
     280           0 :       FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", xsk->xsk_fd, errno, fd_io_strerror( errno ) ));
     281           0 :     }
     282           0 :   }
     283             : 
     284             :   /* XSK successfully configured.  Traffic will arrive in XSK after
     285             :      configuring an XDP program to forward packets via XDP_REDIRECT.
     286             :      This requires providing the XSK file descriptor to the program via
     287             :      XSKMAP and is done in a separate step. */
     288             : 
     289           0 :   FD_LOG_INFO(( "AF_XDP socket initialized: bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) success",
     290           0 :                 xsk->if_idx, if_indextoname( xsk->if_idx, if_name ), xsk->if_queue_id, flags ));
     291             : 
     292           0 :   return xsk;
     293             : 
     294           0 : fail:
     295           0 :   fd_xsk_fini( xsk );
     296             :   return NULL;
     297           0 : }

Generated by: LCOV version 1.14