LCOV - code coverage report
Current view: top level - waltz/xdp - fd_xsk.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 2 2 100.0 %
Date: 2025-01-08 12:08:44 Functions: 0 0 -

          Line data    Source code
       1             : #ifndef HEADER_fd_src_waltz_xdp_fd_xsk_h
       2             : #define HEADER_fd_src_waltz_xdp_fd_xsk_h
       3             : 
       4             : #if defined(__linux__)
       5             : 
       6             : /* fd_xsk manages an XSK file descriptor and provides RX/TX buffers.
       7             : 
       8             :    ### Background
       9             : 
      10             :    AF_XDP is a Linux API providing kernel-bypass networking in the form
      11             :    of shared memory ring buffers accessible from userspace.  The kernel
      12             :    redirects packets from/to these buffers with the appropriate XDP
      13             :    configuration (XDP_REDIRECT).  AF_XDP is hardware-agnostic and allows
      14             :    sharing a NIC with the Linux networking stack (unlike e.g. DPDK).
      15             :    This allows for deployment in existing, heterogeneous networks. An
      16             :    AF_XDP socket is called "XSK".  The shared memory region storing the
      17             :    packet data flowing through an XSK is called "UMEM".
      18             : 
      19             :    XDP (eXpress Data Path) is a framework for installing hooks in the
      20             :    form of eBPF programs at an early stage of packet processing (i.e.
      21             :    before tc and netfilter).  eBPF is user-deployable JIT-compiled
      22             :    bytecode that usually runs inside the kernel. Some hardware/driver
      23             :    combinations optionally allow offloading eBPF processing to NICs.
      24             :    This is not to be confused with other BPF-derived ISAs such as sBPF
      25             :    (Solana BPF).
      26             : 
      27             :      +--- Figure 1: AF_XDP RX Block Diagram -----------------+
      28             :      |                                                       |
      29             :      |   ┌─────┐  ┌────────┐  ┌─────┐ XDP_PASS ┌─────────┐   |
      30             :      |   │ NIC ├──> Driver ├──> XDP ├──────────> sk_buff │   |
      31             :      |   └─────┘  └────────┘  └─┬───┘          └─────────┘   |
      32             :      |                          │                            |
      33             :      |                          │ XDP_REDIRECT               |
      34             :      |                          │                            |
      35             :      |                       ┌──▼───────┐      ┌─────────┐   |
      36             :      |                       │ XSK/UMEM ├──────> fd_aio  │   |
      37             :      |                       └──────────┘      └─────────┘   |
      38             :      |                                                       |
      39             :      +-------------------------------------------------------+
      40             : 
      41             :    Figure 1 shows a simplified block diagram of RX packet flow within
      42             :    the kernel in `XDP_FLAGS_DRV_MODE` mode.  Notably, the chain of eBPF
      43             :    programs installed in the XDP facility gets invoked for every incoming
      44             :    packet.  If all programs return the `XDP_PASS` action, the packet
      45             :    continues its usual path to the Linux networking stack, where it will
      46             :    be allocated in sk_buff, and eventually flow through ip_rcv(), tc,
      47             :    and netfilter before reaching downstream sockets.
      48             :    If the `XDP_REDIRECT` action is taken however, the packet is copied
      49             :    to the UMEM of an XSK, and a RX queue entry is allocated.  An fd_aio
      50             :    backend is provided by fd_xdp_aio.
      51             :    The more generic `XDP_FLAGS_SKB_MODE` XDP mode falls back to sk_buff-
      52             :    based memory mgmt (still skipping the rest of the generic path), but
      53             :    is more widely available.
      54             : 
      55             :      +--- Figure 2: AF_XDP TX Block Diagram -------------+
      56             :      |                                                   |
      57             :      |   ┌────────┐  ┌──────────┐  ┌────────┐  ┌─────┐   |
      58             :      |   │ fd_aio ├──> XSK/UMEM ├──> Driver ├──> NIC │   |
      59             :      |   └────────┘  └──────────┘  └────────┘  └─────┘   |
      60             :      |                                                   |
      61             :      +---------------------------------------------------+
      62             : 
      63             :    Figure 2 shows a simplified block diagram of the TX packet flow.
      64             :    Userspace applications deliver packets to the XSK/UMEM buffers.  The
      65             :    kernel then forwards these packets to the NIC.  This also means that
      66             :    the application is responsible for maintaining a routing table to
      67             :    resolve layer-3 dest addrs to NICs and layer-2 addrs.  As in the RX
      68             :    flow, netfilter (iptables, nftables) is not available.
      69             : 
      70             :    ### Memory Management
      71             : 
      72             :    The UMEM area is allocated from userspace.  It is recommended to use
      73             :    the fd_util shmem/wksp APIs to obtain large page-backed memory.  UMEM
      74             :    is divided into equally sized frames. At any point in time, each
      75             :    frame is either owned by userspace or the kernel.  On initialization,
      76             :    all frames are owned by userspace.
      77             : 
      78             :    Changes in UMEM frame ownership and packet RX/TX events are
      79             :    transmitted via four rings allocated by the kernel (mmap()ed in by
      80             :    the user). This allows for out-of-order processing of packets.
      81             : 
      82             :       Data flow:
      83             :       (U->K) is userspace-to-kernel communication, and
      84             :       (K->U) is kernel-to-userspace.
      85             : 
      86             :       FILL         Free frames are provided to the kernel using the FILL
      87             :       (U->K)       ring. The kernel may populate these frames with RX
      88             :                    packet data.
      89             : 
      90             :       RX           Once the kernel has populated a FILL frame with RX
      91             :       (K->U)       packet data, it passes back the frame to userspace
      92             :                    via the RX queue.
      93             : 
      94             :       TX           TX frames sent by userspace are provided to the
      95             :       (U->K)       kernel using the TX ring.
      96             : 
      97             :       COMPLETION   Once the kernel has processed a TX frame, it passes
      98             :       (K->U)       back the frame to the userspace via the COMPLETION
      99             :                    queue.
     100             : 
     101             :    Combined, the FILL-RX and TX-COMPLETION rings form two pairs.  The
     102             :    kernel will not move frames between the pairs. */
     103             : 
     104             : #include <linux/if_link.h>
     105             : #include <net/if.h>
     106             : 
     107             : #include "../../util/fd_util_base.h"
     108             : 
     109             : /* FD_XSK_ALIGN: alignment of fd_xsk_t. */
     110          66 : #define FD_XSK_ALIGN      (4096UL)
     111             : 
     112             : /* FD_XSK_UMEM_ALIGN: byte alignment of UMEM area within fd_xsk_t.
     113             :    This requirement is set by the kernel as of Linux 4.18. */
     114          21 : #define FD_XSK_UMEM_ALIGN (4096UL)
     115             : 
     116             : /* Forward declarations */
     117             : struct fd_xsk_private;
     118             : typedef struct fd_xsk_private fd_xsk_t;
     119             : 
     120             : /* fd_xsk_frame_meta_t: Frame metadata used to identify packet */
     121             : 
     122             : #define FD_XDP_FRAME_META_ALIGN (16UL)
     123             : 
     124             : struct __attribute__((aligned(FD_XDP_FRAME_META_ALIGN))) fd_xsk_frame_meta {
     125             :   ulong off;   /* Byte offset from UMEM start to start of packet */
     126             :   uint  sz;    /* Size of packet data starting at `off` */
     127             :   uint  flags; /* Undefined for now */
     128             : };
     129             : typedef struct fd_xsk_frame_meta fd_xsk_frame_meta_t;
     130             : 
     131             : /* fd_xsk_params_t: Memory layout parameters of XSK.
     132             :    Can be retrieved using fd_xsk_get_params() */
     133             : 
     134             : struct fd_xsk_params {
     135             :   /* {fr,rx,tx,cr}_depth: Number of frames allocated for the Fill, RX,
     136             :     TX, Completion XSK rings respectively. */
     137             :   ulong fr_depth;
     138             :   ulong rx_depth;
     139             :   ulong tx_depth;
     140             :   ulong cr_depth;
     141             : 
     142             :   /* frame_sz: Controls the frame size used in the UMEM ring buffers. */
     143             :   ulong frame_sz;
     144             : 
     145             :   /* umem_sz: Total size of XSK ring shared memory area (contiguous).
     146             :      Aligned by FD_XSK_ALIGN. */
     147             :   ulong umem_sz;
     148             : };
     149             : typedef struct fd_xsk_params fd_xsk_params_t;
     150             : 
     151             : FD_PROTOTYPES_BEGIN
     152             : 
     153             : /* Setup API **********************************************************/
     154             : 
     155             : /* fd_xsk_{align,footprint} return the required alignment and
     156             :    footprint of a memory region suitable for use as an fd_xsk_t.
     157             :    See fd_xsk_new for explanations on parameters. */
     158             : 
     159             : FD_FN_CONST ulong
     160             : fd_xsk_align( void );
     161             : 
     162             : FD_FN_CONST ulong
     163             : fd_xsk_footprint( ulong frame_sz,
     164             :                   ulong fr_depth,
     165             :                   ulong rx_depth,
     166             :                   ulong tx_depth,
     167             :                   ulong cr_depth );
     168             : 
     169             : /* fd_xsk_new formats an unused memory region for use as an fd_xsk_t.
     170             :    shmem must point to a memory region that matches fd_xsk_align() and
     171             :    fd_xsk_footprint().  frame_sz controls the frame size used in the
     172             :    UMEM ring buffers and should be either 2048 or 4096.
     173             :    {fr,rx,tx,cr}_depth control the number of frames allocated for the
     174             :    Fill, RX, TX, Completion rings respectively.  Zero-copy mode is
     175             :    requested via bind_flags in fd_xsk_init (e.g. XDP_ZEROCOPY), not
     176             :    here.  Returns handle suitable for fd_xsk_join() on success. */
     177             : 
     178             : void *
     179             : fd_xsk_new( void * shmem,
     180             :             ulong  frame_sz,
     181             :             ulong  fr_depth,
     182             :             ulong  rx_depth,
     183             :             ulong  tx_depth,
     184             :             ulong  cr_depth );
     185             : 
     186             : /* fd_xsk_join joins the caller to the fd_xsk_t */
     187             : 
     188             : fd_xsk_t *
     189             : fd_xsk_join( void * shxsk );
     190             : 
     191             : /* fd_xsk_init creates an XSK, registers UMEM, maps rings, and binds the
     192             :    socket to the given interface queue.  This is a potentially
     193             :    destructive operation.  As of 2024-Jun, AF_XDP zero copy support is
     194             :    still buggy in some device drivers.  
     195             :    
     196             :    Assume that all traffic sent to this interface is compromised.  On
     197             :    some devices, the NIC is instructed to DMA all incoming packets into
     198             :    UMEM, even ones not belonging to Firedancer.  Those are then later
     199             :    on software-copied out to skbs again.  This further implies that
     200             :    enabling AF_XDP can slow down the regular kernel receive path.
     201             :    
     202             :    Requires CAP_SYS_ADMIN. May issue the following syscalls:
     203             : 
     204             :    - socket( AF_XDP, SOCK_RAW, 0 ) = fd
     205             :    - setsockopt( fd, SOL_XDP, ... )
     206             :    - getsockopt( fd, SOL_XDP, ... )
     207             :    - mmap( ..., fd, ... )
     208             :    - bind( fd, ... )
     209             :    - munmap  ; on fail
     210             :    - close   ; on fail */
     211             : 
     212             : fd_xsk_t *
     213             : fd_xsk_init( fd_xsk_t * xsk,
     214             :              uint       if_idx,        /* see if_nametoindex(3) */
     215             :              uint       if_queue,      /* queue index (type combined) */
     216             :              uint       bind_flags );  /* e.g. XDP_ZEROCOPY */
     217             : 
     218             : /* fd_xsk_fini unmaps XSK rings and closes the XSK file descriptor.
     219             :    This effectively returns the interface to the state before
     220             :    fd_xsk_init.
     221             : 
     222             :    May issue the following syscalls:
     223             :    
     224             :    - munmap 
     225             :    - close */
     226             : 
     227             : fd_xsk_t *
     228             : fd_xsk_fini( fd_xsk_t * xsk );
     229             : 
     230             : /* fd_xsk_leave leaves a current local join and releases all kernel
     231             :    resources.  Returns a pointer to the underlying shared memory region
     232             :    on success and NULL on failure (logs details).  Reasons for failure
     233             :    include xsk is NULL. */
     234             : 
     235             : void *
     236             : fd_xsk_leave( fd_xsk_t * xsk );
     237             : 
     238             : /* fd_xsk_delete unformats a memory region used as an fd_xsk_t. Assumes
     239             :    nobody is joined to the region.  Returns a pointer to the underlying
     240             :    shared memory region or NULL if used obviously in error (e.g. shxsk
     241             :    does not point to an fd_xsk_t ... logs details).  The ownership of
     242             :    the memory region is transferred to the caller on success. */
     243             : 
     244             : void *
     245             : fd_xsk_delete( void * shxsk );
     246             : 
     247             : /* I/O API ************************************************************/
     248             : 
     249             : /* fd_xsk_rx_enqueue: Enqueues a batch of frames for RX.
     250             : 
     251             :    An RX enqueue transfers ownership of frames to the kernel using the
     252             :    fill ring, providing it space for incoming packet data.  Successful
     253             :    enqueue does not imply that packets have actually been received, but
     254             :    rather just indicates that the frame memory is registered with the
     255             :    AF_XDP socket.
     256             : 
     257             :    offsets points to an array containing offsets_cnt items.
     258             :    Each offsets[k] for k in [0;offsets_cnt-1] is the frame's byte offset
     259             :    relative to the start of the UMEM region.  Returns the number of
     260             :    frames n enqueued where n<=offsets_cnt.  Each frame (identified by
     261             :    its offset) may not be reused in another enqueue until it is returned
     262             :    in fd_xsk_rx_complete.  The frames that failed to enqueue are in
     263             :    [n;offsets_cnt-1] and may be retried in a later call. */
     264             : 
     265             : ulong
     266             : fd_xsk_rx_enqueue( fd_xsk_t * xsk,
     267             :                    ulong *    offsets,
     268             :                    ulong      offsets_cnt );
     269             : 
     270             : /* fd_xsk_rx_enqueue2: See fd_xsk_rx_enqueue.
     271             : 
     272             :    meta points to an array containing meta_cnt items.  For each k in
     273             :    [0;meta_cnt-1], meta[k].off is the frame's byte offset relative to
     274             :    the start of the UMEM region.  meta[k].{sz,flags} are ignored. */
     275             : 
     276             : ulong
     277             : fd_xsk_rx_enqueue2( fd_xsk_t *            xsk,
     278             :                     fd_xsk_frame_meta_t * meta,
     279             :                     ulong                 meta_cnt );
     280             : 
     281             : /* fd_xsk_rx_complete: Receives RX completions for a batch of frames.
     282             : 
     283             :    An RX completion means that a packet has been received and transfers
     284             :    ownership of the frame holding the packet over to userspace.
     285             :    meta_cnt is the number of packets that the caller is able to receive.
     286             :    meta points to an array of meta_cnt records; for each k in
     287             :    [0;meta_cnt-1], a packet meta may be written to meta[k].  Returns
     288             :    the number of packets received, which may be less than meta_cnt. */
     289             : 
     290             : ulong
     291             : fd_xsk_rx_complete( fd_xsk_t *            xsk,
     292             :                     fd_xsk_frame_meta_t * meta,
     293             :                     ulong                 meta_cnt );
     294             : 
     295             : 
     296             : /* fd_xsk_tx_enqueue: Enqueues a batch of frames for TX.
     297             : 
     298             :    meta_cnt is the number of packets to attempt to enqueue for transmit.
     299             :    meta points to an array containing meta_cnt records where each k in
     300             :    [0;meta_cnt-1] enqueues the frame at meta[k].  Returns the number of
     301             :    frames n actually enqueued where n<=meta_cnt.  Successful enqueue
     302             :    does not imply that packets have actually been sent out to the
     303             :    network, but rather just indicates that the frame memory is
     304             :    registered with the AF_XDP socket.  The frames that failed to
     305             :    enqueue are in [n;meta_cnt-1] and may be retried in a later
     306             :    call.  TODO: document the flush argument. */
     307             : 
     308             : ulong
     309             : fd_xsk_tx_enqueue( fd_xsk_t *            xsk,
     310             :                    fd_xsk_frame_meta_t * meta,
     311             :                    ulong                 meta_cnt,
     312             :                    int                   flush );
     313             : 
     314             : 
     315             : /* fd_xsk_tx_complete: Check for TX completions and reclaim frames.
     316             : 
     317             :    A TX completion occurs when a previously enqueued TX packet has been
     318             :    fully handed off to the NIC or dropped.  This transfers the ownership
     319             :    of the corresponding frame back to the XSK, where the caller can
     320             :    retrieve it for future writes using this function.  Note that this
     321             :    does not guarantee successful delivery to the network destination.
     322             : 
     323             :    offsets points to an array containing offsets_cnt items.
     324             :    Returns the number of frames n completed where n<=offsets_cnt.
     325             :    Each k in [0;n-1] writes a completion at offsets[k] where offsets[k]
     326             :    is the frame byte offset relative to the start of the UMEM region. */
     327             : 
     328             : ulong
     329             : fd_xsk_tx_complete( fd_xsk_t * xsk,
     330             :                     ulong *    offsets,
     331             :                     ulong      offsets_cnt );
     332             : 
     333             : /* fd_xsk_tx_complete2: See fd_xsk_tx_complete.
     334             : 
     335             :    fd_xsk_tx_complete2 behaves similarly to fd_xsk_tx_complete, except
     336             :    that it takes a pointer to an array of fd_xsk_frame_meta_t instead
     337             :    of ulong.  meta points to an array containing meta_cnt items.  For
     338             :    the returned count n<=meta_cnt, each k in [0;n-1] writes a frame
     339             :    meta at meta[k] where meta[k].off is the frame byte offset relative
     340             :    to the UMEM region's start and `meta[k].{sz,flags}` are undefined. */
     341             : 
     342             : ulong
     343             : fd_xsk_tx_complete2( fd_xsk_t *            xsk,
     344             :                      fd_xsk_frame_meta_t * meta,
     345             :                      ulong                 meta_cnt );
     346             : 
     347             : /* fd_xsk_fd: Returns the XSK file descriptor. */
     348             : 
     349             : FD_FN_PURE int
     350             : fd_xsk_fd( fd_xsk_t * const xsk );
     351             : 
     352             : /* fd_xsk_ifidx: Returns the index of the network interface that the
     353             :    XSK is currently bound to.  May return zero if the XSK is not bound. */
     354             : 
     355             : FD_FN_PURE uint
     356             : fd_xsk_ifidx( fd_xsk_t * const xsk );
     357             : 
     358             : /* fd_xsk_ifqueue: Returns the queue index that the XSK is currently
     359             :    bound to (a network interface can have multiple queues).  Undefined
     360             :    behavior if fd_xsk_ifname() returns NULL. */
     361             : 
     362             : FD_FN_PURE uint
     363             : fd_xsk_ifqueue( fd_xsk_t * const xsk );
     364             : 
     365             : /* fd_xsk_umem_laddr returns a pointer to the XSK frame memory region in
     366             :    the caller's local address space. */
     367             : 
     368             : FD_FN_CONST void *
     369             : fd_xsk_umem_laddr( fd_xsk_t * xsk );
     370             : 
     371             : /* fd_xsk_get_params returns a pointer to the memory layout params of
     372             :    xsk.  xsk must be a valid join to fd_xsk_t.  The function takes no
     373             :    output buffer: the returned pointer is read-only (const) and the
     374             :    params struct it points to is valid during the lifetime of the
     375             :    xsk. */
     376             : 
     377             : FD_FN_CONST fd_xsk_params_t const *
     378             : fd_xsk_get_params( fd_xsk_t const * xsk );
     379             : 
     380             : FD_PROTOTYPES_END
     381             : 
     382             : #endif /* defined(__linux__) */
     383             : #endif /* HEADER_fd_src_waltz_xdp_fd_xsk_h */

Generated by: LCOV version 1.14