LCOV - code coverage report
Current view: top level - vinyl/io/ur - fd_vinyl_io_ur_private.h (source / functions) Hit Total Coverage
Test: cov.lcov Lines: 0 37 0.0 %
Date: 2026-02-13 06:06:24 Functions: 0 40 0.0 %

          Line data    Source code
       1             : #ifndef HEADER_fd_src_vinyl_io_ur_fd_vinyl_io_ur_private_h
       2             : #define HEADER_fd_src_vinyl_io_ur_fd_vinyl_io_ur_private_h
       3             : 
       4             : /* io_uring-based I/O backend.  This backend implements a number of
       5             :    advanced optimizations:
       6             : 
       7             :    1. Write-back cache: all bstream writes are buffered in memory and
       8             :       confirmed instantly, before attempting actual write operations.
       9             :       This significantly reduces latency for append and commit ops.
      10             :    2. In-place writes: normal alloc/append/commit usage directly
      11             :       emplaces new blocks into the write buffer.
      12             :    3. Direct I/O: Writes bypass the page cache for improved performance
      13             :       (reads still use the page cache, though).
      14             :    4. Fully async: Reads and writes are enqueued via io_uring and can
      15             :       be arbitrarily interleaved.
      16             : 
      17             :    Consequently, this backend's methods behave differently than
      18             :    fd_vinyl_io.h documents:
      19             : 
      20             :    - commit does not empty the scratch pad */
      21             : 
      22             : #include "fd_vinyl_io_ur.h"
      23             : #include "wb_ring.h"
      24             : #include "wq_ring.h"
      25             : #include <errno.h>
      26             : #include <unistd.h> /* pread, pwrite */
      27             : 
      28             : /* WQ_DEPTH sets the max number of write queue jobs that io_ur can track
      29             :    at once.
      30             : 
      31             :    WQ_BLOCK_SZ sets the write block size.
      32             : 
      33             :    WQ_DEPTH*WQ_BLOCK_SZ is thus the write window size (bandwidth-delay
      34             :    product).  For example, with a 32 MiB window size, and 1ms write
      35             :    latency, the max write rate is ~ 32 GiB/s. */
      36             : 
      37           0 : #define WQ_DEPTH    (64UL)
      38           0 : #define WQ_BLOCK_SZ (1UL<<19) /* 512KiB */
      39             : 
      40             : /* fd_vinyl_io_ur_rd_t extends fd_vinyl_io_rd_t.  Describes an inflight
      41             :    read request.  Each object gets created with a fd_vinyl_io_read()
      42             :    call, has at least the lifetime of an io_uring SQE/CQE transaction,
      43             :    and gets destroyed with fd_vinyl_io_poll().
      44             : 
      45             :    Each fd_vinyl_io_rd_t describes a contiguous read in bstream seq
      46             :    space.  When mapped to the device, this typically results in a single
      47             :    contiguous read. */
      48             : 
      49             : struct fd_vinyl_io_ur_rd;
      50             : typedef struct fd_vinyl_io_ur_rd fd_vinyl_io_ur_rd_t;
      51             : 
      52             : struct fd_vinyl_io_ur_rd {
      53             :   ulong                 ctx;  /* Must mirror fd_vinyl_io_rd_t */
      54             :   ulong                 seq;  /* " */
      55             :   void *                dst;  /* " */
      56             :   ulong                 sz;   /* " */
      57             : 
      58             :   fd_vinyl_io_ur_rd_t * next; /* Next element in ur rd queue */
      59             : 
      60             :   uint head_off;  uint head_sz;
      61             :   uint tail_off;  uint tail_sz;
      62             : };
      63             : 
      64             : FD_STATIC_ASSERT( sizeof(fd_vinyl_io_ur_rd_t)<=sizeof(fd_vinyl_io_rd_t), layout );
      65             : 
      66             : /* fd_vinyl_io_ur_t extends fd_vinyl_io_t. */
      67             : 
      68             : struct fd_vinyl_io_ur {
      69             :   fd_vinyl_io_t            base[1];
      70             :   int                      dev_fd;       /* File descriptor of block device */
      71             :   ulong                    dev_sync;     /* Offset to block that holds bstream sync (BLOCK_SZ multiple) */
      72             :   ulong                    dev_base;     /* Offset to first block (BLOCK_SZ multiple) */
      73             :   ulong                    dev_sz;       /* Block store byte size (BLOCK_SZ multiple) */
      74             :   fd_vinyl_bstream_block_t sync[1];
      75             : 
      76             :   /* reads waiting to be submitted to io_uring */
      77             :   fd_vinyl_io_ur_rd_t *    rq_head;      /* Pointer to queue head */
      78             :   fd_vinyl_io_ur_rd_t **   rq_tail_next; /* Pointer to queue &tail->next or &rq_head if empty. */
      79             : 
      80             :   /* reads completed early, awaiting poll() */
      81             :   fd_vinyl_io_ur_rd_t *    rc_head;      /* Pointer to queue head */
      82             :   fd_vinyl_io_ur_rd_t **   rc_tail_next; /* Pointer to queue &tail->next or &rc_head if empty. */
      83             : 
      84             :   fd_io_uring_t * ring;
      85             :   ulong sqe_prep_cnt;        /* SQEs prepared */
      86             :   ulong sqe_sent_cnt;        /* SQEs submitted */
      87             :   ulong cqe_cnt;             /* CQEs received */
      88             :   uint  cqe_pending;         /* Total CQEs pending */
      89             :   uint  cqe_read_pending;    /* CQEs for reads  pending */
      90             :   uint  cqe_write_pending;   /* CQEs for writes pending */
      91             :   ulong cqe_read_short_cnt;  /* CQEs with short reads received */
      92             : 
      93             :   /* write-back cache */
      94             :   wb_ring_t wb; /* write buffer */
      95             :   ulong     seq_cache;
      96             :   ulong     seq_clean;
      97             :   ulong     seq_write;
      98             :   void *    last_alloc;
      99             : 
     100             :   struct {
     101             :     wq_ring_t wq; /* write queue */
     102             :     wq_desc_t _desc[ WQ_DEPTH ];
     103             :   };
     104             : 
     105             :   /* write-back cache contents follow */
     106             : };
     107             : 
     108             : typedef struct fd_vinyl_io_ur fd_vinyl_io_ur_t;
     109             : 
     110             : FD_PROTOTYPES_BEGIN
     111             : 
     112             : /* fd_vinyl_io_ur_wb_buf returns a pointer to the first byte of the
     113             :    write-back buffer.  Offsets returned by wb_ring.h are compatible with
     114             :    this base pointer. */
     115             : 
     116             : static inline uchar *
     117           0 : fd_vinyl_io_ur_wb_buf( fd_vinyl_io_ur_t * io ) {
     118           0 :   return (uchar *)( io+1 );
     119           0 : }
     120             : 
     121             : /* Blocking read/write APIs */
     122             : 
     123             : static inline void
     124             : bd_read( int    fd,
     125             :          ulong  off,
     126             :          void * buf,
     127           0 :          ulong  sz ) {
     128           0 :   ssize_t ssz = pread( fd, buf, sz, (off_t)off );
     129           0 :   if( FD_LIKELY( ssz==(ssize_t)sz ) ) return;
     130           0 :   if( ssz<(ssize_t)0 ) FD_LOG_CRIT(( "pread(fd %i,off %lu,sz %lu) failed (%i-%s)", fd, off, sz, errno, fd_io_strerror( errno ) ));
     131           0 :   /**/                 FD_LOG_CRIT(( "pread(fd %i,off %lu,sz %lu) failed (unexpected sz %li)", fd, off, sz, (long)ssz ));
     132           0 : }
     133             : 
     134             : static inline void
     135             : bd_write( int          fd,
     136             :           ulong        off,
     137             :           void const * buf,
     138           0 :           ulong        sz ) {
     139           0 :   ssize_t ssz = pwrite( fd, buf, sz, (off_t)off );
     140           0 :   if( FD_LIKELY( ssz==(ssize_t)sz ) ) return;
     141           0 :   if( ssz<(ssize_t)0 ) FD_LOG_CRIT(( "pwrite(fd %i,off %lu,sz %lu) failed (%i-%s)", fd, off, sz, errno, fd_io_strerror( errno ) ));
     142           0 :   else                 FD_LOG_CRIT(( "pwrite(fd %i,off %lu,sz %lu) failed (unexpected sz %li)", fd, off, sz, (long)ssz ));
     143           0 : }
     144             : 
     145             : /* vinyl_io read API, provided by fd_vinyl_io_ur_rd.c *****************/
     146             : 
     147             : /* fd_vinyl_io_ur_read_imm does a synchronous blocking read. */
     148             : 
     149             : void
     150             : fd_vinyl_io_ur_read_imm( fd_vinyl_io_t * io,
     151             :                          ulong           seq0,
     152             :                          void *          _dst,
     153             :                          ulong           sz );
     154             : 
     155             : /* fd_vinyl_io_ur_read enqueues an asynchronous read. */
     156             : 
     157             : void
     158             : fd_vinyl_io_ur_read( fd_vinyl_io_t *    io,
     159             :                      fd_vinyl_io_rd_t * _rd );
     160             : 
     161             : /* fd_vinyl_io_ur_poll polls for the next read completion. */
     162             : 
     163             : int
     164             : fd_vinyl_io_ur_poll( fd_vinyl_io_t *     io,
     165             :                      fd_vinyl_io_rd_t ** _rd,
     166             :                      int                 flags );
     167             : 
     168             : /* vinyl_io write API, provided by fd_vinyl_io_ur_wb.c ****************/
     169             : 
     170             : void *
     171             : fd_vinyl_io_ur_alloc( fd_vinyl_io_t * io,
     172             :                       ulong           sz,
     173             :                       int             flags );
     174             : 
     175             : ulong
     176             : fd_vinyl_io_ur_append( fd_vinyl_io_t * io,
     177             :                        void const *    _src,
     178             :                        ulong           sz );
     179             : 
     180             : ulong
     181             : fd_vinyl_io_ur_copy( fd_vinyl_io_t * io,
     182             :                      ulong           seq_src0,
     183             :                      ulong           sz );
     184             : 
     185             : int
     186             : fd_vinyl_io_ur_commit( fd_vinyl_io_t * io,
     187             :                        int             flags );
     188             : 
     189             : ulong
     190             : fd_vinyl_io_ur_hint( fd_vinyl_io_t * io,
     191             :                      ulong           sz );
     192             : 
     193             : int
     194             : fd_vinyl_io_ur_sync( fd_vinyl_io_t * io,
     195             :                      int             flags );
     196             : 
     197             : void
     198             : fd_vinyl_io_ur_forget( fd_vinyl_io_t * io,
     199             :                        ulong           seq );
     200             : 
     201             : void
     202             : fd_vinyl_io_ur_rewind( fd_vinyl_io_t * io,
     203             :                        ulong           seq );
     204             : 
     205             : /* Auxiliary write path functions */
     206             : 
     207             : void
     208             : fd_vinyl_io_wq_completion( fd_vinyl_io_ur_t * io );
     209             : 
     210             : /* io_uring userdata encoding ******************************************
     211             : 
     212             :    io_uring userdata are arbitrary 64-bit words that are provided in SQE
     213             :    and echoed back in corresponding CQE.  We use the userdata to encode
     214             :    which request completed upon CQE receipt.  We need to minimally pack
     215             :    the request type (read or write) and the request identifier.  For the
     216             :    write path, this is an index; for the read path, this is a pointer to
     217             :    the descriptor.  Pointers are compressed to 61 bits (since the low
     218             :    3 bits are always zero for 8 byte aligned pointers). */
     219             : 
     220           0 : #define UR_REQ_READ      0  /* read SQE */
     221           0 : #define UR_REQ_READ_TAIL 1  /* read SQE, tail wraparound at end of bstream */
     222           0 : #define UR_REQ_WRITE     2  /* write SQE */
     223             : 
     224           0 : #define UR_REQ_TYPE_WIDTH 3
     225           0 : #define UR_REQ_TYPE_MASK ((1UL<<UR_REQ_TYPE_WIDTH)-1UL)
     226             : 
     227             : static inline ulong
     228             : ur_udata_pack_idx( ulong req_type, /* UR_REQ_* */
     229           0 :                    ulong idx ) {
     230           0 :   return (idx<<UR_REQ_TYPE_WIDTH) | (req_type & UR_REQ_TYPE_MASK);
     231           0 : }
     232             : 
     233             : static inline ulong
     234             : ur_udata_pack_ptr( ulong  req_type,
     235           0 :                    void * ptr ) {
     236           0 :   return ( ((ulong)ptr) & ~UR_REQ_TYPE_MASK ) | (req_type & UR_REQ_TYPE_MASK);
     237           0 : }
     238             : 
     239             : static inline ulong
     240           0 : ur_udata_req_type( ulong udata ) {
     241           0 :   return udata & UR_REQ_TYPE_MASK;
     242           0 : }
     243             : 
     244             : static inline ulong
     245           0 : ur_udata_idx( ulong udata ) {
     246           0 :   return udata >> UR_REQ_TYPE_WIDTH;
     247           0 : }
     248             : 
     249             : static inline void *
     250           0 : ur_udata_ptr( ulong udata ) {
     251           0 :   return (void *)( udata & ~UR_REQ_TYPE_MASK );
     252           0 : }
     253             : 
     254             : FD_PROTOTYPES_END
     255             : 
     256             : #endif /* HEADER_fd_src_vinyl_io_ur_fd_vinyl_io_ur_private_h */

Generated by: LCOV version 1.14