#ifndef HEADER_fd_src_vinyl_io_ur_fd_vinyl_io_ur_private_h
#define HEADER_fd_src_vinyl_io_ur_fd_vinyl_io_ur_private_h

/* io_uring-based I/O backend.  This backend implements a number of
   advanced optimizations:

   1. Write-back cache: all bstream writes are buffered in memory and
      confirmed instantly, before the actual write operations are
      attempted.  This significantly reduces latency for append and
      commit ops.
   2. In-place writes: normal alloc/append/commit usage emplaces new
      blocks directly into the write buffer.
   3. Direct I/O: writes bypass the page cache for improved performance
      (reads still use the page cache, though).
   4. Fully async: reads and writes are enqueued via io_uring and can
      be arbitrarily interleaved.

   Consequently, this backend's methods behave differently from what
   fd_vinyl_io.h documents:

   - commit does not empty the scratch pad */

#include "fd_vinyl_io_ur.h"
#include "wb_ring.h"
#include "wq_ring.h"
#include <errno.h>
#include <unistd.h> /* pread, pwrite */

/* WQ_DEPTH sets the max number of write queue jobs that io_ur can track
   at once.

   WQ_BLOCK_SZ sets the write block size.

   WQ_DEPTH*WQ_BLOCK_SZ is thus the write window size (bandwidth-delay
   product).  For example, with a 32 MiB window size and 1 ms write
   latency, the max write rate is ~32 GiB/s. */

#define WQ_DEPTH    (64UL)
#define WQ_BLOCK_SZ (1UL<<19) /* 512 KiB */
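
/* A minimal sketch of the sizing arithmetic above (the 1 ms write
   latency is an illustrative assumption, not a measured figure):

     ulong  window_sz = WQ_DEPTH*WQ_BLOCK_SZ;         // 64*(1UL<<19) = 33554432 B = 32 MiB
     double latency_s = 1e-3;                         // assumed per-write round-trip latency
     double rate_Bps  = (double)window_sz/latency_s;  // ~3.4e10 B/s, i.e. the ~32 GiB/s above
*/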

/* fd_vinyl_io_ur_rd_t extends fd_vinyl_io_rd_t.  Describes an inflight
   read request.  Each object gets created with an fd_vinyl_io_read()
   call, lives at least as long as its io_uring SQE/CQE transaction,
   and gets destroyed with fd_vinyl_io_poll().

   Each fd_vinyl_io_rd_t describes a contiguous read in bstream seq
   space.  When mapped to the device, this typically results in a single
   contiguous read. */

struct fd_vinyl_io_ur_rd;
typedef struct fd_vinyl_io_ur_rd fd_vinyl_io_ur_rd_t;

struct fd_vinyl_io_ur_rd {
  ulong  ctx; /* Must mirror fd_vinyl_io_rd_t */
  ulong  seq; /* " */
  void * dst; /* " */
  ulong  sz;  /* " */

  fd_vinyl_io_ur_rd_t * next; /* Next element in ur rd queue */

  uint head_off; uint head_sz;
  uint tail_off; uint tail_sz;
};

FD_STATIC_ASSERT( sizeof(fd_vinyl_io_ur_rd_t)<=sizeof(fd_vinyl_io_rd_t), layout );
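
/* The static assert above is what makes the following aliasing pattern
   safe: a caller-provided fd_vinyl_io_rd_t is large enough to be
   reinterpreted as the backend-private descriptor and the leading
   ctx/seq/dst/sz fields line up.  (ur_rd is a hypothetical helper name,
   shown only for illustration.)

     static inline fd_vinyl_io_ur_rd_t *
     ur_rd( fd_vinyl_io_rd_t * rd ) {
       return (fd_vinyl_io_ur_rd_t *)rd;
     }
*/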

/* fd_vinyl_io_ur_t extends fd_vinyl_io_t. */

struct fd_vinyl_io_ur {
  fd_vinyl_io_t base[1];
  int   dev_fd;   /* File descriptor of block device */
  ulong dev_sync; /* Offset to block that holds bstream sync (BLOCK_SZ multiple) */
  ulong dev_base; /* Offset to first block (BLOCK_SZ multiple) */
  ulong dev_sz;   /* Block store byte size (BLOCK_SZ multiple) */
  fd_vinyl_bstream_block_t sync[1];

  /* reads waiting to be submitted to io_uring (push/pop sketch follows
     this struct) */
  fd_vinyl_io_ur_rd_t *  rq_head;      /* Pointer to queue head */
  fd_vinyl_io_ur_rd_t ** rq_tail_next; /* Pointer to queue &tail->next, or &rq_head if empty */

  /* reads completed early, awaiting poll() */
  fd_vinyl_io_ur_rd_t *  rc_head;      /* Pointer to queue head */
  fd_vinyl_io_ur_rd_t ** rc_tail_next; /* Pointer to queue &tail->next, or &rc_head if empty */

  fd_io_uring_t * ring;
  ulong sqe_prep_cnt;       /* SQEs prepared */
  ulong sqe_sent_cnt;       /* SQEs submitted */
  ulong cqe_cnt;            /* CQEs received */
  uint  cqe_pending;        /* Total CQEs pending */
  uint  cqe_read_pending;   /* CQEs for reads pending */
  uint  cqe_write_pending;  /* CQEs for writes pending */
  ulong cqe_read_short_cnt; /* CQEs with short reads received */

  /* write-back cache */
  wb_ring_t wb; /* write buffer */
  ulong  seq_cache;
  ulong  seq_clean;
  ulong  seq_write;
  void * last_alloc;

  struct {
    wq_ring_t wq; /* write queue */
    wq_desc_t _desc[ WQ_DEPTH ];
  };

  /* write-back cache contents follow */
};

typedef struct fd_vinyl_io_ur fd_vinyl_io_ur_t;
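
/* Illustrative sketch (not part of the API) of how the rq_head /
   rq_tail_next pointer pair implements an intrusive FIFO: the tail
   pointer always addresses the slot that the next element's address
   must be stored into (&tail->next, or &rq_head when the queue is
   empty).  The rc queue works identically.  (rq_push/rq_pop are
   hypothetical helper names.)

     static inline void
     rq_push( fd_vinyl_io_ur_t * io, fd_vinyl_io_ur_rd_t * rd ) {
       rd->next            = NULL;
       *(io->rq_tail_next) = rd;         // link after current tail (or set head)
       io->rq_tail_next    = &rd->next;  // rd is the new tail
     }

     static inline fd_vinyl_io_ur_rd_t *
     rq_pop( fd_vinyl_io_ur_t * io ) {
       fd_vinyl_io_ur_rd_t * rd = io->rq_head;
       if( !rd ) return NULL;                               // queue empty
       io->rq_head = rd->next;
       if( !io->rq_head ) io->rq_tail_next = &io->rq_head;  // queue drained; reset tail
       return rd;
     }
*/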

FD_PROTOTYPES_BEGIN

/* fd_vinyl_io_ur_wb_buf returns a pointer to the first byte of the
   write-back buffer.  Offsets returned by wb_ring.h are compatible with
   this base pointer. */

static inline uchar *
fd_vinyl_io_ur_wb_buf( fd_vinyl_io_ur_t * io ) {
  return (uchar *)( io+1 );
}
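
/* Minimal usage sketch: wb_ring.h hands out byte offsets into the
   write-back buffer; adding one to this base pointer yields the block's
   in-memory address.  (off is an assumed offset previously obtained
   from the wb ring, shown only for illustration.)

     uchar * block = fd_vinyl_io_ur_wb_buf( io ) + off;
*/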

/* Blocking read/write APIs */

static inline void
bd_read( int    fd,
         ulong  off,
         void * buf,
         ulong  sz ) {
  ssize_t ssz = pread( fd, buf, sz, (off_t)off );
  if( FD_LIKELY( ssz==(ssize_t)sz ) ) return;
  if( ssz<(ssize_t)0 ) FD_LOG_CRIT(( "pread(fd %i,off %lu,sz %lu) failed (%i-%s)", fd, off, sz, errno, fd_io_strerror( errno ) ));
  /**/                 FD_LOG_CRIT(( "pread(fd %i,off %lu,sz %lu) failed (unexpected sz %li)", fd, off, sz, (long)ssz ));
}

static inline void
bd_write( int          fd,
          ulong        off,
          void const * buf,
          ulong        sz ) {
  ssize_t ssz = pwrite( fd, buf, sz, (off_t)off );
  if( FD_LIKELY( ssz==(ssize_t)sz ) ) return;
  if( ssz<(ssize_t)0 ) FD_LOG_CRIT(( "pwrite(fd %i,off %lu,sz %lu) failed (%i-%s)", fd, off, sz, errno, fd_io_strerror( errno ) ));
  else                 FD_LOG_CRIT(( "pwrite(fd %i,off %lu,sz %lu) failed (unexpected sz %li)", fd, off, sz, (long)ssz ));
}
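
/* Minimal usage sketch: both helpers either transfer exactly sz bytes or
   abort via FD_LOG_CRIT, so callers need no error handling.  E.g.
   loading the bstream sync block (treating the on-device sync record as
   exactly sizeof(fd_vinyl_bstream_block_t) bytes is an assumption made
   for illustration):

     bd_read( io->dev_fd, io->dev_sync, io->sync, sizeof(fd_vinyl_bstream_block_t) );
*/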

/* vinyl_io read API, provided by fd_vinyl_io_ur_rd.c *****************/

/* fd_vinyl_io_ur_read_imm does a synchronous blocking read. */

void
fd_vinyl_io_ur_read_imm( fd_vinyl_io_t * io,
                         ulong           seq0,
                         void *          _dst,
                         ulong           sz );

/* fd_vinyl_io_ur_read enqueues an asynchronous read. */

void
fd_vinyl_io_ur_read( fd_vinyl_io_t *    io,
                     fd_vinyl_io_rd_t * _rd );

/* fd_vinyl_io_ur_poll polls for the next read completion. */

int
fd_vinyl_io_ur_poll( fd_vinyl_io_t *     io,
                     fd_vinyl_io_rd_t ** _rd,
                     int                 flags );
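
/* Minimal usage sketch of the async read path.  The placeholder names
   (vio, my_ctx, seq0, dst, sz), the flag value of 0, the assumption
   that the caller pre-fills the mirrored ctx/seq/dst/sz fields, and the
   assumption that a positive poll return hands back a completed read
   are all illustrative; see fd_vinyl_io.h for the authoritative
   contract.

     fd_vinyl_io_rd_t rd[1];
     rd->ctx = my_ctx;  rd->seq = seq0;  rd->dst = dst;  rd->sz = sz;
     fd_vinyl_io_ur_read( vio, rd );
     // ... later ...
     fd_vinyl_io_rd_t * done;
     if( fd_vinyl_io_ur_poll( vio, &done, 0 )>0 ) {
       // done->dst now holds done->sz bytes starting at bstream seq done->seq
     }
*/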

/* vinyl_io write API, provided by fd_vinyl_io_ur_wb.c ****************/

void *
fd_vinyl_io_ur_alloc( fd_vinyl_io_t * io,
                      ulong           sz,
                      int             flags );

ulong
fd_vinyl_io_ur_append( fd_vinyl_io_t * io,
                       void const *    _src,
                       ulong           sz );

ulong
fd_vinyl_io_ur_copy( fd_vinyl_io_t * io,
                     ulong           seq_src0,
                     ulong           sz );

int
fd_vinyl_io_ur_commit( fd_vinyl_io_t * io,
                       int             flags );

ulong
fd_vinyl_io_ur_hint( fd_vinyl_io_t * io,
                     ulong           sz );

int
fd_vinyl_io_ur_sync( fd_vinyl_io_t * io,
                     int             flags );

void
fd_vinyl_io_ur_forget( fd_vinyl_io_t * io,
                       ulong           seq );

void
fd_vinyl_io_ur_rewind( fd_vinyl_io_t * io,
                       ulong           seq );
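
/* Minimal usage sketch of the buffered write path.  The flag values of
   0, the placeholder names vio/my_blob/my_sz, and the assumption that
   append returns the bstream seq of the appended data are illustrative;
   note that, per the header comment above, commit does not empty the
   scratch pad on this backend.

     ulong seq = fd_vinyl_io_ur_append( vio, my_blob, my_sz ); // lands in the write-back cache
     fd_vinyl_io_ur_commit( vio, 0 );                          // confirmed instantly; device write is async
*/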

/* Auxiliary write path functions */

void
fd_vinyl_io_wq_completion( fd_vinyl_io_ur_t * io );

/* io_uring userdata encoding ******************************************

   io_uring userdata are arbitrary 64-bit words that are provided in an
   SQE and echoed back in the corresponding CQE.  We use the userdata to
   encode which request completed upon CQE receipt.  We minimally need
   to pack the request type (read or write) and the request identifier.
   For the write path, this is an index; for the read path, this is a
   pointer to the descriptor.  Pointers fit into the remaining 61 bits
   because the low 3 bits of an 8 byte aligned pointer are always
   zero. */

#define UR_REQ_READ      0 /* read SQE */
#define UR_REQ_READ_TAIL 1 /* read SQE, tail wraparound at end of bstream */
#define UR_REQ_WRITE     2 /* write SQE */

#define UR_REQ_TYPE_WIDTH 3
#define UR_REQ_TYPE_MASK  ((1UL<<UR_REQ_TYPE_WIDTH)-1UL)

static inline ulong
ur_udata_pack_idx( ulong req_type, /* UR_REQ_* */
                   ulong idx ) {
  return (idx<<UR_REQ_TYPE_WIDTH) | (req_type & UR_REQ_TYPE_MASK);
}

static inline ulong
ur_udata_pack_ptr( ulong  req_type,
                   void * ptr ) {
  return ( ((ulong)ptr) & ~UR_REQ_TYPE_MASK ) | (req_type & UR_REQ_TYPE_MASK);
}

static inline ulong
ur_udata_req_type( ulong udata ) {
  return udata & UR_REQ_TYPE_MASK;
}

static inline ulong
ur_udata_idx( ulong udata ) {
  return udata >> UR_REQ_TYPE_WIDTH;
}

static inline void *
ur_udata_ptr( ulong udata ) {
  return (void *)( udata & ~UR_REQ_TYPE_MASK );
}
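
/* Illustrative round trips through the encoding above (wq_idx and rd
   are assumed locals: a write queue index and an 8 byte aligned read
   descriptor pointer):

     ulong wr_udata = ur_udata_pack_idx( UR_REQ_WRITE, wq_idx );
     // on CQE receipt: ur_udata_req_type( wr_udata )==UR_REQ_WRITE,
     //                 ur_udata_idx     ( wr_udata )==wq_idx

     ulong rd_udata = ur_udata_pack_ptr( UR_REQ_READ, rd );
     // on CQE receipt: ur_udata_req_type( rd_udata )==UR_REQ_READ,
     //                 ur_udata_ptr     ( rd_udata )==(void *)rd
*/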

FD_PROTOTYPES_END

#endif /* HEADER_fd_src_vinyl_io_ur_fd_vinyl_io_ur_private_h */