#define _GNU_SOURCE
#include "fd_io_uring_setup.h"
#include "../shmem/fd_shmem.h"
#include <errno.h>
#include <sys/mman.h> /* mmap */
#include <unistd.h>   /* close */

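/* FD_IO_URING_SHMEM_HEADROOM reserves space at the head of the CQ
   region for the kernel's private 'struct io_rings'.  The true
   requirement depends on the kernel version, so a conservative 4 KiB
   is used (see fd_io_uring_shmem_layout below). */
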
#define FD_IO_URING_SHMEM_HEADROOM (4096UL)

ulong
fd_io_uring_shmem_align( void ) {
  return FD_SHMEM_NORMAL_PAGE_SZ;
}

struct fd_io_uring_shmem_layout {
  /* Offset of the completion queue memory region.  This region
     contains the ring registers (head/tail indices), the submission
     queue index array, and the completion queue itself (an array of
     CQEs).  (Do not assume this offset points at a CQE.) */
  ulong cq_off;

  /* Offset of the SQE array */
  ulong sqe_off;
};

typedef struct fd_io_uring_shmem_layout fd_io_uring_shmem_layout_t;
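
/* fd_io_uring_shmem_layout computes the shared memory layout for the
   given queue depths.  On success, fills in layout and returns the
   total footprint in bytes.  Returns 0 if a depth is not a power of 2
   or exceeds UINT_MAX. */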

static ulong
fd_io_uring_shmem_layout( fd_io_uring_shmem_layout_t * layout,
                          ulong                        sq_depth,
                          ulong                        cq_depth ) {
  memset( layout, 0, sizeof(fd_io_uring_shmem_layout_t) );

  if( FD_UNLIKELY( !fd_ulong_is_pow2( sq_depth ) ) ) return 0UL;
  if( FD_UNLIKELY( !fd_ulong_is_pow2( cq_depth ) ) ) return 0UL;
  if( FD_UNLIKELY( sq_depth>UINT_MAX ) ) return 0UL;
  if( FD_UNLIKELY( cq_depth>UINT_MAX ) ) return 0UL;

  /* The products are recomputed at the APPEND sites below; these two
     checks only guard against multiplication overflow. */

  ulong cq_sz;
  if( FD_UNLIKELY( __builtin_umull_overflow( cq_depth, sizeof(struct io_uring_cqe), &cq_sz ) ) ) return 0UL;
  ulong sqa_sz;
  if( FD_UNLIKELY( __builtin_umull_overflow( sq_depth, sizeof(uint), &sqa_sz ) ) ) return 0UL;

  /* io_uring CQ region

     This layout computation matches Linux io_uring.c rings_size():
     https://elixir.bootlin.com/linux/v6.11.5/source/io_uring/io_uring.c#L2559 */

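  /* Sketch of the resulting region order (offsets rounded up to the
     stated alignments by the scratch allocator):

       cq_off  -> [ io_rings header, <=4 KiB headroom ]  (page aligned)
                  [ CQE array                         ]  (128B aligned)
                  [ SQ index array                    ]  (128B aligned)
       sqe_off -> [ SQE array                         ]  (page aligned) */
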
  FD_SCRATCH_ALLOC_INIT( l, NULL );

  /* The true footprint requirement depends on the kernel version.  The
     head part of this region is 'struct io_rings', which is not stable
     ABI.  We use a very conservative 4 KiB here. */

  layout->cq_off = (ulong)
    FD_SCRATCH_ALLOC_APPEND( l, FD_SHMEM_NORMAL_PAGE_SZ, FD_IO_URING_SHMEM_HEADROOM );

  /* Completion queue (cache line align) */

  FD_SCRATCH_ALLOC_APPEND( l, 128UL, cq_depth*sizeof(struct io_uring_cqe) );

  /* Submission queue index array (cache line align) */

  FD_SCRATCH_ALLOC_APPEND( l, 128UL, sq_depth*sizeof(uint) );

  /* io_uring SQEs region */

  layout->sqe_off = (ulong)FD_SCRATCH_ALLOC_APPEND(
    l, FD_SHMEM_NORMAL_PAGE_SZ, sq_depth*sizeof(struct io_uring_sqe) );

  return FD_SCRATCH_ALLOC_FINI( l, FD_SHMEM_NORMAL_PAGE_SZ );
}

ulong
fd_io_uring_shmem_footprint( ulong sq_depth,
                             ulong cq_depth ) {
  fd_io_uring_shmem_layout_t layout[1];
  return fd_io_uring_shmem_layout( layout, sq_depth, cq_depth );
}
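
/* fd_io_uring_shmem_setup prepares params for an io_uring backed by
   caller provided memory (IORING_SETUP_NO_MMAP).  shmem must be
   aligned to fd_io_uring_shmem_align() and hold at least
   fd_io_uring_shmem_footprint( sq_depth, cq_depth ) bytes.  Returns
   params on success, NULL if the depths are invalid. */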

fd_io_uring_params_t *
fd_io_uring_shmem_setup( fd_io_uring_params_t * params,
                         void *                 shmem,
                         ulong                  sq_depth,
                         ulong                  cq_depth ) {

  fd_io_uring_shmem_layout_t layout[1];
  ulong shmem_footprint = fd_io_uring_shmem_layout( layout, sq_depth, cq_depth );
  if( FD_UNLIKELY( !shmem_footprint ) ) {
    FD_LOG_WARNING(( "invalid sq_depth (%lu) or cq_depth (%lu)", sq_depth, cq_depth ));
    return NULL;
  }

  params->flags |= IORING_SETUP_NO_MMAP;
  params->sq_entries = (uint)sq_depth;
  params->cq_entries = (uint)cq_depth;

  /* cq_off points to the region containing the kernel private io_rings
     struct, the completion queue (array of CQEs), and the submission
     queue index array (array of uints). */

  params->cq_off = (fd_io_cqring_offsets_t) {
    .user_addr = (unsigned long long)( (uchar *)shmem ),
  };

  /* sq_off points to the table of submission queue entries. */

  params->sq_off = (fd_io_sqring_offsets_t) {
    .user_addr = (unsigned long long)( (uchar *)shmem + layout->sqe_off ),
  };

  return params;
}
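
/* A minimal usage sketch for the shmem path (a sketch only: error
   handling is elided and plain aligned_alloc stands in for whatever
   allocator the caller actually uses; fd_io_uring_init_shmem below
   wraps these steps):

     ulong  footprint = fd_io_uring_shmem_footprint( 256UL, 512UL );
     void * shmem     = aligned_alloc( fd_io_uring_shmem_align(), footprint );
     memset( shmem, 0, footprint );
     fd_io_uring_params_t params = {0};
     params.flags |= IORING_SETUP_CQSIZE;  honor cq_entries
     fd_io_uring_shmem_setup( &params, shmem, 256UL, 512UL );
     int ring_fd = fd_io_uring_setup( 256U, &params );  */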

/* fd_io_uring_init_rings points the local sq/cq handles at the ring
   registers, the SQ indirection array, the CQE array, and the SQE
   array, using the offsets the kernel reported in params. */

static void
fd_io_uring_init_rings(
    fd_io_uring_sq_t *     sq,
    fd_io_uring_cq_t *     cq,
    fd_io_uring_params_t * params,
    void *                 sqe_mem,
    void *                 cq_mem
) {
  ulong sqe_laddr = (ulong)sqe_mem;
  ulong cq_laddr  = (ulong)cq_mem;

  FD_CRIT( fd_ulong_is_pow2( params->sq_entries ), "invalid params->sq_entries" );
  FD_CRIT( fd_ulong_is_pow2( params->cq_entries ), "invalid params->cq_entries" );

  *sq = (fd_io_uring_sq_t) {
    /* Confusingly, in Linux io_uring, the submission queue registers
       are located in the completion queue memory region */
    .khead    = (void *)( cq_laddr + params->sq_off.head    ),
    .ktail    = (void *)( cq_laddr + params->sq_off.tail    ),
    .kflags   = (void *)( cq_laddr + params->sq_off.flags   ),
    .kdropped = (void *)( cq_laddr + params->sq_off.dropped ),

    .array = (void *)( cq_laddr + params->sq_off.array ),
    .sqes  = (void *)( sqe_laddr ),

    .sqe_head = 0,
    .sqe_tail = 0,
    .depth    = params->sq_entries
  };

  *cq = (fd_io_uring_cq_t) {
    .depth = params->cq_entries,

    .khead     = (void *)( cq_laddr + params->cq_off.head     ),
    .ktail     = (void *)( cq_laddr + params->cq_off.tail     ),
    .koverflow = (void *)( cq_laddr + params->cq_off.overflow ),

    .cqes = (void *)( cq_laddr + params->cq_off.cqes )
  };

  /* io_uring uses a rather useless indirection table to map queue
     slots to entries; fill it with the identity mapping so slot i
     always refers to SQE i. */

  for( uint i=0; i<params->sq_entries; i++ ) {
    sq->array[ i ] = i;
  }
}

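/* fd_io_uring_init_shmem creates an io_uring whose rings live in the
   caller provided shmem region (IORING_SETUP_NO_MMAP path).  shmem
   must satisfy fd_io_uring_shmem_align() and
   fd_io_uring_shmem_footprint( sq_depth, cq_depth ).  Returns ring on
   success, NULL on failure. */
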
fd_io_uring_t *
fd_io_uring_init_shmem(
    fd_io_uring_t *        ring,
    fd_io_uring_params_t * params,
    void *                 shmem,
    ulong                  sq_depth,
    ulong                  cq_depth
) {
  memset( ring, 0, sizeof(fd_io_uring_t) );
  ring->ioring_fd = -1;

  params->flags |= IORING_SETUP_CQSIZE;
  params->sq_entries = (uint)sq_depth;
  params->cq_entries = (uint)cq_depth;

  fd_io_uring_shmem_setup( params, shmem, sq_depth, cq_depth );

  memset( shmem, 0, FD_IO_URING_SHMEM_HEADROOM );

  int ring_fd = fd_io_uring_setup( (uint)sq_depth, params );
  if( FD_UNLIKELY( ring_fd<0 ) ) return NULL;
  ring->ioring_fd = ring_fd;

  fd_io_uring_shmem_layout_t layout[1];
  fd_io_uring_shmem_layout( layout, sq_depth, cq_depth );

  fd_io_uring_init_rings(
      ring->sq,
      ring->cq,
      params,
      (void *)( (ulong)shmem + layout->sqe_off ),
      (void *)( (ulong)shmem + layout->cq_off )
  );
  return ring;
}

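/* fd_io_uring_init_mmap creates an io_uring the conventional way: the
   kernel allocates the rings and this process maps them with mmap.
   params->sq_entries / params->cq_entries select the requested depths.
   Returns ring on success, NULL on failure. */
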
fd_io_uring_t *
fd_io_uring_init_mmap(
    fd_io_uring_t *        ring,
    fd_io_uring_params_t * params
) {
  memset( ring, 0, sizeof(fd_io_uring_t) );
  ring->ioring_fd = -1;

  uint sq_depth = params->sq_entries;
  uint cq_depth = params->cq_entries;

  int ring_fd = fd_io_uring_setup( params->sq_entries, params );
  if( FD_UNLIKELY( ring_fd<0 ) ) return NULL;
  ring->ioring_fd = ring_fd;

  if( FD_UNLIKELY( params->sq_entries != sq_depth ||
                   params->cq_entries != cq_depth ) ) {
    FD_LOG_WARNING(( "io_uring setup failed: requested (sq_depth=%u, cq_depth=%u) but kernel returned (sq_depth=%u, cq_depth=%u)",
                     sq_depth, cq_depth,
                     params->sq_entries, params->cq_entries ));
    close( ring_fd );
    ring->ioring_fd = -1;
    return NULL;
  }

  ring->kern_sq_sz  = params->sq_off.array + params->sq_entries * sizeof(uint);
  ring->kern_sqe_sz = /*                  */ params->sq_entries * sizeof(struct io_uring_sqe);
  ring->kern_cq_sz  = params->cq_off.cqes  + params->cq_entries * sizeof(struct io_uring_cqe);

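  /* Map the three kernel regions the same way liburing does: the SQ
     ring (registers + indirection array), the SQE array, and the CQ
     ring (registers + CQEs), each at its fixed IORING_OFF_* offset. */
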
  ring->kern_sq_mem = mmap( NULL, ring->kern_sq_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING );
  if( FD_UNLIKELY( ring->kern_sq_mem==MAP_FAILED ) ) {
    FD_LOG_WARNING(( "mmap SQ ring failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    close( ring_fd );
    ring->ioring_fd = -1;
    return NULL;
  }

  ring->kern_sqe_mem = mmap( NULL, ring->kern_sqe_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, ring_fd, IORING_OFF_SQES );
  if( FD_UNLIKELY( ring->kern_sqe_mem==MAP_FAILED ) ) {
    /* Log before cleanup so munmap/close cannot clobber errno */
    FD_LOG_WARNING(( "mmap SQEs failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    munmap( ring->kern_sq_mem, ring->kern_sq_sz );
    close( ring_fd );
    ring->ioring_fd = -1;
    return NULL;
  }

  ring->kern_cq_mem = mmap( NULL, ring->kern_cq_sz, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING );
  if( FD_UNLIKELY( ring->kern_cq_mem==MAP_FAILED ) ) {
    /* Log before cleanup so munmap/close cannot clobber errno */
    FD_LOG_WARNING(( "mmap CQ ring failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    munmap( ring->kern_sqe_mem, ring->kern_sqe_sz );
    munmap( ring->kern_sq_mem, ring->kern_sq_sz );
    close( ring_fd );
    ring->ioring_fd = -1;
    return NULL;
  }

  fd_io_uring_init_rings(
      ring->sq,
      ring->cq,
      params,
      ring->kern_sqe_mem,
      ring->kern_cq_mem
  );

  return ring;
}

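/* fd_io_uring_fini releases all resources held by ring: unmaps any
   kernel ring mappings, closes the ring fd, and zeroes the local sq/cq
   handles.  Safe to call on a partially initialized ring.  Returns
   ring. */
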
void *
fd_io_uring_fini( fd_io_uring_t * ring ) {

  if( ring->kern_cq_mem ) {
    if( FD_UNLIKELY( munmap( ring->kern_cq_mem, ring->kern_cq_sz ) ) ) {
      FD_LOG_WARNING(( "munmap CQ ring failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    }
    ring->kern_cq_mem = NULL;
    ring->kern_cq_sz  = 0UL;
  }

  if( ring->kern_sqe_mem ) {
    if( FD_UNLIKELY( munmap( ring->kern_sqe_mem, ring->kern_sqe_sz ) ) ) {
      FD_LOG_WARNING(( "munmap SQEs failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    }
    ring->kern_sqe_mem = NULL;
    ring->kern_sqe_sz  = 0UL;
  }

  if( ring->kern_sq_mem ) {
    if( FD_UNLIKELY( munmap( ring->kern_sq_mem, ring->kern_sq_sz ) ) ) {
      FD_LOG_WARNING(( "munmap SQ ring failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    }
    ring->kern_sq_mem = NULL;
    ring->kern_sq_sz  = 0UL;
  }

  if( ring->ioring_fd>=0 ) {
    if( FD_UNLIKELY( close( ring->ioring_fd ) ) ) {
      FD_LOG_WARNING(( "close(ring_fd) failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    }
    ring->ioring_fd = -1;
  }

  memset( ring->sq, 0, sizeof(fd_io_uring_sq_t) );
  memset( ring->cq, 0, sizeof(fd_io_uring_cq_t) );

  return ring;
}
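
/* Lifecycle sketch for the mmap path (hypothetical depths, error
   handling elided):

     fd_io_uring_t        ring[1];
     fd_io_uring_params_t params = {0};
     params.sq_entries = 256U;
     params.cq_entries = 512U;
     params.flags     |= IORING_SETUP_CQSIZE;  honor cq_entries
     if( fd_io_uring_init_mmap( ring, &params ) ) {
       ... submit and complete I/O via ring->sq / ring->cq ...
       fd_io_uring_fini( ring );
     }  */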