Line data Source code
1 : #if FD_HAS_THREADS /* THREADS implies HOSTED */
2 : #define _GNU_SOURCE
3 : #endif
4 :
5 : #include "fd_shmem_private.h"
6 :
7 : /* Portable APIs */
8 :
9 : int
10 0 : fd_cstr_to_shmem_lg_page_sz( char const * cstr ) {
11 0 : if( !cstr ) return FD_SHMEM_UNKNOWN_LG_PAGE_SZ;
12 :
13 0 : if( !fd_cstr_casecmp( cstr, "normal" ) ) return FD_SHMEM_NORMAL_LG_PAGE_SZ;
14 0 : if( !fd_cstr_casecmp( cstr, "huge" ) ) return FD_SHMEM_HUGE_LG_PAGE_SZ;
15 0 : if( !fd_cstr_casecmp( cstr, "gigantic" ) ) return FD_SHMEM_GIGANTIC_LG_PAGE_SZ;
16 :
17 0 : int i = fd_cstr_to_int( cstr );
18 0 : if( i==FD_SHMEM_NORMAL_LG_PAGE_SZ ) return FD_SHMEM_NORMAL_LG_PAGE_SZ;
19 0 : if( i==FD_SHMEM_HUGE_LG_PAGE_SZ ) return FD_SHMEM_HUGE_LG_PAGE_SZ;
20 0 : if( i==FD_SHMEM_GIGANTIC_LG_PAGE_SZ ) return FD_SHMEM_GIGANTIC_LG_PAGE_SZ;
21 :
22 0 : return FD_SHMEM_UNKNOWN_LG_PAGE_SZ;
23 0 : }
24 :
25 : char const *
26 0 : fd_shmem_lg_page_sz_to_cstr( int lg_page_sz ) {
27 0 : switch( lg_page_sz ) {
28 0 : case FD_SHMEM_NORMAL_LG_PAGE_SZ: return "normal";
29 0 : case FD_SHMEM_HUGE_LG_PAGE_SZ: return "huge";
30 0 : case FD_SHMEM_GIGANTIC_LG_PAGE_SZ: return "gigantic";
31 0 : default: break;
32 0 : }
33 0 : return "unknown";
34 0 : }
35 :
36 : ulong
37 72 : fd_cstr_to_shmem_page_sz( char const * cstr ) {
38 72 : if( !cstr ) return FD_SHMEM_UNKNOWN_PAGE_SZ;
39 :
40 72 : if( !fd_cstr_casecmp( cstr, "normal" ) ) return FD_SHMEM_NORMAL_PAGE_SZ;
41 72 : if( !fd_cstr_casecmp( cstr, "huge" ) ) return FD_SHMEM_HUGE_PAGE_SZ;
42 72 : if( !fd_cstr_casecmp( cstr, "gigantic" ) ) return FD_SHMEM_GIGANTIC_PAGE_SZ;
43 :
44 0 : ulong u = fd_cstr_to_ulong( cstr );
45 0 : if( u==FD_SHMEM_NORMAL_PAGE_SZ ) return FD_SHMEM_NORMAL_PAGE_SZ;
46 0 : if( u==FD_SHMEM_HUGE_PAGE_SZ ) return FD_SHMEM_HUGE_PAGE_SZ;
47 0 : if( u==FD_SHMEM_GIGANTIC_PAGE_SZ ) return FD_SHMEM_GIGANTIC_PAGE_SZ;
48 :
49 0 : return FD_SHMEM_UNKNOWN_PAGE_SZ;
50 0 : }
51 :
52 : char const *
53 3687 : fd_shmem_page_sz_to_cstr( ulong page_sz ) {
54 3687 : switch( page_sz ) {
55 507 : case FD_SHMEM_NORMAL_PAGE_SZ: return "normal";
56 507 : case FD_SHMEM_HUGE_PAGE_SZ: return "huge";
57 2673 : case FD_SHMEM_GIGANTIC_PAGE_SZ: return "gigantic";
58 0 : default: break;
59 3687 : }
60 0 : return "unknown";
61 3687 : }
62 :
63 : #if FD_HAS_HOSTED
64 :
65 : #include <ctype.h>
66 : #include <errno.h>
67 : #include <unistd.h>
68 : #include <fcntl.h>
69 : #include <linux/mempolicy.h>
70 : #include <sys/mman.h>
71 : #include <sys/stat.h>
72 : #include <linux/mman.h>
73 :
74 : #if FD_HAS_THREADS
75 : pthread_mutex_t fd_shmem_private_lock[1];
76 : #endif
77 :
78 : char fd_shmem_private_base[ FD_SHMEM_PRIVATE_BASE_MAX ]; /* "" at thread group start, initialized at boot */
79 : ulong fd_shmem_private_base_len; /* 0UL at ", initialized at boot */
80 :
81 : /* NUMA TOPOLOGY APIS *************************************************/
82 :
83 : static ulong fd_shmem_private_numa_cnt; /* 0UL at thread group start, initialized at boot */
84 : static ulong fd_shmem_private_cpu_cnt; /* " */
85 : static ushort fd_shmem_private_numa_idx[ FD_SHMEM_CPU_MAX ]; /* " */
86 : static ushort fd_shmem_private_cpu_idx [ FD_SHMEM_NUMA_MAX ]; /* " */
87 :
/* fd_shmem_numa_cnt / fd_shmem_cpu_cnt return the number of numa nodes
   / logical cpus cached by fd_shmem_private_boot (0 before boot /
   after halt). */

ulong fd_shmem_numa_cnt( void ) { return fd_shmem_private_numa_cnt; }
ulong fd_shmem_cpu_cnt ( void ) { return fd_shmem_private_cpu_cnt; }
90 :
91 : ulong
92 1530 : fd_shmem_numa_idx( ulong cpu_idx ) {
93 1530 : if( FD_UNLIKELY( cpu_idx>=fd_shmem_private_cpu_cnt ) ) return ULONG_MAX;
94 1530 : return (ulong)fd_shmem_private_numa_idx[ cpu_idx ];
95 1530 : }
96 :
97 : ulong
98 456 : fd_shmem_cpu_idx( ulong numa_idx ) {
99 456 : if( FD_UNLIKELY( numa_idx>=fd_shmem_private_numa_cnt ) ) return ULONG_MAX;
100 456 : return (ulong)fd_shmem_private_cpu_idx[ numa_idx ];
101 456 : }
102 :
/* fd_shmem_numa_validate verifies that the page_cnt page_sz pages of
   memory at mem are all resident on the numa node that backs logical
   cpu cpu_idx.  Returns 0 on success and a positive errno compatible
   error code on failure (logs details). */

int
fd_shmem_numa_validate( void const * mem,
                        ulong        page_sz,
                        ulong        page_cnt,
                        ulong        cpu_idx ) {

  /* Check input args */

  if( FD_UNLIKELY( !mem ) ) {
    FD_LOG_WARNING(( "NULL mem" ));
    return EINVAL;
  }

  if( FD_UNLIKELY( !fd_shmem_is_page_sz( page_sz ) ) ) {
    FD_LOG_WARNING(( "bad page_sz (%lu)", page_sz ));
    return EINVAL;
  }

  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, page_sz ) ) ) {
    FD_LOG_WARNING(( "misaligned mem" ));
    return EINVAL;
  }

  /* Upper bound matches the off_t (LONG_MAX) limit used by the region
     creation paths */

  if( FD_UNLIKELY( !((1UL<=page_cnt) & (page_cnt<=(((ulong)LONG_MAX)/page_sz))) ) ) {
    FD_LOG_WARNING(( "bad page_cnt (%lu)", page_cnt ));
    return EINVAL;
  }

  if( FD_UNLIKELY( !(cpu_idx<fd_shmem_cpu_cnt()) ) ) {
    FD_LOG_WARNING(( "bad cpu_idx (%lu)", cpu_idx ));
    return EINVAL;
  }

  ulong numa_idx = fd_shmem_numa_idx( cpu_idx );

  /* Query page placement in batches of up to 512 pages.  Passing a
     NULL nodes array to move_pages puts it in query-only mode: on
     return, batch_status holds the numa node of each page or a
     negative errno for pages that could not be queried. */

  ulong page = (ulong)mem;
  int batch_status[ 512 ];
  void * batch_page [ 512 ];
  ulong batch_cnt = 0UL;
  while( page_cnt ) {
    batch_page[ batch_cnt++ ] = (void *)page;
    page += page_sz;
    page_cnt--;
    if( FD_UNLIKELY( ((batch_cnt==512UL) | (!page_cnt) ) ) ) { /* Flush when full or on the last page */
      if( FD_UNLIKELY( fd_numa_move_pages( 0, batch_cnt, batch_page, NULL, batch_status, 0 ) ) ) {
        FD_LOG_WARNING(( "fd_numa_move_pages query failed (%i-%s)", errno, fd_io_strerror( errno ) ));
        return errno;
      }
      for( ulong batch_idx=0UL; batch_idx<batch_cnt; batch_idx++ ) {
        if( FD_UNLIKELY( batch_status[batch_idx]<0 ) ) { /* Per-page query error (negated errno) */
          int err = -batch_status[batch_idx];
          FD_LOG_WARNING(( "page status failed (%i-%s)", err, fd_io_strerror( err ) ));
          return err;
        }
        if( FD_UNLIKELY( batch_status[batch_idx]!=(int)numa_idx ) ) { /* Page resident on the wrong node */
          FD_LOG_WARNING(( "page allocated to numa %i instead of numa %lu", batch_status[batch_idx], numa_idx ));
          return EFAULT;
        }
      }
      batch_cnt = 0UL;
    }
  }

  return 0;
}
165 :
166 : /* SHMEM REGION CREATION AND DESTRUCTION ******************************/
167 :
/* fd_shmem_create_multi_flags creates (open_flags O_RDWR|O_CREAT|O_EXCL)
   or re-backs an existing (open_flags O_RDWR) named page_sz backed
   shmem region in the hugetlbfs mount, composed of sub_cnt subregions
   where subregion i has _sub_page_cnt[i] pages bound to the numa node
   of logical cpu _sub_cpu_idx[i].  mode gives the permissions for a
   newly created region file.  Returns 0 on success and a positive
   errno compatible error code on failure (logs details).  On failure
   in create mode, the partially created file is unlinked. */

static int
fd_shmem_create_multi_flags( char const *  name,
                             ulong         page_sz,
                             ulong         sub_cnt,
                             ulong const * _sub_page_cnt,
                             ulong const * _sub_cpu_idx,
                             ulong         mode,
                             int           open_flags ) {

  /* Check input args */

  if( FD_UNLIKELY( !fd_shmem_name_len( name ) ) ) { FD_LOG_WARNING(( "bad name (%s)", name ? name : "NULL" )); return EINVAL; }

  if( FD_UNLIKELY( !fd_shmem_is_page_sz( page_sz ) ) ) { FD_LOG_WARNING(( "bad page_sz (%lu)", page_sz )); return EINVAL; }

  if( FD_UNLIKELY( !sub_cnt ) ) { FD_LOG_WARNING(( "zero sub_cnt" )); return EINVAL; }
  if( FD_UNLIKELY( !_sub_page_cnt ) ) { FD_LOG_WARNING(( "NULL sub_page_cnt" )); return EINVAL; }
  if( FD_UNLIKELY( !_sub_cpu_idx ) ) { FD_LOG_WARNING(( "NULL sub_cpu_idx" )); return EINVAL; }

  ulong cpu_cnt = fd_shmem_cpu_cnt();

  /* Total the subregion page counts (with overflow detection) and
     validate the target cpus */

  ulong page_cnt = 0UL;
  for( ulong sub_idx=0UL; sub_idx<sub_cnt; sub_idx++ ) {
    ulong sub_page_cnt = _sub_page_cnt[ sub_idx ];
    if( FD_UNLIKELY( !sub_page_cnt ) ) continue; /* Skip over empty subregions */

    page_cnt += sub_page_cnt;
    if( FD_UNLIKELY( page_cnt<sub_page_cnt ) ) { /* Accumulator wrapped */
      FD_LOG_WARNING(( "sub[%lu] sub page_cnt overflow (page_cnt %lu, sub_page_cnt %lu)",
                       sub_idx, page_cnt-sub_page_cnt, sub_page_cnt ));
      return EINVAL;
    }

    ulong sub_cpu_idx = _sub_cpu_idx[ sub_idx ];
    if( FD_UNLIKELY( sub_cpu_idx>=cpu_cnt ) ) {
      FD_LOG_WARNING(( "sub[%lu] bad cpu_idx (%lu)", sub_idx, sub_cpu_idx ));
      return EINVAL;
    }
  }

  if( FD_UNLIKELY( !((1UL<=page_cnt) & (page_cnt<=(((ulong)LONG_MAX)/page_sz))) ) ) { /* LONG_MAX from off_t */
    FD_LOG_WARNING(( "bad total page_cnt (%lu)", page_cnt ));
    return EINVAL;
  }

  /* mode must survive the round trip through the narrower mode_t */

  if( FD_UNLIKELY( mode!=(ulong)(mode_t)mode ) ) { FD_LOG_WARNING(( "bad mode (0%03lo)", mode )); return EINVAL; }

  /* We use the FD_SHMEM_LOCK in create just to be safe given some
     thread safety ambiguities in the documentation for some of the
     below APIs. */

  FD_SHMEM_LOCK;

  int err;

# define ERROR( cleanup ) do { err = errno; goto cleanup; } while(0)

  int orig_mempolicy;
  ulong orig_nodemask[ (FD_SHMEM_NUMA_MAX+63UL)/64UL ];
  char path[ FD_SHMEM_PRIVATE_PATH_BUF_MAX ];
  int fd;
  void * shmem;

  ulong sz = page_cnt*page_sz;

  /* Save this thread's numa node mempolicy (restored at exit via the
     restore label below) */

  if( FD_UNLIKELY( fd_numa_get_mempolicy( &orig_mempolicy, orig_nodemask, FD_SHMEM_NUMA_MAX, NULL, 0UL ) ) ) {
    FD_LOG_WARNING(( "fd_numa_get_mempolicy failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    ERROR( done );
  }

  /* Create the region */

  fd = open( fd_shmem_private_path( name, page_sz, path ), open_flags, (mode_t)mode );
  if( FD_UNLIKELY( fd==-1 ) ) {
    FD_LOG_WARNING(( "open(\"%s\",%#x,0%03lo) failed (%i-%s)", path, (uint)open_flags, mode, errno, fd_io_strerror( errno ) ));
    ERROR( restore );
  }

  /* Size the region */

  if( FD_UNLIKELY( ftruncate( fd, (off_t)sz ) ) ) {
    FD_LOG_WARNING(( "ftruncate(\"%s\",%lu KiB) failed (%i-%s)", path, sz>>10, errno, fd_io_strerror( errno ) ));
    ERROR( close );
  }

  /* Map the region into our address space. */

  shmem = mmap( NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, (off_t)0);
  if( FD_UNLIKELY( shmem==MAP_FAILED ) ) {
    FD_LOG_WARNING(( "mmap(NULL,%lu KiB,PROT_READ|PROT_WRITE,MAP_SHARED,\"%s\",0) failed (%i-%s)",
                     sz>>10, path, errno, fd_io_strerror( errno ) ));
    ERROR( close );
  }

  /* Validate the mapping */

  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, page_sz ) ) ) {
    FD_LOG_WARNING(( "misaligned memory mapping for \"%s\"\n\t"
                     "This thread group's hugetlbfs mount path (--shmem-path / FD_SHMEM_PATH):\n\t"
                     "\t%s\n\t"
                     "has probably been corrupted and needs to be redone.\n\t"
                     "See 'bin/fd_shmem_cfg help' for more information.",
                     path, fd_shmem_private_base ));
    errno = EFAULT; /* ENOMEM is arguable */
    ERROR( unmap );
  }

  /* For each subregion */

  uchar * sub_shmem = (uchar *)shmem;
  for( ulong sub_idx=0UL; sub_idx<sub_cnt; sub_idx++ ) {
    ulong sub_page_cnt = _sub_page_cnt[ sub_idx ];
    if( FD_UNLIKELY( !sub_page_cnt ) ) continue; /* Skip over empty sub-regions */

    ulong sub_sz       = sub_page_cnt*page_sz;
    ulong sub_cpu_idx  = _sub_cpu_idx[ sub_idx ];
    ulong sub_numa_idx = fd_shmem_numa_idx( sub_cpu_idx );

    ulong nodemask[ (FD_SHMEM_NUMA_MAX+63UL)/64UL ];

    /* Set the mempolicy to bind newly allocated memory to the numa idx
       corresponding to logical cpu cpu_idx.  This should force page
       allocation to be on the desired numa node, keeping our fingers
       crossed that even the ftruncate / mmap above did not trigger
       this; it doesn't seem too, even when the user's thread group has
       configured things like mlockall(MCL_CURRENT | MCL_FUTURE ).
       Theoretically, the fd_numa_mbind below should do it without this
       but the Linux kernel tends to view requests to move pages between
       numa nodes after allocation as for entertainment purposes only. */

    fd_memset( nodemask, 0, 8UL*((FD_SHMEM_NUMA_MAX+63UL)/64UL) );
    nodemask[ sub_numa_idx >> 6 ] = 1UL << (sub_numa_idx & 63UL);

    if( FD_UNLIKELY( fd_numa_set_mempolicy( MPOL_BIND | MPOL_F_STATIC_NODES, nodemask, FD_SHMEM_NUMA_MAX ) ) ) {
      FD_LOG_WARNING(( "fd_numa_set_mempolicy failed (%i-%s)", errno, fd_io_strerror( errno ) ));
      ERROR( unmap );
    }

    /* If a mempolicy has been set and the numa_idx node does not have
       sufficient pages to back the mapping, touching the memory will
       trigger a SIGBUS when it touches the first part of the mapping
       for which there are no pages.  Unfortunately, mmap will only
       error if there are insufficient pages across all NUMA nodes (even
       if using mlockall( MCL_FUTURE ) or passing MAP_POPULATE), so we
       need to check that the mapping can be backed without handling
       signals.

       So we mlock the subregion to force the region to be backed by
       pages now.  The subregion should be backed by page_sz pages
       (thanks to the hugetlbfs configuration) and should be on the
       correct NUMA node (thanks to the mempolicy above).  Specifically,
       mlock will error with ENOMEM if there were insufficient pages
       available.  mlock guarantees that if it succeeds, the mapping has
       been fully backed by pages and these pages will remain resident
       in DRAM at least until the mapping is closed.  We can then
       proceed as usual without the risk of meeting SIGBUS or its
       friends. */

    if( FD_UNLIKELY( fd_numa_mlock( sub_shmem, sub_sz ) ) ) {
      FD_LOG_WARNING(( "sub[%lu]: fd_numa_mlock(\"%s\",%lu KiB) failed (%i-%s)",
                       sub_idx, path, sub_sz>>10, errno, fd_io_strerror( errno ) ));
      ERROR( unmap );
    }

    /* At this point all pages in this subregion should be allocated on
       the right NUMA node and resident in DRAM.  But in the spirit of
       not trusting Linux to get this right robustly, we continue with
       touching pages from cpu_idx. */

    /* FIXME: NUMA TOUCH HERE (ALSO WOULD A LOCAL TOUCH WORK GIVEN THE
       MEMPOLICY DONE ABOVE?) */

    /* fd_numa_mbind the memory subregion to this numa node to nominally
       stay put after we unmap it.  We recompute the nodemask to be on
       the safe side in case set mempolicy above clobbered it. */

    fd_memset( nodemask, 0, 8UL*((FD_SHMEM_NUMA_MAX+63UL)/64UL) );
    nodemask[ sub_numa_idx >> 6 ] = 1UL << (sub_numa_idx & 63UL);

    if( FD_UNLIKELY( fd_numa_mbind( sub_shmem, sub_sz, MPOL_BIND, nodemask, FD_SHMEM_NUMA_MAX, MPOL_MF_MOVE|MPOL_MF_STRICT ) ) ) {
      FD_LOG_WARNING(( "sub[%lu]: fd_numa_mbind(\"%s\",%lu KiB,MPOL_BIND,1UL<<%lu,MPOL_MF_MOVE|MPOL_MF_STRICT) failed (%i-%s)",
                       sub_idx, path, sub_sz>>10, sub_numa_idx, errno, fd_io_strerror( errno ) ));
      ERROR( unmap );
    }

    /* And since the fd_numa_mbind still often will ignore requests, we
       double check that the pages are in the right place.  A wrong
       placement here is only a warning, not a failure. */

    int warn = fd_shmem_numa_validate( sub_shmem, page_sz, sub_page_cnt, sub_cpu_idx ); /* logs details */
    if( FD_UNLIKELY( warn ) )
      FD_LOG_WARNING(( "sub[%lu]: mmap(NULL,%lu KiB,PROT_READ|PROT_WRITE,MAP_SHARED,\"%s\",0) numa binding failed (%i-%s)",
                       sub_idx, sub_sz>>10, path, warn, fd_io_strerror( warn ) ));

    sub_shmem += sub_sz;
  }

  err = 0;

# undef ERROR

  /* Unwind: the local mapping is always torn down (the region persists
     in the hugetlbfs mount); the file is unlinked only on failure. */

unmap:
  if( FD_UNLIKELY( munmap( shmem, sz ) ) )
    FD_LOG_ERR(( "munmap(\"%s\",%lu KiB) failed (%i-%s)",
                 path, sz>>10, errno, fd_io_strerror( errno ) ));

close:
  if( FD_UNLIKELY( err ) && FD_UNLIKELY( unlink( path ) ) )
    FD_LOG_ERR(( "unlink(\"%s\") failed (%i-%s)", path, errno, fd_io_strerror( errno ) ));
  if( FD_UNLIKELY( close( fd ) ) )
    FD_LOG_ERR(( "close(\"%s\") failed (%i-%s)", path, errno, fd_io_strerror( errno ) ));

restore:
  if( FD_UNLIKELY( fd_numa_set_mempolicy( orig_mempolicy, orig_nodemask, FD_SHMEM_NUMA_MAX ) ) )
    FD_LOG_ERR(( "fd_numa_set_mempolicy failed (%i-%s)", errno, fd_io_strerror( errno ) ));

done:
  FD_SHMEM_UNLOCK;
  return err;
}
389 :
390 : int
391 : fd_shmem_create_multi( char const * name,
392 : ulong page_sz,
393 : ulong sub_cnt,
394 : ulong const * _sub_page_cnt,
395 : ulong const * _sub_cpu_idx,
396 27 : ulong mode ) {
397 27 : return fd_shmem_create_multi_flags( name, page_sz, sub_cnt, _sub_page_cnt, _sub_cpu_idx, mode, O_RDWR | O_CREAT | O_EXCL );
398 27 : }
399 :
400 : int
401 : fd_shmem_update_multi( char const * name,
402 : ulong page_sz,
403 : ulong sub_cnt,
404 : ulong const * _sub_page_cnt,
405 : ulong const * _sub_cpu_idx,
406 0 : ulong mode ) {
407 0 : return fd_shmem_create_multi_flags( name, page_sz, sub_cnt, _sub_page_cnt, _sub_cpu_idx, mode, O_RDWR );
408 0 : }
409 :
410 : int
411 : fd_shmem_unlink( char const * name,
412 21 : ulong page_sz ) {
413 21 : char path[ FD_SHMEM_PRIVATE_PATH_BUF_MAX ];
414 :
415 : /* Check input args */
416 :
417 21 : if( FD_UNLIKELY( !fd_shmem_name_len( name ) ) ) { FD_LOG_WARNING(( "bad name (%s)", name ? name : "NULL" )); return EINVAL; }
418 :
419 21 : if( FD_UNLIKELY( !fd_shmem_is_page_sz( page_sz ) ) ) { FD_LOG_WARNING(( "bad page_sz (%lu)", page_sz )); return EINVAL; }
420 :
421 : /* Unlink the name */
422 :
423 21 : if( FD_UNLIKELY( unlink( fd_shmem_private_path( name, page_sz, path ) ) ) ) {
424 0 : FD_LOG_WARNING(( "unlink(\"%s\") failed (%i-%s)", path, errno, fd_io_strerror( errno ) ));
425 0 : return errno;
426 0 : }
427 :
428 21 : return 0;
429 21 : }
430 :
/* fd_shmem_info returns 0 if the page_sz backed shmem region name
   exists and, if opt_info is non-NULL, fills *opt_info with the
   region's page_sz and page_cnt.  Returns an errno compatible error
   code otherwise (no logging on simple non-existence so this can be
   used as an existence test).  A page_sz of 0 probes the gigantic,
   huge and then normal page backings in turn. */

int
fd_shmem_info( char const *      name,
               ulong             page_sz,
               fd_shmem_info_t * opt_info ) {

  if( FD_UNLIKELY( !fd_shmem_name_len( name ) ) ) { FD_LOG_WARNING(( "bad name (%s)", name ? name : "NULL" )); return EINVAL; }

  if( !page_sz ) { /* Wildcard: try each supported backing, largest first */
    if( !fd_shmem_info( name, FD_SHMEM_GIGANTIC_PAGE_SZ, opt_info ) ) return 0;
    if( !fd_shmem_info( name, FD_SHMEM_HUGE_PAGE_SZ,     opt_info ) ) return 0;
    if( !fd_shmem_info( name, FD_SHMEM_NORMAL_PAGE_SZ,   opt_info ) ) return 0;
    return ENOENT;
  }

  if( FD_UNLIKELY( !fd_shmem_is_page_sz( page_sz ) ) ) { FD_LOG_WARNING(( "bad page_sz (%lu)", page_sz )); return EINVAL; }

  char path[ FD_SHMEM_PRIVATE_PATH_BUF_MAX ];
  int fd = open( fd_shmem_private_path( name, page_sz, path ), O_RDONLY, (mode_t)0 );
  if( FD_UNLIKELY( fd==-1 ) ) return errno; /* no logging here as this might be an existence check */

  /* Region file exists; derive page_cnt from its size */

  struct stat stat[1];
  if( FD_UNLIKELY( fstat( fd, stat ) ) ) {
    FD_LOG_WARNING(( "fstat failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    int err = errno; /* Save errno before close can clobber it */
    if( FD_UNLIKELY( close( fd ) ) )
      FD_LOG_WARNING(( "close(\"%s\") failed (%i-%s); attempting to continue", path, errno, fd_io_strerror( errno ) ));
    return err;
  }

  ulong sz = (ulong)stat->st_size;
  if( FD_UNLIKELY( !fd_ulong_is_aligned( sz, page_sz ) ) ) { /* Size must be a whole number of pages */
    FD_LOG_WARNING(( "\"%s\" size (%lu) not a page size (%lu) multiple\n\t"
                     "This thread group's hugetlbfs mount path (--shmem-path / FD_SHMEM_PATH):\n\t"
                     "\t%s\n\t"
                     "has probably been corrupted and needs to be redone.\n\t"
                     "See 'bin/fd_shmem_cfg help' for more information.",
                     path, sz, page_sz, fd_shmem_private_base ));
    if( FD_UNLIKELY( close( fd ) ) )
      FD_LOG_WARNING(( "close(\"%s\") failed (%i-%s); attempting to continue", path, errno, fd_io_strerror( errno ) ));
    return EFAULT;
  }
  ulong page_cnt = sz / page_sz;

  if( FD_UNLIKELY( close( fd ) ) )
    FD_LOG_WARNING(( "close(\"%s\") failed (%i-%s); attempting to continue", path, errno, fd_io_strerror( errno ) ));

  if( opt_info ) {
    opt_info->page_sz  = page_sz;
    opt_info->page_cnt = page_cnt;
  }
  return 0;
}
483 :
484 : /* RAW PAGE ALLOCATION APIS *******************************************/
485 :
/* fd_shmem_acquire_multi acquires an anonymous (non-named) mapping of
   sub_cnt contiguous subregions of page_sz pages, where subregion i
   has _sub_page_cnt[i] pages bound to the numa node of logical cpu
   _sub_cpu_idx[i].  Returns the page_sz aligned mapping on success and
   NULL on failure (logs details).  Release with fd_shmem_release. */

void *
fd_shmem_acquire_multi( ulong         page_sz,
                        ulong         sub_cnt,
                        ulong const * _sub_page_cnt,
                        ulong const * _sub_cpu_idx ) {

  /* Check input args */

  if( FD_UNLIKELY( !fd_shmem_is_page_sz( page_sz ) ) ) { FD_LOG_WARNING(( "bad page_sz (%lu)", page_sz )); return NULL; }

  if( FD_UNLIKELY( !sub_cnt ) ) { FD_LOG_WARNING(( "zero sub_cnt" )); return NULL; }
  if( FD_UNLIKELY( !_sub_page_cnt ) ) { FD_LOG_WARNING(( "NULL sub_page_cnt" )); return NULL; }
  if( FD_UNLIKELY( !_sub_cpu_idx ) ) { FD_LOG_WARNING(( "NULL sub_cpu_idx" )); return NULL; }

  ulong cpu_cnt = fd_shmem_cpu_cnt();

  /* Total the subregion page counts (with overflow detection) and
     validate the target cpus */

  ulong page_cnt = 0UL;
  for( ulong sub_idx=0UL; sub_idx<sub_cnt; sub_idx++ ) {
    ulong sub_page_cnt = _sub_page_cnt[ sub_idx ];
    if( FD_UNLIKELY( !sub_page_cnt ) ) continue; /* Skip over empty subregions */

    page_cnt += sub_page_cnt;
    if( FD_UNLIKELY( page_cnt<sub_page_cnt ) ) { /* Accumulator wrapped */
      FD_LOG_WARNING(( "sub[%lu] sub page_cnt overflow (page_cnt %lu, sub_page_cnt %lu)",
                       sub_idx, page_cnt-sub_page_cnt, sub_page_cnt ));
      return NULL;
    }

    ulong sub_cpu_idx = _sub_cpu_idx[ sub_idx ];
    if( FD_UNLIKELY( sub_cpu_idx>=cpu_cnt ) ) {
      FD_LOG_WARNING(( "sub[%lu] bad cpu_idx (%lu)", sub_idx, sub_cpu_idx ));
      return NULL;
    }
  }

  if( FD_UNLIKELY( !((1UL<=page_cnt) & (page_cnt<=(((ulong)LONG_MAX)/page_sz))) ) ) { /* LONG_MAX from off_t */
    FD_LOG_WARNING(( "bad total page_cnt (%lu)", page_cnt ));
    return NULL;
  }

  /* Select hugetlb backing flags to match the requested page size */

  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
  if( page_sz==FD_SHMEM_HUGE_PAGE_SZ ) flags |= (int)MAP_HUGETLB | (int)MAP_HUGE_2MB;
  if( page_sz==FD_SHMEM_GIGANTIC_PAGE_SZ ) flags |= (int)MAP_HUGETLB | (int)MAP_HUGE_1GB;

  /* See fd_shmem_create_multi for details on the locking, mempolicy
     and what not tricks */

  FD_SHMEM_LOCK;

  int err;

# define ERROR( cleanup ) do { err = errno; goto cleanup; } while(0)

  int orig_mempolicy;
  ulong orig_nodemask[ (FD_SHMEM_NUMA_MAX+63UL)/64UL ];
  void * mem = NULL;

  ulong sz = page_cnt*page_sz;

  /* Save this thread's numa node mempolicy (restored at exit) */

  if( FD_UNLIKELY( fd_numa_get_mempolicy( &orig_mempolicy, orig_nodemask, FD_SHMEM_NUMA_MAX, NULL, 0UL ) ) ) {
    FD_LOG_WARNING(( "fd_numa_get_mempolicy failed (%i-%s)", errno, fd_io_strerror( errno ) ));
    ERROR( done );
  }

  mem = mmap( NULL, sz, PROT_READ | PROT_WRITE, flags, -1, (off_t)0);
  if( FD_UNLIKELY( mem==MAP_FAILED ) ) {
    FD_LOG_WARNING(( "mmap(NULL,%lu KiB,PROT_READ|PROT_WRITE,%x,-1,0) failed (%i-%s)",
                     sz>>10, (uint)flags, errno, fd_io_strerror( errno ) ));
    ERROR( restore );
  }

  if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, page_sz ) ) ) {
    FD_LOG_WARNING(( "mmap(NULL,%lu KiB,PROT_READ|PROT_WRITE,%x,-1,0) misaligned", sz>>10, (uint)flags ));
    errno = EFAULT; /* ENOMEM is arguable */
    ERROR( unmap );
  }

  /* For each subregion: bind, back (mlock), pin (mbind) and verify
     placement, exactly as in fd_shmem_create_multi_flags */

  uchar * sub_mem = (uchar *)mem;
  for( ulong sub_idx=0UL; sub_idx<sub_cnt; sub_idx++ ) {
    ulong sub_page_cnt = _sub_page_cnt[ sub_idx ];
    if( FD_UNLIKELY( !sub_page_cnt ) ) continue;

    ulong sub_sz       = sub_page_cnt*page_sz;
    ulong sub_cpu_idx  = _sub_cpu_idx[ sub_idx ];
    ulong sub_numa_idx = fd_shmem_numa_idx( sub_cpu_idx );

    ulong nodemask[ (FD_SHMEM_NUMA_MAX+63UL)/64UL ];

    fd_memset( nodemask, 0, 8UL*((FD_SHMEM_NUMA_MAX+63UL)/64UL) );
    nodemask[ sub_numa_idx >> 6 ] = 1UL << (sub_numa_idx & 63UL);

    if( FD_UNLIKELY( fd_numa_set_mempolicy( MPOL_BIND | MPOL_F_STATIC_NODES, nodemask, FD_SHMEM_NUMA_MAX ) ) ) {
      FD_LOG_WARNING(( "fd_numa_set_mempolicy failed (%i-%s)", errno, fd_io_strerror( errno ) ));
      ERROR( unmap );
    }

    if( FD_UNLIKELY( fd_numa_mlock( sub_mem, sub_sz ) ) ) {
      FD_LOG_WARNING(( "sub[%lu]: fd_numa_mlock(anon,%lu KiB) failed (%i-%s)",
                       sub_idx, sub_sz>>10, errno, fd_io_strerror( errno ) ));
      ERROR( unmap );
    }

    /* FIXME: NUMA TOUCH HERE (ALSO WOULD A LOCAL TOUCH WORK GIVEN THE
       MEMPOLICY DONE ABOVE?) */

    /* Recompute the nodemask in case set_mempolicy clobbered it */

    fd_memset( nodemask, 0, 8UL*((FD_SHMEM_NUMA_MAX+63UL)/64UL) );
    nodemask[ sub_numa_idx >> 6 ] = 1UL << (sub_numa_idx & 63UL);

    if( FD_UNLIKELY( fd_numa_mbind( sub_mem, sub_sz, MPOL_BIND, nodemask, FD_SHMEM_NUMA_MAX, MPOL_MF_MOVE|MPOL_MF_STRICT ) ) ) {
      FD_LOG_WARNING(( "sub[%lu]: fd_numa_mbind(anon,%lu KiB,MPOL_BIND,1UL<<%lu,MPOL_MF_MOVE|MPOL_MF_STRICT) failed (%i-%s)",
                       sub_idx, sub_sz>>10, sub_numa_idx, errno, fd_io_strerror( errno ) ));
      ERROR( unmap );
    }

    /* Wrong placement is only a warning, not a failure */

    int warn = fd_shmem_numa_validate( sub_mem, page_sz, sub_page_cnt, sub_cpu_idx ); /* logs details */
    if( FD_UNLIKELY( warn ) )
      FD_LOG_WARNING(( "sub[%lu]: mmap(NULL,%lu KiB,PROT_READ|PROT_WRITE,%x,-1,0) numa binding failed (%i-%s)",
                       sub_idx, sub_sz>>10, (uint)flags, warn, fd_io_strerror( warn ) ));

    sub_mem += sub_sz;
  }

  err = 0;

# undef ERROR

  /* Unwind: unlike the named create path, the mapping is only torn
     down on failure (on success it is the caller's to release) */

unmap:
  if( FD_UNLIKELY( err ) && FD_UNLIKELY( munmap( mem, sz ) ) )
    FD_LOG_WARNING(( "munmap(anon,%lu KiB) failed (%i-%s); attempting to continue",
                     sz>>10, errno, fd_io_strerror( errno ) ));

restore:
  if( FD_UNLIKELY( fd_numa_set_mempolicy( orig_mempolicy, orig_nodemask, FD_SHMEM_NUMA_MAX ) ) )
    FD_LOG_WARNING(( "fd_numa_set_mempolicy failed (%i-%s); attempting to continue", errno, fd_io_strerror( errno ) ));

done:
  FD_SHMEM_UNLOCK;
  return err ? NULL : mem;
}
625 :
626 : int
627 : fd_shmem_release( void * mem,
628 : ulong page_sz,
629 210 : ulong page_cnt ) {
630 210 : if( FD_UNLIKELY( !mem ) ) {
631 0 : FD_LOG_WARNING(( "NULL mem" ));
632 0 : return -1;
633 0 : }
634 :
635 210 : if( FD_UNLIKELY( !fd_shmem_is_page_sz( page_sz ) ) ) {
636 0 : FD_LOG_WARNING(( "bad page_sz (%lu)", page_sz ));
637 0 : return -1;
638 0 : }
639 :
640 210 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)mem, page_sz ) ) ) {
641 0 : FD_LOG_WARNING(( "misaligned mem" ));
642 0 : return -1;
643 0 : }
644 :
645 210 : if( FD_UNLIKELY( !((1UL<=page_cnt) & (page_cnt<=(((ulong)LONG_MAX)/page_sz))) ) ) {
646 0 : FD_LOG_WARNING(( "bad page_cnt (%lu)", page_cnt ));
647 0 : return -1;
648 0 : }
649 :
650 210 : ulong sz = page_sz*page_cnt;
651 :
652 210 : int result = munmap( mem, sz );
653 210 : if( FD_UNLIKELY( result ) )
654 0 : FD_LOG_WARNING(( "munmap(anon,%lu KiB) failed (%i-%s); attempting to continue", sz>>10, errno, fd_io_strerror( errno ) ));
655 :
656 210 : return result;
657 210 : }
658 :
659 : /* SHMEM PARSING APIS *************************************************/
660 :
661 : ulong
662 117270 : fd_shmem_name_len( char const * name ) {
663 117270 : if( FD_UNLIKELY( !name ) ) return 0UL; /* NULL name */
664 :
665 117249 : ulong len = 0UL;
666 633549 : while( FD_LIKELY( len<FD_SHMEM_NAME_MAX ) ) {
667 633549 : char c = name[len];
668 633549 : if( FD_UNLIKELY( !c ) ) break;
669 516303 : if( FD_UNLIKELY( !( (!!isalnum( c )) | ((len>0UL) & ((c=='_') | (c=='-') | (c=='.'))) ) ) ) return 0UL; /* Bad character */
670 516300 : len++;
671 516300 : }
672 :
673 117246 : if( FD_UNLIKELY( !len ) ) return 0UL; /* Name too short (empty string) */
674 117246 : if( FD_UNLIKELY( len>=FD_SHMEM_NAME_MAX ) ) return 0UL; /* Name too long */
675 117246 : return len;
676 117246 : }
677 :
678 : /* BOOT/HALT APIs *****************************************************/
679 :
680 : void
681 : fd_shmem_private_boot( int * pargc,
682 1152 : char *** pargv ) {
683 1152 : FD_LOG_INFO(( "fd_shmem: booting" ));
684 :
685 : /* Initialize the phtread mutex */
686 :
687 1152 : # if FD_HAS_THREADS
688 1152 : pthread_mutexattr_t lockattr[1];
689 :
690 1152 : if( FD_UNLIKELY( pthread_mutexattr_init( lockattr ) ) )
691 0 : FD_LOG_ERR(( "fd_shmem: pthread_mutexattr_init failed" ));
692 :
693 1152 : if( FD_UNLIKELY( pthread_mutexattr_settype( lockattr, PTHREAD_MUTEX_RECURSIVE ) ) )
694 0 : FD_LOG_ERR(( "fd_shmem: pthread_mutexattr_settype failed" ));
695 :
696 1152 : if( FD_UNLIKELY( pthread_mutex_init( fd_shmem_private_lock, lockattr ) ) )
697 0 : FD_LOG_ERR(( "fd_shmem: pthread_mutex_init failed" ));
698 :
699 1152 : if( FD_UNLIKELY( pthread_mutexattr_destroy( lockattr ) ) )
700 0 : FD_LOG_WARNING(( "fd_shmem: pthread_mutexattr_destroy failed; attempting to continue" ));
701 1152 : # endif /* FD_HAS_THREADS */
702 :
703 : /* Cache the numa topology for this thread group's host for
704 : subsequent fast use by the application. */
705 :
706 1152 : ulong numa_cnt = fd_numa_node_cnt();
707 1152 : if( FD_UNLIKELY( !((1UL<=numa_cnt) & (numa_cnt<=FD_SHMEM_NUMA_MAX)) ) )
708 0 : FD_LOG_ERR(( "fd_shmem: unexpected numa_cnt %lu (expected in [1,%lu])", numa_cnt, FD_SHMEM_NUMA_MAX ));
709 1152 : fd_shmem_private_numa_cnt = numa_cnt;
710 :
711 1152 : ulong cpu_cnt = fd_numa_cpu_cnt();
712 1152 : if( FD_UNLIKELY( !((1UL<=cpu_cnt) & (cpu_cnt<=FD_SHMEM_CPU_MAX)) ) )
713 0 : FD_LOG_ERR(( "fd_shmem: unexpected cpu_cnt %lu (expected in [1,%lu])", cpu_cnt, FD_SHMEM_CPU_MAX ));
714 1152 : fd_shmem_private_cpu_cnt = cpu_cnt;
715 :
716 74880 : for( ulong cpu_rem=cpu_cnt; cpu_rem; cpu_rem-- ) {
717 73728 : ulong cpu_idx = cpu_rem-1UL;
718 73728 : ulong numa_idx = fd_numa_node_idx( cpu_idx );
719 73728 : if( FD_UNLIKELY( numa_idx>=FD_SHMEM_NUMA_MAX) )
720 0 : FD_LOG_ERR(( "fd_shmem: unexpected numa idx (%lu) for cpu idx %lu", numa_idx, cpu_idx ));
721 73728 : fd_shmem_private_numa_idx[ cpu_idx ] = (ushort)numa_idx;
722 73728 : fd_shmem_private_cpu_idx [ numa_idx ] = (ushort)cpu_idx;
723 73728 : }
724 :
725 : /* Determine the shared memory domain for this thread group */
726 :
727 1152 : char const * shmem_base = fd_env_strip_cmdline_cstr( pargc, pargv, "--shmem-path", "FD_SHMEM_PATH", "/mnt/.fd" );
728 :
729 1152 : ulong len = strlen( shmem_base );
730 1152 : while( (len>1UL) && (shmem_base[len-1UL]=='/') ) len--; /* lop off any trailing slashes */
731 1152 : if( FD_UNLIKELY( !len ) ) FD_LOG_ERR(( "Too short --shmem-base" ));
732 1152 : if( FD_UNLIKELY( len>=FD_SHMEM_PRIVATE_BASE_MAX ) ) FD_LOG_ERR(( "Too long --shmem-base" ));
733 1152 : fd_memcpy( fd_shmem_private_base, shmem_base, len );
734 1152 : fd_shmem_private_base[len] = '\0';
735 1152 : fd_shmem_private_base_len = (ulong)len;
736 :
737 : /* At this point, shared memory is online */
738 :
739 1152 : FD_LOG_INFO(( "fd_shmem: --shmem-path %s", fd_shmem_private_base ));
740 1152 : FD_LOG_INFO(( "fd_shmem: boot success" ));
741 1152 : }
742 :
743 : void
744 1140 : fd_shmem_private_halt( void ) {
745 1140 : FD_LOG_INFO(( "fd_shmem: halting" ));
746 :
747 : /* At this point, shared memory is offline */
748 :
749 1140 : fd_shmem_private_numa_cnt = 0;
750 1140 : fd_shmem_private_cpu_cnt = 0;
751 1140 : fd_memset( fd_shmem_private_numa_idx, 0, FD_SHMEM_CPU_MAX );
752 :
753 1140 : fd_shmem_private_base[0] = '\0';
754 1140 : fd_shmem_private_base_len = 0UL;
755 :
756 1140 : # if FD_HAS_THREADS
757 1140 : if( FD_UNLIKELY( pthread_mutex_destroy( fd_shmem_private_lock ) ) )
758 0 : FD_LOG_WARNING(( "fd_shmem: pthread_mutex_destroy failed; attempting to continue" ));
759 1140 : # endif /* FD_HAS_THREADS */
760 :
761 1140 : FD_LOG_INFO(( "fd_shmem: halt success" ));
762 1140 : }
763 :
764 : #else /* unhosted */
765 :
void
fd_shmem_private_boot( int *    pargc,
                       char *** pargv ) {

  /* Unhosted boot: there is no shared memory support here; we still
     strip the command line so that argument parsing downstream is
     identical regardless of platform. */

  FD_LOG_INFO(( "fd_shmem: booting" ));

  char const * ignored = fd_env_strip_cmdline_cstr( pargc, pargv, "--shmem-path", "FD_SHMEM_PATH", "/mnt/.fd" );
  (void)ignored;

  FD_LOG_INFO(( "fd_shmem: --shmem-path (ignored)" ));
  FD_LOG_INFO(( "fd_shmem: boot success" ));
}
779 :
/* Unhosted halt: nothing was brought online at boot so there is
   nothing to tear down; just log for symmetry with the hosted path. */

void
fd_shmem_private_halt( void ) {
  FD_LOG_INFO(( "fd_shmem: halting" ));
  FD_LOG_INFO(( "fd_shmem: halt success" ));
}
785 :
786 : #endif
|