Line data Source code
1 : /* syscall API requires _GNU_SOURCE */
2 : #define _GNU_SOURCE
3 : #include "fd_shmem_private.h"
4 : #include "../sanitize/fd_msan.h"
5 : #include <errno.h>
6 : #include <dirent.h>
7 : #include <sys/sysinfo.h>
8 :
9 : /* The below uses the sysfs API added ~2009-Dec. See
10 : https://github.com/torvalds/linux/commit/1830794ae6392ce12d36dbcc5ff52f11298ddab6 */
11 :
12 : /* fd_numa_private_parse_node_idx parses a cstr of the form
13 : `node[0-9]+` into a node idx. The value will strictly interpreted as
14 : a non-negative base 10 value. Returns -1 if the value could not be
15 : parsed (e.g. s is NULL, s does not have a node prefix, s does not
16 : have a base 10 suffix, the value overflows an int representation).
17 : FIXME: consider having the user pass the prefix to scan for to allow
18 : extracting more general indices from sysfs paths. */
19 :
20 : FD_FN_PURE static int
21 604065 : fd_numa_private_parse_node_idx( char const * s ) {
22 604065 : if( FD_UNLIKELY( !s ) ) return -1;
23 604065 : if( FD_UNLIKELY( strncmp( s, "node", 4UL ) ) ) return -1;
24 75075 : s += 4;
25 :
26 75075 : long val = 0L;
27 :
28 75075 : char const * t = s;
29 150150 : for(;;) {
30 150150 : char c = *t;
31 150150 : if( !c ) break; /* host dep branch prob */
32 75075 : if( FD_UNLIKELY( !(('0'<=c) | (c<='9')) ) ) return -1; /* non-digit encountered */
33 75075 : val = (long)(c-'0') + 10L*val;
34 75075 : if( FD_UNLIKELY( val>(long)INT_MAX ) ) return -1; /* overflow */
35 75075 : t++;
36 75075 : }
37 75075 : if( FD_UNLIKELY( s==t ) ) return -1; /* empty idx */
38 :
39 75075 : return (int)val;
40 75075 : }
41 :
42 : ulong
43 1155 : fd_numa_node_cnt( void ) {
44 :
45 : /* Open sysfs dir containing NUMA config. Abort if this fails. */
46 :
47 1155 : char const * path = "/sys/devices/system/node";
48 1155 : DIR * dir = opendir( path );
49 1155 : if( FD_UNLIKELY( !dir ) ) {
50 0 : FD_LOG_WARNING(( "opendir( \"%s\" ) failed (%i-%s)", path, errno, fd_io_strerror( errno ) ));
51 0 : return 0UL;
52 0 : }
53 :
54 : /* Scan dir to get number of NUMA nodes. Note that we do not assume
55 : the system indexes numa nodes contiguously (but it almost certainly
56 : does). */
57 :
58 1155 : int node_idx_max = INT_MIN;
59 13860 : for(;;) {
60 13860 : struct dirent * dirent = readdir( dir );
61 13860 : if( !dirent ) break;
62 12705 : node_idx_max = fd_int_max( fd_numa_private_parse_node_idx( dirent->d_name ), node_idx_max );
63 12705 : }
64 :
65 : /* Close dir and return what was found */
66 :
67 1155 : if( FD_UNLIKELY( closedir( dir ) ) )
68 0 : FD_LOG_WARNING(( "closedir( \"%s\" ) failed (%i-%s); attempting to continue", path, errno, fd_io_strerror( errno ) ));
69 :
70 1155 : if( FD_UNLIKELY( node_idx_max<0 ) ) {
71 0 : FD_LOG_WARNING(( "No numa nodes found in \"%s\"", path ));
72 0 : return 0UL;
73 0 : }
74 :
75 1155 : return ((ulong)node_idx_max) + 1UL;
76 1155 : }
77 :
78 : ulong
79 1155 : fd_numa_cpu_cnt( void ) {
80 :
81 : /* FIXME: Consider using get_nprocs_conf, syscall or sysfs director
82 : scan. */
83 :
84 1155 : int cpu_cnt = get_nprocs();
85 1155 : if( FD_UNLIKELY( cpu_cnt<=0 ) ) {
86 0 : FD_LOG_WARNING(( "Unexpected return (%i) from get_nprocs", cpu_cnt ));
87 0 : return 0UL;
88 0 : }
89 :
90 1155 : return (ulong)cpu_cnt;
91 1155 : }
92 :
93 : ulong
94 73920 : fd_numa_node_idx( ulong cpu_idx ) {
95 :
96 : /* Open sysfs dir containing CPU config. Abort if this fails. */
97 :
98 73920 : char path[64];
99 73920 : DIR * dir = opendir( fd_cstr_printf( path, 64UL, NULL, "/sys/devices/system/cpu/cpu%lu", cpu_idx ) );
100 73920 : if( FD_UNLIKELY( !dir ) ) {
101 0 : FD_LOG_WARNING(( "opendir( \"%s\" ) failed (%i-%s)", path, errno, fd_io_strerror( errno ) ));
102 0 : return ULONG_MAX;
103 0 : }
104 :
105 : /* Scan dir for symlink to numa config */
106 :
107 73920 : int node_idx = -1;
108 591360 : for(;;) {
109 591360 : struct dirent * dirent = readdir( dir );
110 591360 : if( !dirent ) break;
111 591360 : node_idx = fd_numa_private_parse_node_idx( dirent->d_name );
112 591360 : if( node_idx!=-1 ) break;
113 591360 : }
114 :
115 : /* Close dir and return what was found */
116 :
117 73920 : if( FD_UNLIKELY( closedir( dir ) ) )
118 0 : FD_LOG_WARNING(( "closedir( \"%s\" ) failed (%i-%s); attempting to continue", path, errno, fd_io_strerror( errno ) ));
119 :
120 73920 : if( FD_UNLIKELY( node_idx<0 ) ) {
121 0 : FD_LOG_WARNING(( "No numa node found in \"%s\"", path ));
122 0 : return ULONG_MAX;
123 0 : }
124 :
125 73920 : return (ulong)node_idx;
126 73920 : }
127 :
128 : /* FIXME: probably should do a FD_HAS_ASAN switch for the below to use
129 : the appropriate functionality when FD_HAS_ASAN is set (or maybe have
130 : a separate implementation for compiling under FD_HAS_ASAN). */
131 :
132 : #include <unistd.h>
133 : #include <sys/syscall.h>
134 :
135 : /* Note that the LLVM AddressSanitizer (ASan) intercepts all mlock
136 : calls.
137 :
138 : This has an interesting history. These interceptors were first added
139 : in 2012 and are still present in LLVM 14.0.6:
140 :
141 : https://github.com/llvm/llvm-project/commit/71d759d392f03025bcc8b20f060bc5c22e580ea1
142 :
143 : They stub `mlock`, `munlock`, `mlockall`, `munlockall` to no-ops.
144 :
145 : ASan is known to map large amounts (~16TiB) of unbacked pages. This
146 : rules out the use of `mlockall`.
147 :
148 : `mlock` only locks selected pages, therefore should be fine. The
149 : comments in various revisions of these interceptors suggest that
150 : older Linux kernels had a bug that prevented the use of `mlock`.
151 :
152 : However, current Firedancer will use the `move_pages` syscall to
153 : verify whether "allocated" pages are actually backed by DRAM.
154 :
155 : This makes Firedancer and ASan incompatible unless we either
156 :
157 : 1) Remove the `mlock` interceptor upstream, or
158 : 2) Circumvent the interceptor with a raw syscall
159 :
160 : We do option 2 below */
161 :
162 : int
163 : fd_numa_mlock( void const * addr,
164 1587 : ulong len ) {
165 1587 : return (int)syscall( SYS_mlock, addr, len );
166 1587 : }
167 :
168 : int
169 : fd_numa_munlock( void const * addr,
170 0 : ulong len ) {
171 0 : return (int)syscall( SYS_mlock, addr, len );
172 0 : }
173 :
174 : long
175 : fd_numa_get_mempolicy( int * mode,
176 : ulong * nodemask,
177 : ulong maxnode,
178 : void * addr,
179 546 : uint flags ) {
180 546 : long rc = syscall( SYS_get_mempolicy, mode, nodemask, maxnode, addr, flags );
181 546 : if( rc==0 ) {
182 546 : if( mode ) fd_msan_unpoison( mode, sizeof(int) );
183 546 : if( nodemask ) fd_msan_unpoison( nodemask, 8UL*((maxnode+63UL)/64UL) );
184 546 : }
185 546 : return rc;
186 546 : }
187 :
188 : long
189 : fd_numa_set_mempolicy( int mode,
190 : ulong const * nodemask,
191 1071 : ulong maxnode ) {
192 1071 : return syscall( SYS_set_mempolicy, mode, nodemask, maxnode );
193 1071 : }
194 :
195 : long
196 : fd_numa_mbind( void * addr,
197 : ulong len,
198 : int mode,
199 : ulong const * nodemask,
200 : ulong maxnode,
201 525 : uint flags ) {
202 525 : return syscall( SYS_mbind, addr, len, mode, nodemask, maxnode, flags );
203 525 : }
204 :
205 : long
206 : fd_numa_move_pages( int pid,
207 : ulong count,
208 : void ** pages,
209 : int const * nodes,
210 : int * status,
211 126354 : int flags ) {
212 126354 : long rc = syscall( SYS_move_pages, pid, count, pages, nodes, status, flags );
213 126354 : if( rc==0 ) fd_msan_unpoison( status, count*sizeof(int) );
214 126354 : return rc;
215 126354 : }
|