Line data Source code
1 : #ifndef HEADER_fd_src_util_wksp_fd_wksp_h
2 : #define HEADER_fd_src_util_wksp_fd_wksp_h
3 :
4 : #include "../pod/fd_pod.h"
5 : #include "../shmem/fd_shmem.h"
6 : #include "../sanitize/fd_asan.h"
7 :
8 : /* API for creating NUMA-aware and TLB-efficient workspaces used for
9 : complex inter-thread and inter-process shared memory communication
10 : patterns. fd must be booted to use the APIs in this module.
11 :
12 : For example, startup scripts could reserve some memory on each NUMA
13 : node backed by huge and gigantic pages:
14 :
15 : sudo bin/fd_shmem_cfg alloc 8 gigantic 0 \
16 : alloc 8 gigantic 1 \
17 : alloc 256 huge 0 \
18 : alloc 256 huge 1
19 :
20 : and then some of this memory could be formatted into fd_wksp for each
21 : NUMA node:
22 :
23 : bin/fd_shmem_ctl new my-wksp-numa-0 1 gigantic 0 \
24 : new my-wksp-numa-1 1 gigantic 1
25 :
26 : Then, at application startup, processes can join these fd_wksp and
27 : concurrently allocate memory from the desired NUMA nodes as
28 : necessary. E.g.
29 :
30 : fd_wksp_t * wksp = fd_wksp_attach( "my-wksp-numa-0" ); // logs details on failure
31 : if( !wksp ) ... handle attach failure ...;
32 :
33 : ulong gaddr = fd_wksp_alloc( wksp, align, sz ); // logs details on failure
34 : if( !gaddr ) ... handle alloc failure ...;
35 :
36 : The local address of a workspace global address can be found via:
37 :
38 : void * laddr = fd_wksp_laddr( wksp, gaddr ); // logs details on failure
39 : if( !laddr ) ... handle bad (wksp,gaddr) ...;
40 :
41 : and the global address of a workspace local address can be found via:
42 :
43 : ulong gaddr = fd_wksp_gaddr( wksp, laddr ); // logs details on failure
44 : if( !gaddr ) ... handle bad (wksp,laddr) ...;
45 :
46 : Allocations can be freed via:
47 :
48 : fd_wksp_free( wksp, gaddr );
49 :
50 : Any join can free any allocation regardless of who made it.
51 :
52 : When the application is done using a wksp, it should leave it. The
53 : workspace will continue to exist (it just is no longer safe to access
54 : in the caller's address space). E.g.
55 :
56 : fd_wksp_detach( wksp ); // logs details on failure
57 :
58 : Likewise, if the workspaces are no longer in use, they can be deleted
59 : via something like:
60 :
61 : bin/fd_wksp_ctl delete my-wksp-numa-0 \
62 : delete my-wksp-numa-1
63 :
64 : All allocations can be freed via something like:
65 :
66 : bin/fd_wksp_ctl reset my-wksp-numa-0 \
67 : reset my-wksp-numa-1
68 :
69 : or in code:
70 :
71 : fd_wksp_reset( wksp, seed ); // logs details on failure
72 :
73 : It is the caller's responsibility to ensure that previous allocations
74 : to the wksp are not in use.
75 :
76 : Note: while this presents "aligned_alloc" style API semantics, this
77 : is not designed to be algorithmically optimal, HPC implementation or
78 : efficient at doing lots of tiny allocations. Rather it is designed
79 : to be akin to an "mmap" / "sbrk" style allocator of last resort, done
80 : rarely and then ideally at application startup (e.g. setting up
81 : datastructures at box startup or used in an interprocess lockfree
82 : allocator as a mmap replacement).
83 :
84 : Instead, this tries to keep wksp fragmentation low with low overhead
85 : and tight packing of larger size allocations (normal page size and
86 : up). It further tries to proactively limit the risk of heap
87 : _metadata_ corruption (proactive intraworkspace heap application
88 : _data_ corruption prevention is not a goal though typical mechanisms
89 : for such are in _direct_ opposition to efficient use of TLB, low
90 : fragmentation and tight allocation packing). It is quasi-lockfree
91 : such that a process _killed_ in the middle of a workspace operation
92 : will not prevent other processes from using the workspace but a
93 : process _stalled_ in the middle of a workspace operation can stall
94 : other applications waiting to use the workspace indefinitely.
95 : Operators can track down an errant process stalled in the middle of
96 : workspace operations and blocking other processes. Likewise
97 : detailed usage and metadata integrity checking and repair can be done
98 : via something like fd_wksp_ctl check / verify / rebuild / etc.
99 : Practically speaking, none of this really matters if usage occurs
100 : predominantly during application startup / shutdown.
101 :
102 : See below for more details. */
103 :
104 : /* FD_WKSP_SUCCESS is used by various APIs to indicate an operation
105 : successfully completed. This will be 0. FD_WKSP_ERR_* gives a
106 : number of error codes used by fd_wksp APIs. These will be negative
107 : integers. */
108 :
109 1250207229 : #define FD_WKSP_SUCCESS (0) /* Success */
110 27 : #define FD_WKSP_ERR_INVAL (-1) /* Failed due to obviously invalid inputs */
111 15 : #define FD_WKSP_ERR_FAIL (-2) /* Failed due to shared memory limitation */
112 130251573 : #define FD_WKSP_ERR_CORRUPT (-3) /* Workspace memory corruption detected (potentially recoverable by rebuilding) */
113 :
114 : /* FD_WKSP_{ALIGN,FOOTPRINT} describe the alignment and footprint of a
115 : fd_wksp_t. ALIGN is a positive integer power of 2. FOOTPRINT is a
116 : multiple of ALIGN. FOOTPRINT assumes part_max and data_max are
117 : non-zero and small enough that the footprint does not exceed
118 : ULONG_MAX bytes. These are provided to facilitate compile time
119 : declarations. */
120 :
121 3408 : #define FD_WKSP_ALIGN (128UL)
122 : #define FD_WKSP_FOOTPRINT( part_max, data_max ) \
123 : FD_LAYOUT_FINI( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_APPEND( FD_LAYOUT_INIT, \
124 : FD_WKSP_ALIGN, 128UL ), /* header */ \
125 : 64UL, 64UL*(part_max) ), /* partition info */ \
126 : 1UL, (data_max)+1UL ), /* data region and footer */ \
127 : FD_WKSP_ALIGN ) /* tail padding */
128 :
129 : /* FD_WKSP_ALIGN_DEFAULT gives the default alignments of a wksp
130 : allocation. This is a positive integer power of two of at least 16
131 : (for malloc compatibility). Additional details described in
132 : fd_wksp_alloc. */
133 :
134 82640694 : #define FD_WKSP_ALIGN_DEFAULT (4096UL)
135 :
136 : /* FD_WKSP_CSTR_MAX is the number of bytes maximum that can be in a wksp
137 : global address cstr. */
138 :
139 : #define FD_WKSP_CSTR_MAX (FD_SHMEM_NAME_MAX + 21UL)
140 :
141 : /* FD_WKSP_CHECKPT_STYLE_* specifies the streaming format to use for
142 : a workspace checkpoint. These are non-zero.
143 :
144 : RAW - the stream will have extensive workspace metadata followed
145 : by the used workspace partitions. No compression or
146 : hashing is done of the workspace partitions.
147 :
148 : DEFAULT - the style to use when not specified by user. */
149 :
150 54 : #define FD_WKSP_CHECKPT_STYLE_RAW (1)
151 9 : #define FD_WKSP_CHECKPT_STYLE_DEFAULT FD_WKSP_CHECKPT_STYLE_RAW
152 :
153 : /* A fd_wksp_t * is an opaque handle of a workspace */
154 :
155 : struct fd_wksp_private;
156 : typedef struct fd_wksp_private fd_wksp_t;
157 :
158 : /* A fd_wksp_usage_t is used to return workspace usage stats. */
159 :
160 : struct fd_wksp_usage {
161 : ulong total_max; /* Maximum number of partitions the wksp can have (==part_max) */
162 : ulong total_cnt; ulong total_sz; /* Number of partitions the wksp currently has / bytes available for partitioning (==data_max) */
163 : ulong free_cnt; ulong free_sz; /* Number of free (tag 0) partitions / free bytes the wksp currently has */
164 : ulong used_cnt; ulong used_sz; /* Number of partitions / bytes used by partitions whose tags match the queried tag set */
165 : };
166 :
167 : typedef struct fd_wksp_usage fd_wksp_usage_t;
168 :
169 : FD_PROTOTYPES_BEGIN
170 :
171 : /* Admin APIs *********************************************************/
172 :
173 : /* It is rare to need to use the admin APIs directly (especially on a
174 : hosted system). Recommend using the helper APIs below for most
175 : needs. */
176 :
177 : /* Constructors */
178 :
179 : /* fd_wksp_part_max_est computes an estimated maximum number of
180 : partitions for a workspace that needs to fit within footprint bytes
181 : and has sz_typical allocations typically. Returns a positive value
182 : on success and 0 on failure. Reasons for failure include footprint
183 : too small, sz_typical is 0 and sz_typical is so large that footprint
184 : has no room for metadata anyway. Useful for determining how to pack
185 : a workspace tightly into a known footprint region. */
186 :
187 : FD_FN_CONST ulong
188 : fd_wksp_part_max_est( ulong footprint,
189 : ulong sz_typical );
190 :
191 : /* fd_wksp_data_max_est computes an estimated maximum data region size
192 : for footprint sized workspace with part_max partitions. Returns a
193 : positive value on success and 0 on failure. Reasons for failure
194 : include footprint is too small, part_max is 0, part_max is too large
195 : for under the hood implementation limitations or part_max is too
196 : large to have a non-zero sized data region. Useful for determining
197 : how to pack a workspace into a known footprint region. */
198 :
199 : FD_FN_CONST ulong
200 : fd_wksp_data_max_est( ulong footprint,
201 : ulong part_max );
202 :
203 : /* fd_wksp_{align,footprint} give the required alignment and footprint
204 : for a workspace that can support up to part_max partitions and with a
205 : data region of data_max bytes. fd_wksp_align returns FD_WKSP_ALIGN.
206 : fd_wksp_footprint(part_max,data_max) returns
207 : FD_WKSP_FOOTPRINT(part_max,data_max) on success and 0 on failure.
208 : Reasons for failure include zero part_max, part_max too large for
209 : this implementation, zero data_max, part_max/data_max requires a
210 : footprint that overflows a ULONG_MAX. */
211 :
212 : FD_FN_CONST ulong
213 : fd_wksp_align( void );
214 :
215 : FD_FN_CONST ulong
216 : fd_wksp_footprint( ulong part_max,
217 : ulong data_max );
218 :
219 : /* fd_wksp_new formats an unused memory region with the appropriate
220 : footprint and alignment mapped into the caller's address space at
221 : shmem into a wksp with given name (should be a valid fd_shmem name
222 : and will match the underlying shared memory region name / anonymous
223 : join for a wksp created via the shmem helpers below). seed is the
224 : arbitrary value used to seed the heap priorities under the hood.
225 : Returns NULL on failure (logs details) or shmem on success. The
226 : caller is _not_ joined on return. */
227 :
228 : void *
229 : fd_wksp_new( void * shmem,
230 : char const * name,
231 : uint seed,
232 : ulong part_max,
233 : ulong data_max );
234 :
235 : /* fd_wksp_join joins a workspace. shwksp is the location where
236 : the wksp has been mapped into the caller's address space. Returns
237 : the local handle of the join on success or NULL on failure (logs
238 : details). The caller can read / write memory in the joined workspace
239 : on return (a caller can do a read only join by mapping the shwksp
240 : into the local address as read only). There is no practical
241 : limitation on the number of concurrent joins in a thread, process or
242 : system wide.*/
243 :
244 : fd_wksp_t *
245 : fd_wksp_join( void * shwksp );
246 :
247 : /* fd_wksp_leave leaves a workspace. Returns shwksp on success and NULL
248 : on failure (logs details). The caller should not continue to read or
249 : write any memory for the join on return but the workspace will
250 : continue to exist. */
251 :
252 : void *
253 : fd_wksp_leave( fd_wksp_t * wksp );
254 :
255 : /* fd_wksp_delete unformats a memory region used as a workspace.
256 : Returns the shmem pointer on success and NULL on failure (logs
257 : details). There should not be anybody joined to the workspace when
258 : it is deleted. */
259 :
260 : void *
261 : fd_wksp_delete( void * shwksp );
262 :
263 : /* Accessors */
264 :
265 : /* fd_wksp_name returns a cstr pointer to the wksp name (will point to a valid
266 : region name, e.g. strlen( name ) in [1,FD_SHMEM_NAME_MAX)). Assumes
267 : wksp is a valid current join. Lifetime of the returned string is the
268 : lifetime of the join. The pointer value is const and the string
269 : pointed at is const for the lifetime of join.
270 :
271 : fd_wksp_seed returns the seed used at creation / most recent rebuild.
272 : Assumes wksp is a current local join.
273 :
274 : fd_wksp_{part_max,data_max} returns {part_max,data_max} used at
275 : creation. Assumes wksp is a current local join. */
276 :
277 : FD_FN_CONST char const * fd_wksp_name ( fd_wksp_t const * wksp );
278 : FD_FN_PURE uint fd_wksp_seed ( fd_wksp_t const * wksp );
279 : FD_FN_PURE ulong fd_wksp_part_max( fd_wksp_t const * wksp );
280 : FD_FN_PURE ulong fd_wksp_data_max( fd_wksp_t const * wksp );
281 :
282 : /* fd_wksp_owner returns the id of the thread group that was currently
283 : in a wksp operation (0 indicates the wksp was in the process of being
284 : constructed) or ULONG_MAX if there was no operation in progress on
285 : the workspace. Assumes wksp is a current local join. The value will
286 : correspond to some point of time between when the call was made and
287 : the call returned. */
288 :
289 : ulong fd_wksp_owner( fd_wksp_t const * wksp );
290 :
291 : /* Misc */
292 :
293 : /* fd_wksp_strerror converts an FD_WKSP_SUCCESS / FD_WKSP_ERR_* code
294 : into a human readable cstr. The lifetime of the returned pointer is
295 : infinite. The returned pointer is always to a non-NULL cstr. */
296 :
297 : FD_FN_CONST char const *
298 : fd_wksp_strerror( int err );
299 :
300 : /* fd_wksp_verify does extensive verification of wksp. Returns
301 : FD_WKSP_SUCCESS (0) if there are no issues detected with the wksp or
302 : FD_WKSP_ERR_CORRUPT (negative) otherwise (logs details). wksp is a
303 : current local join to a workspace. This is used internally for
304 : verifying the integrity of a workspace if a caller detects in an
305 : operation that another caller died in the middle of a wksp operation.
306 : Users typically do not need to call this but it can be useful in
307 : debugging and testing.
308 :
309 : IMPORTANT SAFETY TIP! This assumes there are no concurrent
310 : operations on wksp. */
311 :
312 : int
313 : fd_wksp_verify( fd_wksp_t * wksp );
314 :
315 : /* fd_wksp_rebuild rebuilds a wksp. This is used internally for rebuilding
316 : workspace when a caller detects that another caller died in the
317 : middle of an alloc or free and left the workspace in an inconsistent
318 : state. Returns FD_WKSP_SUCCESS (0) if wksp was rebuilt successfully
319 : or a FD_WKSP_ERR_CORRUPT (negative) if it could not (logs details).
320 :
321 : Rebuilding operates under the principle of "do no harm".
322 : Specifically, rebuilding does not impact any completed wksp
323 : allocations (even when it fails). It can either complete or rollback
324 : any partially complete alloc / free depending on how far along the partial
325 : operation was.
326 :
327 : Rebuilding should be always possible outside of actual memory
328 : corruption or code bug. The main reason for failure is overlapping
329 : allocations were discovered during the rebuild (which would either be
330 : caused by memory corruption or a bug).
331 :
332 : Users typically do not need to call this but it can be useful as a
333 : weak form of ASLR by changing up the seed. This is not a fast
334 : operation.
335 :
336 : IMPORTANT SAFETY TIP! This assumes there are no concurrent
337 : operations on wksp. */
338 :
339 : int
340 : fd_wksp_rebuild( fd_wksp_t * wksp,
341 : uint seed );
342 :
343 : /* User APIs **********************************************************/
344 :
345 : /* fd_wksp_laddr maps a wksp global address (an address all joiners
346 : agree upon) to the caller's local address space. Invalid global
347 : addresses and/or 0UL will map to NULL (logs details if invalid).
348 : Assumes wksp is a current local join (NULL returns NULL). */
349 :
350 : void *
351 : fd_wksp_laddr( fd_wksp_t const * wksp,
352 : ulong gaddr );
353 :
354 : /* fd_wksp_gaddr maps a wksp local address to the corresponding wksp
355 : global address (an address all joiners agree upon). Invalid local
356 : addresses and/or NULL will map to 0UL (logs details if invalid).
357 : Assumes wksp is a current local join (NULL returns 0UL). */
358 :
359 : ulong
360 : fd_wksp_gaddr( fd_wksp_t const * wksp,
361 : void const * laddr );
362 :
363 : /* fd_wksp_gaddr_fast converts a laddr into a gaddr under the assumption
364 : wksp is a current local join and laddr is non-NULL local address in
365 : the wksp. */
366 :
367 : FD_FN_CONST static inline ulong
368 : fd_wksp_gaddr_fast( fd_wksp_t const * wksp,
369 206382408 : void const * laddr ) {
370 206382408 : return (ulong)laddr - (ulong)wksp; /* gaddr is the byte offset of laddr from the wksp join address; no validation is done here */
371 206382408 : }
372 :
373 : /* fd_wksp_laddr_fast converts a gaddr into a laddr under the assumption
374 : wksp is a current local join and gaddr is non-NULL. */
375 :
376 : FD_FN_CONST static inline void *
377 : fd_wksp_laddr_fast( fd_wksp_t const * wksp,
378 5294211711 : ulong gaddr ) {
379 5294211711 : return (void *)((ulong)wksp + gaddr); /* laddr is the wksp join address advanced by gaddr bytes; no validation is done here */
380 5294211711 : }
381 :
382 : /* fd_wksp_alloc_at_least allocates at least sz bytes from wksp with
383 : an alignment of at least align (align must be a non-negative integer
384 : power-of-two or 0, which indicates to use the default alignment
385 : FD_WKSP_ALIGN_DEFAULT). The allocation will be tagged with a
386 : positive value tag. Returns the fd_wksp global address of the join
387 : on success and "NULL" (0UL) on failure (logs details). A zero sz
388 : returns "NULL" (silent). On return, [*lo,*hi) will contain the
389 : actual gaddr range allocated. On success, [*lo,*hi) will overlap
390 : completely [ret,ret+sz) and ret will be aligned to requested
391 : alignment. Assumes lo and hi are non-NULL.
392 :
393 : fd_wksp_alloc is a simple wrapper around fd_wksp_alloc_at_least for
394 : use when applications do not care about details of the actual
395 : allocated region.
396 :
397 : Note that fd_wksp_alloc / fd_wksp_free are not HPC implementations.
398 : Instead, these are designed to be akin to a mmap / sbrk allocator of
399 : "last resort" under the hood in other allocators like fd_alloc. As
400 : such it prioritizes packing efficiency (best fit with arbitrary sizes
401 : and alignments allowed) over algorithmic efficiency (e.g.
402 : O(lg wksp_alloc_cnt) instead of O(1) like fd_alloc) and prioritize
403 : robustness against heap corruption (e.g. overrunning an allocation
404 : might corrupt the data in other allocations but will not corrupt the
405 : heap structure ... as the goal of this data structure is to encourage
406 : minimization of TLB usage, there is very little that can be done to
407 : proactively prevent intraworkspace interallocation data corruption).
408 :
409 : These operations are "quasi-lock-free". Specifically, while they can
410 : suffer priority inversion due to a slow thread stalling other threads
411 : from using these operations, a process that is terminated in the
412 : middle of these operations leaves the wksp in a recoverable state.
413 : The only risk is the same risk generally from any application that
414 : uses persistent resources: applications that are terminated abruptly
415 : might leave allocations in the wksp that would have been freed had
416 : the application terminated normally. As the allocator has no way to
417 : tell the difference between such allocations and allocations that are
418 : intended to outlive the application, it is the caller's
419 : responsibility to clean up such (allocation tagging can help greatly
420 : simplify this for users). It would be possible to widen this API for
421 : applications to explicitly signal this intent and automatically clean
422 : up allocations not meant to outlive their creator but the general use
423 : here is expected to be long lived allocations.
424 :
425 : Priority inversion is not expected to be an issue practically as the
426 : expected use case is at app startup (some non-latency critical
427 : processes will do a handful of wksp operations to setup workspaces
428 : for applications on that box going forward and then the allocations
429 : will not be used again until the wksp is torn down / reset / etc).
430 : The remaining cases (e.g. a fine grained allocator like fd_alloc
431 : needs to procure more memory from the workspace) are expected to be
432 : rare enough that the O(lg N) costs still will be more than adequate.
433 : Note further that fd_alloc allows very fast interprocess allocations
434 : to be done by using a wksp as an allocator of last resort (in such,
435 : all allocations would be strictly lock free unless they needed to
436 : invoke this allocator, as is typically the case in other lock free
437 : allocators).
438 :
439 : Likewise, operations do extensive allocation metadata integrity
440 : checks to facilitate robust persistent usage. If there is metadata
441 : data corruption detected (e.g. hardware fault, code corruption, etc),
442 : there are fsck-like APIs to rebuild wksp metadata. Data integrity
443 : protection is more defined by the application.
444 :
445 : Tags are application specific. They can allow manual and automated
446 : processes to do various debugging, diagnostics, analytics and garbage
447 : collection on a workspace (e.g. superblocks from a fd_alloc can be
448 : tagged specifically for that fd_alloc to allow memory leaks in
449 : general to be detected at program termination with no additional
450 : overheads and allow such leaks cleaned up via tagged frees).
451 : Notably, tags are wide enough to encode gaddrs. This opens up the
452 : possibility for filesystem-like complex metadata operations.
453 :
454 : IMPORTANT! align technically refers to the alignment in the wksp's
455 : global address space. As such, wksp must be mmaped into each local
456 : address space with an alignment of at least the largest alignment the
457 : overall application intends to use. Common practices automatically
458 : satisfy this (e.g. if wksp is backed by normal/huge/gigantic pages
459 : and only asks for alignments of at most a normal/huge/gigantic page
460 : sz, this constraint is automatically satisfied as fd_shmem_join needs
461 : to mmap wksp into the local address space with normal/huge/gigantic
462 : alignment anyway). If doing more exotic things (e.g. backing wksp by
463 : normal pages but requiring much larger alignments), explicitly
464 : specifying the wksp virtual address location (e.g. in the
465 : fd_shmem_join call) might be necessary to satisfy this constraint.
466 :
467 : This implementation supports arbitrary sz and align efficiently but
468 : each allocation will use up 1-3 wksp partitions to achieve this. As
469 : these are a finite resource (and typically sized for a wksp that
470 : handles primarily larger allocations, like a fd_alloc huge
471 : superblock) and as there are allocators like fd_alloc that are faster
472 : algorithmically, lower overhead and lockfree O(1) for small sizes and
473 : alignment, it is strongly recommended to use this as an allocator of
474 : last resort and/or use this for larger chunkier allocations at
475 : application startup (e.g. sz + align >>> cache line). An allocator
476 : like fd_alloc can then manage most allocations, falling back on this
477 : only when necessary. */
478 :
479 : ulong
480 : fd_wksp_alloc_at_least( fd_wksp_t * wksp,
481 : ulong align,
482 : ulong sz,
483 : ulong tag,
484 : ulong * lo,
485 : ulong * hi );
486 :
487 : static inline ulong
488 : fd_wksp_alloc( fd_wksp_t * wksp,
489 : ulong align,
490 : ulong sz,
491 874977 : ulong tag ) {
492 874977 : ulong dummy[2]; /* scratch storage for the lo / hi outputs, which this wrapper discards */
493 874977 : return fd_wksp_alloc_at_least( wksp, align, sz, tag, dummy, dummy+1 ); /* lo -> dummy[0], hi -> dummy[1] */
494 874977 : }
495 :
496 : /* fd_wksp_free frees a wksp allocation. gaddr is a global address that
497 : points to any byte in the allocation to free (i.e. can point to
498 : anything in the gaddr range [*lo,*hi) returned by
499 : fd_wksp_alloc_at_least). Logs details of any weirdness detected.
500 : Free of "NULL" (0UL) silently returns. There are no restrictions on
501 : which join might free an allocation. See note above for other details. */
502 :
503 : void
504 : fd_wksp_free( fd_wksp_t * wksp,
505 : ulong gaddr );
506 :
507 : /* fd_wksp_tag returns the tag associated with an allocation. gaddr
508 : is a wksp global address that points to any byte in the allocation.
509 : This is a fast O(lg wksp_alloc_cnt). A return of 0 indicates that
510 : gaddr did not point into an allocation at some point in time between
511 : when this function was called until when it returned (this includes
512 : the cases when wksp is NULL and/or gaddr is 0). This function is
513 : silent to facilitate integration with various analysis tools. */
514 :
515 : ulong
516 : fd_wksp_tag( fd_wksp_t * wksp,
517 : ulong gaddr );
518 :
519 : /* fd_wksp_tag_query queries the workspace for all partitions that match
520 : one of the given tags. The tag array is indexed [0,tag_cnt).
521 : Returns info_cnt, the number of matching partitions. Further, if
522 : info_max is non-zero, will return detailed information for the first
523 : (from low to high gaddr) min(info_cnt,info_max). Returns 0 if no
524 : partitions match any tags. If any wonkiness is encountered (e.g. wksp
525 : is NULL, tag is not positive, etc) returns 0 and logs details.
526 : This is O(wksp_alloc_cnt*tag_cnt) currently (but could be made
527 : O(wksp_alloc_cnt) with some additional work). */
528 :
529 : struct fd_wksp_tag_query_info {
530 : ulong gaddr_lo; /* Partition covers workspace global addresses [gaddr_lo,gaddr_hi) */
531 : ulong gaddr_hi; /* Exclusive upper bound of the partition, 0<gaddr_lo<gaddr_hi */
532 : ulong tag; /* Partition tag (the tag from the query that this partition matched) */
533 : };
534 :
535 : typedef struct fd_wksp_tag_query_info fd_wksp_tag_query_info_t;
536 :
537 : ulong
538 : fd_wksp_tag_query( fd_wksp_t * wksp,
539 : ulong const * tag,
540 : ulong tag_cnt,
541 : fd_wksp_tag_query_info_t * info,
542 : ulong info_max );
543 :
544 : /* fd_wksp_tag_free frees all allocations in wksp that match one of the
545 : given tags. The tag array is indexed [0,tag_cnt). Logs details if
546 : any wonkiness is encountered (e.g. wksp is NULL, tag is not positive).
547 : This is O(wksp_alloc_cnt*tag_cnt) currently (but could be made
548 : O(wksp_alloc_cnt) with some additional work). */
549 :
550 : void
551 : fd_wksp_tag_free( fd_wksp_t * wksp,
552 : ulong const * tag,
553 : ulong tag_cnt );
554 :
555 : /* fd_wksp_memset sets all bytes in a wksp allocation to character c.
556 : gaddr is a global address that points to any byte in the allocation
557 : (i.e. can point to anything in range returned by
558 : fd_wksp_alloc_at_least and will fill the whole range). Logs details
559 : of any weirdness detected. Clear of "NULL" (0UL) silently returns.
560 : Atomic with respect to other operations on this workspace. */
561 :
562 : void
563 : fd_wksp_memset( fd_wksp_t * wksp,
564 : ulong gaddr,
565 : int c );
566 :
567 : /* fd_wksp_reset frees all allocations from the wksp. Logs details on
568 : failure. */
569 :
570 : void
571 : fd_wksp_reset( fd_wksp_t * wksp,
572 : uint seed );
573 :
574 : /* fd_wksp_usage computes the wksp usage at some point in time between
575 : when the call was made and the call returned, populating the user
576 : provided usage structure with the result. Always returns usage.
577 :
578 : wksp is a current local join to the workspace to compute usage.
579 :
580 : tag[tag_idx] for tag_idx in [0,tag_cnt) is an array of tags to
581 : compute the usage. The order doesn't matter and, if a tag appears
582 : multiple times in the array, it will be counted once in the used
583 : stats. A zero tag_cnt (potentially with a NULL tag) is fine
584 : (used_cnt,used_sz for such will be 0,0). A tag of 0 indicates to
585 : include free partitions in the used stats.
586 :
587 : total_max is the maximum partitions the wksp can have. This will be
588 : positive (==part_max).
589 :
590 : total_sz is the number of bytes the wksp has available for
591 : partitioning (==data_max). As the partitioning always covers the
592 : entire wksp, total_sz is constant for the lifetime of the wksp.
593 :
594 : total_cnt is the number of partitions the wksp currently has. This
595 : will be in [1,total_max].
596 :
597 : free_cnt/sz is the number of free partitions / free bytes the wksp
598 : currently has. A free partition has a tag of 0 and is currently
599 : available for splitting to satisfy a future fd_wksp_alloc
600 : request.
601 :
602 : used_cnt/sz is the number of partitions / bytes used by wksp
603 : partitions whose tags match those in the provided tag set.
604 :
605 : This is O(wksp_alloc_cnt*tag_cnt) and will lock the wksp while
606 : running (and potentially block the caller if others are holding onto
607 : the lock). So use in testing, etc. Likewise, the precise meaning of
608 : the statistics computed by this API are dependent on the
609 : implementation details under the hood (that is do not be surprised if
610 : this API gets changed in the future). */
611 :
612 : fd_wksp_usage_t *
613 : fd_wksp_usage( fd_wksp_t * wksp,
614 : ulong const * tag,
615 : ulong tag_cnt,
616 : fd_wksp_usage_t * usage );
617 :
618 : /* shmem APIs *********************************************************/
619 :
620 : /* fd_wksp_new_named creates a shared memory region named name and
621 : formats as a workspace. Ignoring error trapping, this is a shorthand
622 : for:
623 :
624 : // Size the workspace to use all the memory
625 : ulong footprint = sum( sub_page_cnt[*] )*page_sz
626 : ulong part_max = opt_part_max ? opt_part_max : fd_wksp_part_max_est( footprint, 64 KiB );
627 : ulong data_max = fd_wksp_data_max_est( footprint, part_max );
628 :
629 : // Create the shared memory region and format as a workspace
630 : fd_shmem_create_multi( name, page_sz, sub_cnt, sub_page_cnt, sub_cpu_idx, mode );
631 : void * shmem = fd_shmem_join( name, FD_SHMEM_JOIN_MODE_READ_WRITE, NULL, NULL, NULL );
632 : fd_wksp_new( shmem, name, seed, part_max, data_max );
633 : fd_shmem_leave( shmem, NULL, NULL );
634 :
635 : The 64 KiB above is where fd_alloc currently transitions to directly
636 : allocating from the wksp.
637 :
638 : Returns FD_WKSP_SUCCESS (0) on success and an FD_WKSP_ERR_*
639 : (negative) on failure (logs details). Reasons for failure include
640 : INVAL (user arguments obviously bad) and FAIL (could not procure or
641 : format the shared memory region). */
642 :
643 : int
644 : fd_wksp_new_named( char const * name,
645 : ulong page_sz,
646 : ulong sub_cnt,
647 : ulong const * sub_page_cnt,
648 : ulong const * sub_cpu_idx,
649 : ulong mode,
650 : uint seed,
651 : ulong opt_part_max );
652 :
653 : /* fd_wksp_delete_named deletes a workspace created with
654 : fd_wksp_new_named. There should not be any other joins / attachments
655 : to wksp when this is called. Returns FD_WKSP_SUCCESS (0) on success
656 : and FD_WKSP_ERR_* (negative) on failure (logs details). */
657 :
658 : int
659 : fd_wksp_delete_named( char const * name );
660 :
661 : /* fd_wksp_new_anon creates a workspace local to this thread group that
662 : otherwise looks and behaves _exactly_ like a workspace shared between
663 : multiple thread groups on this host of the same name, TLB and NUMA
664 : properties. Ignoring error trapping, this is a shorthand for:
665 :
666 : // Size the workspace to use all the memory
667 : ulong page_cnt = sum( sub_page_cnt[*] );
668 : ulong footprint = page_cnt*page_sz;
669 : ulong part_max = opt_part_max ? opt_part_max : fd_wksp_part_max_est( footprint, 64 KiB );
670 : ulong data_max = fd_wksp_data_max_est( footprint, part_max );
671 :
672 : // Create the anonymous memory region and format as a workspace
673 : void * mem = fd_shmem_acquire_multi( page_sz, sub_cnt, sub_page_cnt, sub_cpu_idx );
674 : fd_wksp_t * wksp = fd_wksp_join( fd_wksp_new( mem, name, seed, part_max, data_max ) );
675 : fd_shmem_join_anonymous( name, FD_SHMEM_JOIN_MODE_READ_WRITE, wksp, mem, page_sz, page_cnt );
676 :
677 : There must be no current shmem joins to name (the anonymous
678 : join will shadow any preexisting fd_shmem region with the same name
679 : in the calling thread group). Returns the joined workspace on
680 : success and NULL on failure (logs details). The final leave and
681 : delete to this workspace should be through fd_wksp_delete_anon. */
682 :
683 : fd_wksp_t *
684 : fd_wksp_new_anon( char const * name,
685 : ulong page_sz,
686 : ulong sub_cnt,
687 : ulong const * sub_page_cnt,
688 : ulong const * sub_cpu_idx,
689 : uint seed,
690 : ulong opt_part_max );
691 :
692 : /* fd_wksp_delete_anon deletes a workspace created with fd_wksp_new_anon.
693 : There should not be any other joins / attachments to wksp when this
694 : is called. This cannot fail from the caller's POV; logs details if
695 : any wonkiness is detected during the delete. */
696 :
697 : void
698 : fd_wksp_delete_anon( fd_wksp_t * wksp );
699 :
700 : /* TODO: eliminate these legacy versions in favor of the above. */
701 : /* fd_wksp_new_anonymous forwards to fd_wksp_new_anon with sub_cnt 1UL and seed 0U; returns whatever fd_wksp_new_anon returns. */
702 : static inline fd_wksp_t *
703 : fd_wksp_new_anonymous( ulong page_sz,
704 : ulong page_cnt, /* its address is passed as sub_page_cnt (single entry) */
705 : ulong cpu_idx, /* its address is passed as sub_cpu_idx (single entry) */
706 : char const * name,
707 483 : ulong opt_part_max ) {
708 483 : return fd_wksp_new_anon( name, page_sz, 1UL, &page_cnt, &cpu_idx, 0U, opt_part_max );
709 483 : }
710 : /* fd_wksp_delete_anonymous is the legacy name for fd_wksp_delete_anon; it forwards wksp unchanged. */
711 168 : static inline void fd_wksp_delete_anonymous( fd_wksp_t * wksp ) { fd_wksp_delete_anon( wksp ); }
712 :
713 : /* fd_wksp_attach attaches to the workspace held by the shared memory
714 : region with the given name. If there are regions with the same name
715 : backed by different page sizes, defaults to the region backed by the
716 : largest page size. Returns wksp on success and NULL on failure
717 : (details are logged). Multiple attachments within a process are fine
718 : (all but the first attachment will be a reasonably fast O(1) call); all
719 : attachments in a process will use the same local fd_wksp_t handle.
720 : Every attach should be paired with a detach. TODO: CONST-VARIANTS? */
721 :
722 : fd_wksp_t *
723 : fd_wksp_attach( char const * name );
724 :
725 : /* fd_wksp_detach detaches from the given workspace. All but the last
726 : detach should be a reasonably fast O(1) call. Returns non-zero on
727 : failure. */
728 :
729 : int
730 : fd_wksp_detach( fd_wksp_t * wksp );
731 :
732 : /* fd_wksp_containing maps a fd_wksp local addr to the corresponding
733 : fd_wksp local join. Returns NULL if laddr does not appear to be from
734 : a locally joined fd_wksp. Always silent such that this can be used
735 : to detect if a pointer is from a fd_wksp or not. This is not a
736 : terribly fast call. This API can only be used on laddrs in wksp that
737 : are either named or anonymous workspaces. */
738 :
739 : fd_wksp_t *
740 : fd_wksp_containing( void const * laddr );
741 :
742 : /* fd_wksp_alloc_laddr is the same as fd_wksp_alloc but returns a
743 : pointer in the caller's local address space if the allocation was
744 : successful (and NULL if not). Ignoring error trapping, this is a
745 : shorthand for:
746 :
747 : fd_wksp_laddr( wksp, fd_wksp_alloc( wksp, align, sz, tag ) ) */
748 :
749 : void *
750 : fd_wksp_alloc_laddr( fd_wksp_t * wksp,
751 : ulong align,
752 : ulong sz,
753 : ulong tag );
754 :
755 : /* fd_wksp_free_laddr is the same as fd_wksp_free but takes a pointer
756 : in the caller's local address space into a workspace allocation.
757 : Ignoring error trapping, this is a shorthand for:
758 :
759 : fd_wksp_t * wksp = fd_wksp_containing( laddr );
760 : fd_wksp_free( wksp, fd_wksp_gaddr( wksp, laddr ) );
761 :
762 : This API can only be used on laddrs in wksp that are either named or
763 : anonymous workspaces. */
764 :
765 : void
766 : fd_wksp_free_laddr( void * laddr );
767 :
768 : /* cstr helper APIs ***************************************************/
769 :
770 : /* Overall, these are meant for use at application startup / shutdown
771 : and not in critical loops. */
772 :
773 : /* fd_wksp_cstr prints the wksp global address gaddr into cstr as a
774 : [fd_wksp_name(wksp)]:[gaddr]. Caller promises that cstr has room for
775 : FD_WKSP_CSTR_MAX bytes. Returns cstr on success and NULL on failure
776 : (logs details). Reasons for failure include NULL wksp, gaddr not in
777 : the data region (or one past), NULL cstr. */
778 :
779 : char *
780 : fd_wksp_cstr( fd_wksp_t const * wksp,
781 : ulong gaddr,
782 : char * cstr );
783 :
784 : /* fd_wksp_cstr_laddr is the same as fd_wksp_cstr but takes a pointer in
785 : the caller's local address space to a wksp location. Ignoring error
786 : trapping, this is a shorthand for:
787 :
788 : fd_wksp_t * wksp = fd_wksp_containing( laddr );
789 : return fd_wksp_cstr( wksp, fd_wksp_gaddr( wksp, laddr ), cstr );
790 :
791 : Returns NULL if laddr does not point strictly inside a workspace
792 : (logs details). This API can only be used on laddrs in wksp that are
793 : either named or anonymous workspaces. */
794 :
795 : char *
796 : fd_wksp_cstr_laddr( void const * laddr,
797 : char * cstr );
798 :
799 : /* fd_wksp_cstr_alloc allocates sz bytes with alignment align from the
800 : named or anonymous wksp with name. align and sz have the exact same
801 : semantics as fd_wksp_alloc. cstr must be non-NULL with space for up
802 : to FD_WKSP_CSTR_MAX bytes.
803 :
804 : Returns cstr on success and NULL on failure (logs details). On
805 : success, cstr will contain a [name]:[gaddr] string suitable for use
806 : by fd_wksp_map and fd_wksp_cstr_free. cstr will be untouched
807 : otherwise. Ignoring error trapping, this is a shorthand for:
808 :
809 : fd_wksp_t * wksp = fd_wksp_attach( name );
810 : ulong gaddr = fd_wksp_alloc( wksp, align, sz, tag );
811 : fd_wksp_detach( wksp );
812 : sprintf( cstr, "%s:%lu", name, gaddr );
813 : return cstr;
814 :
815 : As such, if doing many allocations from the same wksp, it is faster
816 : to do a fd_wksp_attach upfront, followed by the allocations and then
817 : a wksp detach (and faster still to use the advanced APIs to further
818 : amortize the fd_wksp_attach / fd_wksp_detach calls). */
819 :
820 : char *
821 : fd_wksp_cstr_alloc( char const * name,
822 : ulong align,
823 : ulong sz,
824 : ulong tag,
825 : char * cstr );
826 :
827 : /* fd_wksp_cstr_free frees a wksp allocation specified by a cstr
828 : containing [name]:[gaddr]. Ignoring parsing and error trapping, this
829 : is a shorthand for:
830 :
831 : fd_wksp_t * wksp = fd_wksp_attach( name );
832 : fd_wksp_free( wksp, gaddr );
833 : fd_wksp_detach( wksp );
834 :
835 : As such, if doing many frees from the same wksp, it is faster to do a
836 : fd_wksp_attach upfront, followed by the frees and then a
837 : fd_wksp_detach (and faster still to use the advanced APIs to further
838 : amortize the fd_wksp_attach / fd_wksp_detach calls.) */
839 :
840 : void
841 : fd_wksp_cstr_free( char const * cstr );
842 :
843 : /* fd_wksp_cstr_tag queries the tag of a wksp allocation specified by a
844 : cstr containing [name]:[gaddr]. Ignoring parsing and error trapping,
845 : this is a shorthand for:
846 :
847 : fd_wksp_t * wksp = fd_wksp_attach( name );
848 : ulong tag = fd_wksp_tag( wksp, gaddr );
849 : fd_wksp_detach( wksp );
850 :
851 : As such, if doing many queries on the same wksp, it is faster to do a
852 : fd_wksp_attach upfront, followed by the queries and then a
853 : fd_wksp_detach (and faster still to use the advanced APIs to further
854 : amortize the fd_wksp_attach / fd_wksp_detach calls.) */
855 :
856 : ulong
857 : fd_wksp_cstr_tag( char const * cstr );
858 :
859 : /* fd_wksp_cstr_memset memsets a wksp allocation specified by a cstr
860 : containing [name]:[gaddr] to c. Ignoring parsing and error trapping,
861 : equivalent to:
862 :
863 : fd_wksp_t * wksp = fd_wksp_attach( name );
864 : fd_wksp_memset( wksp, gaddr, c );
865 : fd_wksp_detach( wksp );
866 :
867 : As such, if doing many memsets in the same wksp, it is faster to do a
868 : fd_wksp_attach upfront, followed by the memsets and then a
869 : fd_wksp_detach (and faster still to use the advanced APIs to further
870 : amortize the fd_wksp_attach / fd_wksp_detach calls.) */
871 :
872 : void
873 : fd_wksp_cstr_memset( char const * cstr,
874 : int c );
875 :
876 : /* fd_wksp_map returns a pointer in the caller's address space to
877 : the wksp allocation specified by a cstr containing [name]:[gaddr].
878 : [name] is the name of the shared memory region holding the wksp.
879 : [gaddr] is converted to a number via fd_cstr_to_ulong that should
880 : correspond to a valid non-NULL global address in that wksp. Ignoring
881 : parsing, edge cases and error trapping, this is a shorthand for:
882 :
883 : fd_wksp_laddr( fd_wksp_attach( name ), gaddr )
884 :
885 : Returns non-NULL on success (the lifetime of the returned pointer
886 : will be until fd_wksp_unmap is called on it). Returns NULL and logs
887 : details on failure.
888 :
889 : fd_wksp_map is algorithmically efficient and reasonably low overhead
890 : (especially if this is not the first attachment to the wksp).
891 :
892 : TODO: consider const-correct variant? */
893 :
894 : void *
895 : fd_wksp_map( char const * cstr );
896 :
897 : /* fd_wksp_unmap unmaps a pointer returned by fd_wksp_map, logs details
898 : if anything weird is detected. Ignoring error trapping, this is a
899 : shorthand for:
900 :
901 : fd_wksp_detach( fd_wksp_containing( laddr ) )
902 :
903 : Undefined behavior if laddr is not currently mapped by fd_wksp_map.
904 : fd_wksp_unmap is not algorithmically efficient but practically still
905 : quite fast (especially if this is not the last attachment to wksp).
906 : This API can only be used on laddrs in wksp that are either named or
907 : anonymous workspaces. */
908 :
909 : void
910 : fd_wksp_unmap( void const * laddr );
911 :
912 : /* pod helper APIs ****************************************************/
913 :
914 : /* Ignoring error trapping, fd_wksp_pod_attach( cstr ) is shorthand
915 : for:
916 :
917 : fd_pod_join( fd_wksp_map( cstr ) )
918 :
919 : Cannot fail from the caller's point of view (will terminate the
920 : thread group of the caller with a detailed FD_LOG_ERR message on
921 : failure). Calls to fd_wksp_pod_attach should be paired with calls to
922 : fd_wksp_pod_detach when pod usage is done. */
923 :
924 : uchar const *
925 : fd_wksp_pod_attach( char const * cstr );
926 :
927 : /* Ignoring error trapping, fd_wksp_pod_detach( pod ) is shorthand for:
928 :
929 : fd_wksp_unmap( fd_pod_leave( pod ) )
930 :
931 : Provided for symmetry with fd_wksp_pod_attach. Cannot fail from the
932 : caller's point of view (will terminate the thread group of the caller
933 : with a detailed FD_LOG_ERR message on failure and will FD_LOG_WARNING
934 : if anything wonky occurs in the unmap under the hood). */
935 :
936 : void
937 : fd_wksp_pod_detach( uchar const * pod );
938 :
939 : /* Ignoring error trapping, fd_wksp_pod_map( pod, path ) is shorthand
940 : for:
941 :
942 : fd_wksp_map( fd_pod_query_cstr( pod, path, NULL ) )
943 :
944 : Cannot fail from the caller's point of view (will terminate the
945 : thread group of the caller with detailed FD_LOG_ERR message on
946 : failure). Calls to fd_wksp_pod_map should be paired with calls to
947 : fd_wksp_pod_unmap. */
948 :
949 : void *
950 : fd_wksp_pod_map( uchar const * pod,
951 : char const * path );
952 :
953 : /* Ignoring error trapping, fd_wksp_pod_unmap( obj ) is shorthand for:
954 :
955 : fd_wksp_unmap( obj )
956 :
957 : Provided for symmetry with fd_wksp_pod_map. Cannot fail from the
958 : caller's point of view (will terminate the thread group of the caller
959 : with a detailed FD_LOG_ERR message on failure and will FD_LOG_WARNING
960 : if anything wonky occurs in the unmap under the hood). */
961 :
962 : void
963 : fd_wksp_pod_unmap( void * obj );
964 :
965 : /* io APIs ************************************************************/
966 :
967 : /* fd_wksp_checkpt will write the wksp's state to a file. The file
968 : will be located at path with UNIX style permissions given by mode.
969 : style specifies the checkpt style and should be a
970 : FD_WKSP_CHECKPT_STYLE_* value or 0 (0 indicates to use
971 : FD_WKSP_CHECKPT_STYLE_DEFAULT). uinfo points to a cstr with optional
972 : additional user context (NULL will be treated as the empty string ""
973 : ... if the strlen is longer than 16384 bytes, the info will be
974 : truncated to a strlen of 16383).
975 :
976 : Returns FD_WKSP_SUCCESS (0) on success or a FD_WKSP_ERR_* on failure
977 : (logs details). Reasons for failure include INVAL (NULL wksp, NULL
978 : path, bad mode, unsupported style), CORRUPT (wksp memory corruption
979 : detected), FAIL (file already exists, I/O error). On failure, this
980 : will make a best effort to clean up after any partially written
981 : checkpt file. */
982 :
983 : int
984 : fd_wksp_checkpt( fd_wksp_t * wksp,
985 : char const * path,
986 : ulong mode,
987 : int style,
988 : char const * uinfo );
989 :
990 : /* fd_wksp_restore will replace all allocations in the current workspace
991 : with the allocations from the checkpt at path. The restored
992 : workspace will use the given seed.
993 :
994 : IMPORTANT! It is okay for wksp to have a different size, backing
995 : page sz and/or numa affinity than the original wksp. The only
996 : requirements are the wksp be able to support as many allocations as
997 : are in the checkpt and that these partitions can be restored to their
998 : original positions in wksp's global address space. If wksp has
999 : part_max in checkpt's [alloc_cnt,part_max] and a data_max>=checkpt's
1000 : data_max, this is guaranteed.
1001 :
1002 : Returns FD_WKSP_SUCCESS (0) on success or a FD_WKSP_ERR_* on failure
1003 : (logs details). Reasons for failure include INVAL (NULL wksp, NULL
1004 : path), FAIL or CORRUPT (couldn't open checkpt, I/O error, checkpt
1005 : format error, incompatible wksp for checkpt, etc ... logs details).
1006 : For the INVAL and FAIL cases, the original workspace allocations were
1007 : untouched. For the CORRUPT case, original workspace allocations were
1008 : removed because the checkpt issues were detected after the restore
1009 : process began (a best effort to reset wksp to the empty state was
1010 : done before return). */
1011 :
1012 : int
1013 : fd_wksp_restore( fd_wksp_t * wksp,
1014 : char const * path,
1015 : uint seed );
1016 :
1017 : /* fd_wksp_restore_preview extracts key parameters from a checkpoint
1018 : file. These can be used with fd_funk_new for a correct restore. */
1019 : int
1020 : fd_wksp_restore_preview( char const * path,
1021 : uint * out_seed,
1022 : ulong * out_part_max,
1023 : ulong * out_data_max );
1024 :
1025 : /* fd_wksp_mprotect marks all the memory in a workspace as read-only
1026 : (flag==1) or read-write (flag==0). Accessing read-only memory produces
1027 : a seg fault. */
1028 :
1029 : void
1030 : fd_wksp_mprotect( fd_wksp_t * wksp, int flag );
1031 :
1032 : FD_PROTOTYPES_END
1033 :
1034 : #endif /* HEADER_fd_src_util_wksp_fd_wksp_h */
|