Line data Source code
1 : #ifndef HEADER_fd_src_util_scratch_fd_scratch_h
2 : #define HEADER_fd_src_util_scratch_fd_scratch_h
3 :
4 : /* APIs for high performance scratch pad memory allocation. There
5 : are two allocators provided. One is fd_alloca, which is an alignment
6 : aware equivalent of alloca. It is meant for use anywhere alloca
7 : would normally be used. This is only available if the build target
8 : has the FD_HAS_ALLOCA capability. The second is fd_scratch_alloc.
9 : It is meant for use in situations that have very complex and large
10 : temporary memory usage. */
11 :
12 : #include "../sanitize/fd_sanitize.h"
13 : #include "../tile/fd_tile.h"
14 : #include "../valloc/fd_valloc.h"
15 :
16 : /* FD_SCRATCH_USE_HANDHOLDING: Define this to non-zero at compile time
17 : to turn on additional run-time checks. */
18 :
19 : #ifndef FD_SCRATCH_USE_HANDHOLDING
20 : #define FD_SCRATCH_USE_HANDHOLDING 0
21 : #endif
22 :
23 : /* FD_SCRATCH_ALIGN_DEFAULT is the default alignment to use for
24 : allocations.
25 :
26 : Default should be at least 16 for consistent cross platform behavior
27 : that is language conformant across a wide range of targets (i.e. the
28 : largest primitive type across all possible build targets ... practically
29 : sizeof(int128)). This also naturally covers SSE natural alignment on
30 : x86. 8 could be used if features like int128 and so forth are excluded
31 : and still be linguistically conformant (sizeof(ulong) here is the limit).
32 : Likewise, 32, 64, 128 could be used to guarantee all allocations will
33 : have natural AVX/AVX2, natural AVX-512 / cache-line,
34 : adjacent-cache-line-prefetch false sharing avoidance / natural GPU
35 : alignment properties.
36 :
37 : 128 for default was picked as double x86 cache line for ACLPF false
38 : sharing avoidance and for consistency with GPU warp sizes ... i.e.
39 : the default allocation behaviors are naturally interthread
40 : communication false sharing resistant and GPU friendly. This also
41 : naturally covers cases like SSE, AVX, AVX2 and AVX-512. */
42 :
43 16110093 : #define FD_SCRATCH_ALIGN_DEFAULT (128UL) /* integer power-of-2 >=16 */
44 :
45 : /* FD_SCRATCH_{SMEM,FMEM}_ALIGN give the alignment requirements for
46 : the memory regions used as a scratch pad memory. There are not many
47 : restrictions on the SMEM alignment practically other than it be a
48 : reasonable integer power of two. 128 was picked to harmonize with
49 : FD_SCRATCH_ALIGN_DEFAULT (which does have more technical motivations
50 : behind its choice) but this is not strictly required.
51 : FD_SCRATCH_FMEM_ALIGN is required to be sizeof(ulong). */
52 :
53 49158 : #define FD_SCRATCH_SMEM_ALIGN (128UL) /* integer power-of-2, harmonized with ALIGN_DEFAULT */
54 : #define FD_SCRATCH_FMEM_ALIGN (8UL) /* ==sizeof(ulong) but avoids bugs with some compilers */
55 :
56 : FD_PROTOTYPES_BEGIN
57 :
58 : /* Private APIs *******************************************************/
59 :
60 : #if FD_SCRATCH_USE_HANDHOLDING
61 : extern FD_TL int fd_scratch_in_prepare;
62 : #endif
63 :
64 : extern FD_TL ulong fd_scratch_private_start;
65 : extern FD_TL ulong fd_scratch_private_free;
66 : extern FD_TL ulong fd_scratch_private_stop;
67 :
68 : extern FD_TL ulong * fd_scratch_private_frame;
69 : extern FD_TL ulong fd_scratch_private_frame_cnt;
70 : extern FD_TL ulong fd_scratch_private_frame_max;
71 :
72 : FD_FN_CONST static inline int
73 2601757 : fd_scratch_private_align_is_valid( ulong align ) {
74 2601757 : return !(align & (align-1UL)); /* returns true if power of 2 or zero, compile time typically */
75 2601757 : }
76 :
77 : FD_FN_CONST static inline ulong
78 15356252 : fd_scratch_private_true_align( ulong align ) {
79 15356252 : return fd_ulong_if( !align, FD_SCRATCH_ALIGN_DEFAULT, align ); /* compile time typically */
80 15356252 : }
81 :
82 : /* Public APIs ********************************************************/
83 :
84 : /* Constructor APIs */
85 :
86 : /* fd_scratch_smem_{align,footprint} return the alignment and footprint
87 : of a memory region suitable for use as a scratch pad memory that can
88 : hold up to smax bytes. There are very few restrictions on the nature
89 : of this memory. It could even be just a flat address space that is
90 : not backed by an actual physical memory as far as scratch is
91 : concerned. In typical use cases though, the scratch pad memory
92 : should point to a region of huge or gigantic page backed memory on
93 : the caller's numa node.
94 :
95 : A shared memory region is fine for smem. This could be used
96 : for example to allow other threads / processes to access a scratch
97 : allocation from this thread for the lifetime of a scratch allocation.
98 :
99 : Even more generally, a shared memory region for both smem and fmem
100 : could make it theoretically possible to have a scratch pad memory
101 : that is shared across multiple threads / processes. The API is not
102 : well designed for such though (the main reason to use fmem in shared
103 : memory would be convenience and/or adding hot swapping
104 : functionality). In the common scratch scenario, every thread would
105 : attach to their local join of the shared smem and shared fmem. But
106 : since the operations below are not designed to be thread safe, the
107 : threads would have to protect against concurrent use of push and pop
108 : (and attach would probably need to be tweaked to make it easier to
109 : attach to an already in use scratch pad).
110 :
111 : Compile time allocation is possible via the FD_SCRATCH_SMEM_ALIGN
112 : define. E.g.:
113 :
114 : uchar my_smem[ MY_SMAX ] __attribute__((aligned(FD_SCRATCH_SMEM_ALIGN)));
115 :
116 : will be valid to use as a scratch smem with space for up to MY_SMAX
117 : bytes. */
118 :
119 0 : FD_FN_CONST static inline ulong fd_scratch_smem_align( void ) { return FD_SCRATCH_SMEM_ALIGN; }
120 :
121 : FD_FN_CONST static inline ulong
122 49155 : fd_scratch_smem_footprint( ulong smax ) {
123 49155 : return fd_ulong_align_up( smax, FD_SCRATCH_SMEM_ALIGN );
124 49155 : }
125 :
126 : /* fd_scratch_fmem_{align,footprint} return the alignment and footprint
127 : of a memory region suitable for holding the scratch pad memory
128 : metadata (typically very small). The scratch pad memory will be
129 : capable of holding up to depth scratch frames.
130 :
131 : Compile time allocation is possible via the FD_SCRATCH_FMEM_ALIGN
132 : define. E.g.
133 :
134 : ulong my_fmem[ MY_DEPTH ] __attribute__((aligned(FD_SCRATCH_FMEM_ALIGN)));
135 :
136 : or, even simpler:
137 :
138 : ulong my_fmem[ MY_DEPTH ];
139 :
140 : will be valid to use as a scratch fmem with space for up to depth
141 : frames. The attribute variant is not strictly necessary, just for
142 : consistency with the smem above (where it is required). */
143 :
144 0 : FD_FN_CONST static inline ulong fd_scratch_fmem_align ( void ) { return sizeof(ulong); }
145 51 : FD_FN_CONST static inline ulong fd_scratch_fmem_footprint( ulong depth ) { return sizeof(ulong)*depth; }
146 :
147 : /* fd_scratch_attach attaches the calling thread to memory regions
148 : sufficient to hold up to smax (positive) bytes and with up to depth
149 : (positive) frames. smem/fmem should have the required alignment and
150 : footprint specified for smax/depth from the above and be non-NULL.
151 : The caller has a read/write interest in these regions while attached
152 : (and thus the local lifetime of these regions must cover the lifetime
153 : of the attachment). Only one scratch pad memory may be attached to a
154 : caller at a time. This cannot fail from the caller's point of view
155 : (if handholding is enabled, it will abort the caller with a
156 : descriptive error message if used obviously in error). */
157 :
158 : static inline void
159 : fd_scratch_attach( void * smem,
160 : void * fmem,
161 : ulong smax,
162 24173 : ulong depth ) {
163 :
164 : # if FD_SCRATCH_USE_HANDHOLDING
165 87 : if( FD_UNLIKELY( fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "already attached" ));
166 87 : if( FD_UNLIKELY( !smem ) ) FD_LOG_ERR(( "bad smem" ));
167 87 : if( FD_UNLIKELY( !fmem ) ) FD_LOG_ERR(( "bad fmem" ));
168 87 : if( FD_UNLIKELY( !smax ) ) FD_LOG_ERR(( "bad smax" ));
169 87 : if( FD_UNLIKELY( !depth ) ) FD_LOG_ERR(( "bad depth" ));
170 87 : fd_scratch_in_prepare = 0;
171 87 : # endif
172 :
173 87 : fd_scratch_private_start = (ulong)smem;
174 87 : fd_scratch_private_free = fd_scratch_private_start;
175 87 : fd_scratch_private_stop = fd_scratch_private_start + smax;
176 :
177 87 : fd_scratch_private_frame = (ulong *)fmem;
178 87 : fd_scratch_private_frame_cnt = 0UL;
179 87 : fd_scratch_private_frame_max = depth;
180 :
181 : # if FD_HAS_DEEPASAN
182 : /* Poison the entire smem region. Underpoison the boundaries to respect
183 : alignment requirements. */
184 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
185 : ulong aligned_end = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
186 : fd_asan_poison( (void*)aligned_start, aligned_end - aligned_start );
187 : # endif
188 87 : }
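: /* For example, a minimal sketch of the typical lifecycle, using
: compile time allocation (MY_SMAX and MY_DEPTH are hypothetical
: sizing constants):
:
: static FD_TL uchar my_smem[ MY_SMAX ] __attribute__((aligned(FD_SCRATCH_SMEM_ALIGN)));
: static FD_TL ulong my_fmem[ MY_DEPTH ];
:
: fd_scratch_attach( my_smem, my_fmem, MY_SMAX, MY_DEPTH );
: ... push / alloc / pop as needed ...
: fd_scratch_detach( NULL );
: */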
189 :
190 : /* fd_scratch_detach detaches the calling thread from its current
191 : attachment. Returns smem used on attach and, if opt_fmem is
192 : non-NULL, opt_fmem[0] will contain the fmem used on attach on return.
193 :
194 : This relinquishes the calling threads read/write interest on these
195 : memory regions. All the caller's scratch frames are popped, any
196 : prepare in progress is canceled and all the caller's scratch
197 : allocations are freed implicitly by this.
198 :
199 : This cannot fail from the caller's point of view (if handholding is
200 : enabled, it will abort the caller with a descriptive error message if
201 : used obviously in error). */
202 :
203 : static inline void *
204 24097 : fd_scratch_detach( void ** _opt_fmem ) {
205 :
206 : # if FD_SCRATCH_USE_HANDHOLDING
207 0 : if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached" ));
208 0 : fd_scratch_in_prepare = 0;
209 0 : # endif
210 :
211 : # if FD_HAS_DEEPASAN
212 : /* Unpoison the entire scratch space. There should now be an underlying
213 : allocation which has not been poisoned. */
214 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
215 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
216 : fd_asan_unpoison( (void*)aligned_start, aligned_stop - aligned_start );
217 : # endif
218 :
219 0 : void * smem = (void *)fd_scratch_private_start;
220 0 : void * fmem = (void *)fd_scratch_private_frame;
221 :
222 0 : fd_scratch_private_start = 0UL;
223 0 : fd_scratch_private_free = 0UL;
224 0 : fd_scratch_private_stop = 0UL;
225 :
226 0 : fd_scratch_private_frame = NULL;
227 0 : fd_scratch_private_frame_cnt = 0UL;
228 0 : fd_scratch_private_frame_max = 0UL;
229 :
230 24097 : if( _opt_fmem ) _opt_fmem[0] = fmem;
231 0 : return smem;
232 24097 : }
233 :
234 : /* User APIs */
235 :
236 : /* fd_scratch_{used,free} returns the number of bytes used/free in the
237 : caller's scratch. Returns 0 if not attached. Because of alignment
238 : overheads, an allocation is guaranteed to succeed if free>=sz+align-1
239 : where align is the actual alignment required for the allocation (e.g.
240 : align==0 -> default, align<min -> min). It is guaranteed to fail if
241 : free<sz. It might succeed or fail in between depending on the
242 : alignments of previous allocations. These are freaky fast (O(3)
243 : fast asm operations under the hood). */
244 :
245 9 : static inline ulong fd_scratch_used( void ) { return fd_scratch_private_free - fd_scratch_private_start; }
246 24045 : static inline ulong fd_scratch_free( void ) { return fd_scratch_private_stop - fd_scratch_private_free; }
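: /* E.g. given the guarantee above, a caller can conservatively test
: headroom before allocating (a sketch; align is the actual alignment,
: i.e. 0 already mapped to the default, and sz+align-1 is assumed not
: to overflow):
:
: if( FD_LIKELY( fd_scratch_free() >= sz+align-1UL ) ) {
: ... fd_scratch_alloc( align, sz ) is guaranteed to succeed ...
: }
:
: fd_scratch_alloc_is_safe below does a precise version of this test. */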
247 :
248 : /* fd_scratch_frame_{used,free} returns the number of scratch frames
249 : used/free in the caller's scratch. Returns 0 if not attached. push
250 : is guaranteed to succeed if free is non-zero and guaranteed to fail
251 : otherwise. pop is guaranteed to succeed if used is non-zero and
252 : guaranteed to fail otherwise. These are freaky fast (O(1-3) fast asm
253 : operations under the hood). */
254 :
255 3040386 : static inline ulong fd_scratch_frame_used( void ) { return fd_scratch_private_frame_cnt; }
256 2999381 : static inline ulong fd_scratch_frame_free( void ) { return fd_scratch_private_frame_max - fd_scratch_private_frame_cnt; }
257 :
258 : /* fd_scratch_reset frees all allocations (if any) and pops all scratch
259 : frames (if any) such that the caller's scratch will be in the same
260 : state it was immediately after attach. The caller must be attached
261 : to a scratch memory to use. This cannot fail from the caller's point
262 : of view (if handholding is enabled, it will abort the caller with a
263 : descriptive error message if used obviously in error). This is
264 : freaky fast (O(3) fast asm operations under the hood). */
265 :
266 : static inline void
267 730 : fd_scratch_reset( void ) {
268 : # if FD_SCRATCH_USE_HANDHOLDING
269 : if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached" ));
270 : fd_scratch_in_prepare = 0;
271 : # endif
272 730 : fd_scratch_private_free = fd_scratch_private_start;
273 730 : fd_scratch_private_frame_cnt = 0UL;
274 :
275 : # if FD_HAS_DEEPASAN
276 : /* Poison entire scratch space again. */
277 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
278 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
279 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
280 : # endif
281 730 : }
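: /* E.g. a per-event reuse idiom (sketch): reset at the top of each
: iteration so one iteration's usage does not accumulate into the next:
:
: fd_scratch_attach( smem, fmem, smax, depth );
: for(;;) {
: fd_scratch_reset();
: fd_scratch_push();
: ... process one event with scratch allocations ...
: }
: */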
282 :
283 : /* fd_scratch_push creates a new scratch frame and makes it the current
284 : frame. Assumes caller is attached to a scratch with space for a new
285 : frame. This cannot fail from the caller's point of view (if
286 : handholding is enabled, it will abort the caller with a descriptive
287 : error message if used obviously in error). This is freaky fast (O(5)
288 : fast asm operations under the hood). */
289 :
290 : FD_FN_UNUSED static void /* Work around -Winline */
291 14298014 : fd_scratch_push( void ) {
292 : # if FD_SCRATCH_USE_HANDHOLDING
293 90537 : if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) {
294 0 : FD_LOG_ERR(( "not attached" ));
295 0 : }
296 90537 : if( FD_UNLIKELY( fd_scratch_private_frame_cnt>=fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "too many frames" ));
297 90537 : fd_scratch_in_prepare = 0;
298 90537 : # endif
299 90537 : fd_scratch_private_frame[ fd_scratch_private_frame_cnt++ ] = fd_scratch_private_free;
300 :
301 : # if FD_HAS_DEEPASAN
302 : /* Poison to the end of the scratch region to account for the case of
303 : an in-progress prepare getting implicitly cancelled. */
304 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_ASAN_ALIGN );
305 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
306 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
307 : # endif
308 90537 : }
309 :
310 : /* fd_scratch_pop frees all allocations in the current scratch frame,
311 : destroys the current scratch frame and makes the previous frame (if
312 : there is one) the current frame (and leaves the caller without
313 : a current frame if there is not one). Assumes the caller is attached
314 : to a scratch memory with at least one frame in use. This cannot fail
315 : from the caller's point of view (if handholding is enabled, it will
316 : abort the caller with a descriptive error message if used obviously
317 : in error). This is freaky fast (O(5) fast asm operations under the
318 : hood). */
319 :
320 : FD_FN_UNUSED static void /* Work around -Winline */
321 14744082 : fd_scratch_pop( void ) {
322 : # if FD_SCRATCH_USE_HANDHOLDING
323 90537 : if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached" ));
324 90537 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) FD_LOG_ERR(( "unmatched pop" ));
325 90537 : fd_scratch_in_prepare = 0;
326 90537 : # endif
327 90537 : fd_scratch_private_free = fd_scratch_private_frame[ --fd_scratch_private_frame_cnt ];
328 :
329 : # if FD_HAS_DEEPASAN
330 : /* On a pop() operation, the entire range from fd_scratch_private_free to the
331 : end of the scratch space can be safely poisoned. The region must be aligned
332 : to accommodate asan manual poisoning requirements. */
333 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_ASAN_ALIGN );
334 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
335 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
336 : # endif
337 90537 : }
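: /* The canonical usage idiom is a matched push / pop bracketing a
: region of complex temporary allocation (sketch; mytype_t and cnt are
: hypothetical):
:
: fd_scratch_push();
:
: mytype_t * tmp = (mytype_t *)fd_scratch_alloc( alignof(mytype_t), cnt*sizeof(mytype_t) );
: ... use tmp freely, allocate more as needed ...
:
: fd_scratch_pop();
:
: ... tmp and everything else allocated in the frame is now freed ...
: */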
338 :
339 : /* fd_scratch_prepare starts an allocation of unknown size and known
340 : alignment align (0 means use default alignment) in the caller's
341 : current scratch frame. Returns a pointer in the caller's address
342 : space with alignment align to the first byte of a region with
343 : fd_scratch_free() (as observed after this function returns) bytes
344 : available. The caller is free to clobber any bytes in this region.
345 :
346 : fd_scratch_publish finishes an in-progress allocation. end points at
347 : the first byte after the final allocation. Assumes there is a
348 : matching prepare. A published allocation can be subsequently
349 : trimmed.
350 :
351 : fd_scratch_cancel cancels an in-progress allocation. This is a no-op
352 : if there is no matching prepare. If the prepare had alignment other
353 : than 1, it is possible that some alignment padding needed for the
354 : allocation will still be used in the caller's current scratch frame.
355 : If this is not acceptable, the prepare should use an alignment of 1
356 : and manually align the return.
357 :
358 : This allows idioms like:
359 :
360 : uchar * p = (uchar *)fd_scratch_prepare( align );
361 :
362 : if( FD_UNLIKELY( fd_scratch_free() < app_max_sz ) ) {
363 :
364 : fd_scratch_cancel();
365 :
366 : ... handle too little scratch space to handle application
367 : ... worst case needs here
368 :
369 : } else {
370 :
371 : ... populate sz bytes to p where sz is in [0,app_max_sz]
372 : p += sz;
373 :
374 : fd_scratch_publish( p );
375 :
376 : ... at this point, scratch is as though
377 : ... fd_scratch_alloc( align, sz ) was called above
378 :
379 : }
380 :
381 : Ideally, every prepare should be matched with a publish or a cancel;
382 : only one prepare can be in progress at a time on a thread and prepares
383 : cannot be nested. As such, virtually all other scratch operations
384 : will implicitly cancel any in-progress prepare, including attach /
385 : detach / push / pop / prepare / alloc / trim. */
386 :
387 : FD_FN_UNUSED static void * /* Work around -Winline */
388 12754187 : fd_scratch_prepare( ulong align ) {
389 :
390 : # if FD_SCRATCH_USE_HANDHOLDING
391 114132 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) FD_LOG_ERR(( "unmatched push" ));
392 114132 : if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) FD_LOG_ERR(( "bad align (%lu)", align ));
393 114132 : # endif
394 :
395 : # if FD_HAS_DEEPASAN
396 : /* Need 8 byte alignment. */
397 : align = fd_ulong_align_up( align, FD_ASAN_ALIGN );
398 : # endif
399 114132 : ulong true_align = fd_scratch_private_true_align( align );
400 114132 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align );
401 :
402 : # if FD_SCRATCH_USE_HANDHOLDING
403 114132 : if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) FD_LOG_ERR(( "prepare align (%lu) overflow", true_align ));
404 114132 : if( FD_UNLIKELY( smem > fd_scratch_private_stop ) ) FD_LOG_ERR(( "prepare align (%lu) needs %lu additional scratch",
405 114132 : align, smem - fd_scratch_private_stop ));
406 114132 : fd_scratch_in_prepare = 1;
407 114132 : # endif
408 :
409 : # if FD_HAS_DEEPASAN
410 : /* At this point the user is able to clobber any bytes in the region. smem is
411 : always going to be at least 8 byte aligned. */
412 : ulong aligned_sz = fd_ulong_align_up( fd_scratch_private_stop - smem, FD_ASAN_ALIGN );
413 : fd_asan_unpoison( (void*)smem, aligned_sz );
414 : # endif
415 :
416 114132 : fd_scratch_private_free = smem;
417 114132 : return (void *)smem;
418 114132 : }
419 :
420 : static inline void
421 12539607 : fd_scratch_publish( void * _end ) {
422 12539607 : ulong end = (ulong)_end;
423 :
424 : # if FD_SCRATCH_USE_HANDHOLDING
425 114132 : if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) FD_LOG_ERR(( "unmatched prepare" ));
426 114132 : if( FD_UNLIKELY( end < fd_scratch_private_free ) ) FD_LOG_ERR(( "publish underflow" ));
427 114132 : if( FD_UNLIKELY( end > fd_scratch_private_stop ) )
428 0 : FD_LOG_ERR(( "publish needs %lu additional scratch", end-fd_scratch_private_stop ));
429 114132 : fd_scratch_in_prepare = 0;
430 114132 : # endif
431 :
432 : # if FD_HAS_DEEPASAN
433 : /* Poison everything that is trimmed off. Conservatively poison potentially
434 : less than the region that is trimmed to respect alignment requirements. */
435 : ulong aligned_end = fd_ulong_align_up( end, FD_ASAN_ALIGN );
436 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
437 : fd_asan_poison( (void*)aligned_end, aligned_stop - aligned_end );
438 : # endif
439 :
440 114132 : fd_scratch_private_free = end;
441 114132 : }
442 :
443 : static inline void
444 188723 : fd_scratch_cancel( void ) {
445 :
446 : # if FD_SCRATCH_USE_HANDHOLDING
447 : if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) FD_LOG_ERR(( "unmatched prepare" ));
448 : fd_scratch_in_prepare = 0;
449 : # endif
450 :
451 188723 : }
452 :
453 : /* fd_scratch_alloc allocates sz bytes with alignment align in the
454 : caller's current scratch frame. There should be no prepare in
455 : progress. Note that this has the same function signature as
456 : aligned_alloc (and not by accident). It does have some less
457 : restrictive behaviors though.
458 :
459 : align must be 0 or an integer power of 2. 0 will be treated as
460 : FD_SCRATCH_ALIGN_DEFAULT.
461 :
462 : sz need not be a multiple of align. Further, the underlying
463 : allocator does not implicitly round up sz to an align multiple (as
464 : such, scratch can allocate additional items in any tail padding that
465 : might have been implicitly reserved had it rounded up). That is, if
466 : you really want to round up allocations to a multiple of align, then
467 : manually align up sz ... e.g. pass fd_ulong_align_up(sz,align) when
468 : align is non-zero to this call (this could be implemented as a
469 : compile time mode with some small extra overhead if desirable).
470 :
471 : sz 0 is fine. This will currently return a properly aligned non-NULL
472 : pointer (the allocator might do some allocation under the hood to get
473 : the desired alignment and it is possible this might fail ... there is
474 : a case for returning NULL or an arbitrary but appropriately aligned
475 : non-NULL and this could be implemented as a compile time mode with
476 : some small extra overhead if desirable).
477 :
478 : This cannot fail from the caller's point of view (if handholding is
479 : enabled, it will abort the caller with a descriptive error message if
480 : used obviously in error).
481 :
482 : This is freaky fast (O(5) fast asm operations under the hood). */
483 :
484 : FD_FN_UNUSED static void * /* Work around -Winline */
485 : fd_scratch_alloc( ulong align,
486 12182513 : ulong sz ) {
487 12182513 : ulong smem = (ulong)fd_scratch_prepare( align );
488 12182513 : ulong end = smem + sz;
489 :
490 : # if FD_SCRATCH_USE_HANDHOLDING
491 114132 : if( FD_UNLIKELY( (end < smem) | (end > fd_scratch_private_stop) ) ) FD_LOG_ERR(( "sz (%lu) overflow", sz ));
492 114132 : # endif
493 :
494 114132 : fd_scratch_publish( (void *)end );
495 114132 : return (void *)smem;
496 12182513 : }
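: /* E.g. to get aligned_alloc style rounded-up sizing as described
: above (a sketch, align non-zero):
:
: void * p = fd_scratch_alloc( align, fd_ulong_align_up( sz, align ) );
: */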
497 :
498 : /* fd_scratch_trim trims the size of the most recent scratch allocation
499 : in the current scratch frame (technically it can be used to trim the
500 : size of the entire current scratch frame but doing more than the most
501 : recent scratch allocation is strongly discouraged). Assumes there is
502 : a current scratch frame and the caller is not in a prepare. end
503 : points at the first byte to free in the most recent scratch
504 : allocation (or the first byte after the most recent scratch
505 : allocation). This allows idioms like:
506 :
507 : uchar * p = (uchar *)fd_scratch_alloc( align, max_sz );
508 :
509 : ... populate sz bytes of p where sz is in [0,max_sz]
510 : p += sz;
511 :
512 : fd_scratch_trim( p );
513 :
514 : ... now the thread's scratch is as though original call was
515 : ... p = fd_scratch_alloc( align, sz );
516 :
517 : This cannot fail from the caller's point of view (if handholding is
518 : enabled, this will abort the caller with a descriptive error message
519 : if used obviously in error).
520 :
521 : Note that an allocation can be repeatedly trimmed.
522 :
523 : Note also that trim can nest. E.g. a thread can call a function that
524 : uses scratch with its own properly matched scratch pushes and pops.
525 : On function return, trim will still work on the most recent scratch
526 : alloc in that frame by the caller.
527 :
528 : This is freaky fast (O(1) fast asm operations under the hood). */
529 :
530 : static inline void
531 753841 : fd_scratch_trim( void * _end ) {
532 753841 : ulong end = (ulong)_end;
533 :
534 : # if FD_SCRATCH_USE_HANDHOLDING
535 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) FD_LOG_ERR(( "unmatched push" ));
536 : if( FD_UNLIKELY( end < fd_scratch_private_frame[ fd_scratch_private_frame_cnt-1UL ] ) ) FD_LOG_ERR(( "trim underflow" ));
537 : if( FD_UNLIKELY( end > fd_scratch_private_free ) ) FD_LOG_ERR(( "trim overflow" ));
538 : fd_scratch_in_prepare = 0;
539 : # endif
540 :
541 : # if FD_HAS_DEEPASAN
542 : /* The region to poison should be from _end to the end of the scratch's region.
543 : The same alignment considerations need to be taken into account. */
544 : ulong aligned_end = fd_ulong_align_up( end, FD_ASAN_ALIGN );
545 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
546 : fd_asan_poison( (void*)aligned_end, aligned_stop - aligned_end );
547 : # endif
548 :
549 753841 : fd_scratch_private_free = end;
550 753841 : }
551 :
552 : /* fd_scratch_*_is_safe returns false (0) if the operation is obviously
553 : unsafe to do at the time of the call or true otherwise.
554 : Specifically:
555 :
556 : fd_scratch_attach_is_safe() returns 1 if the calling thread is not
557 : already attached to scratch.
558 :
559 : fd_scratch_detach_is_safe() returns 1 if the calling thread is
560 : already attached to scratch.
561 :
562 : fd_scratch_reset_is_safe() returns 1 if the calling thread is already
563 : attached to scratch.
564 :
565 : fd_scratch_push_is_safe() returns 1 if there is at least one frame
566 : available and 0 otherwise.
567 :
568 : fd_scratch_pop_is_safe() returns 1 if there is at least one frame
569 : in use and 0 otherwise.
570 :
571 : fd_scratch_prepare_is_safe( align ) returns 1 if there is a current
572 : frame for the allocation and enough scratch pad memory to start
573 : preparing an allocation with alignment align.
574 :
575 : fd_scratch_publish_is_safe( end ) returns 1 if end is a valid
576 : location to complete an allocation in preparation. If handholding is
577 : enabled, will additionally check that there is a prepare already in
578 : progress.
579 :
580 : fd_scratch_cancel_is_safe() returns 1.
581 :
582 : fd_scratch_alloc_is_safe( align, sz ) returns 1 if there is a current
583 : frame for the allocation and enough scratch pad memory for an
584 : allocation with alignment align and size sz.
585 :
586 : fd_scratch_trim_is_safe( end ) returns 1 if there is a current frame
587 : and that current frame can be trimmed to end safely.
588 :
589 : These are safe to call at any time and are also freaky fast (a
590 : handful of fast asm operations under the hood). */
591 :
592 0 : FD_FN_PURE static inline int fd_scratch_attach_is_safe( void ) { return !fd_scratch_private_frame_max; }
593 0 : FD_FN_PURE static inline int fd_scratch_detach_is_safe( void ) { return !!fd_scratch_private_frame_max; }
594 0 : FD_FN_PURE static inline int fd_scratch_reset_is_safe ( void ) { return !!fd_scratch_private_frame_max; }
595 5998546 : FD_FN_PURE static inline int fd_scratch_push_is_safe ( void ) { return fd_scratch_private_frame_cnt<fd_scratch_private_frame_max; }
596 5907812 : FD_FN_PURE static inline int fd_scratch_pop_is_safe ( void ) { return !!fd_scratch_private_frame_cnt; }
597 :
598 : FD_FN_PURE static inline int
599 24036 : fd_scratch_prepare_is_safe( ulong align ) {
600 24036 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
601 24036 : if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) return 0; /* Bad alignment, compile time typically */
602 24036 : ulong true_align = fd_scratch_private_true_align( align ); /* compile time typically */
603 24036 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align );
604 24036 : if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) return 0; /* alignment overflow */
605 24036 : if( FD_UNLIKELY( smem > fd_scratch_private_stop ) ) return 0; /* insufficient scratch */
606 24036 : return 1;
607 24036 : }
608 :
609 : FD_FN_PURE static inline int
610 0 : fd_scratch_publish_is_safe( void * _end ) {
611 0 : ulong end = (ulong)_end;
612 0 : # if FD_SCRATCH_USE_HANDHOLDING
613 0 : if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) return 0; /* Not in prepare */
614 0 : # endif
615 0 : if( FD_UNLIKELY( end < fd_scratch_private_free ) ) return 0; /* Backward */
616 0 : if( FD_UNLIKELY( end > fd_scratch_private_stop ) ) return 0; /* Out of bounds */
617 0 : return 1;
618 0 : }
619 :
620 : FD_FN_CONST static inline int
621 0 : fd_scratch_cancel_is_safe( void ) {
622 0 : return 1;
623 0 : }
624 :
625 : FD_FN_PURE static inline int
626 : fd_scratch_alloc_is_safe( ulong align,
627 2913313 : ulong sz ) {
628 2913313 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
629 2601733 : if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) return 0; /* Bad align, compile time typically */
630 2601733 : ulong true_align = fd_scratch_private_true_align( align ); /* compile time typically */
631 2601733 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align );
632 2601733 : if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) return 0; /* align overflow */
633 2601733 : ulong free = smem + sz;
634 2601733 : if( FD_UNLIKELY( free < smem ) ) return 0; /* sz overflow */
635 2601733 : if( FD_UNLIKELY( free > fd_scratch_private_stop ) ) return 0; /* too little space */
636 753841 : return 1;
637 2601733 : }
638 :
639 : FD_FN_PURE static inline int
640 0 : fd_scratch_trim_is_safe( void * _end ) {
641 0 : ulong end = (ulong)_end;
642 0 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
643 0 : if( FD_UNLIKELY( end < fd_scratch_private_frame[ fd_scratch_private_frame_cnt-1UL ] ) ) return 0; /* Trim underflow */
644 0 : if( FD_UNLIKELY( end > fd_scratch_private_free ) ) return 0; /* Trim overflow */
645 0 : return 1;
646 0 : }
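: /* E.g. a graceful degradation idiom (sketch): fall back to a slow
: path instead of aborting when scratch headroom is insufficient:
:
: if( FD_LIKELY( fd_scratch_alloc_is_safe( align, sz ) ) ) {
: void * tmp = fd_scratch_alloc( align, sz );
: ... fast path using tmp ...
: } else {
: ... handle too little scratch (e.g. smaller blocks, slow path) ...
: }
: */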
647 :
648 : /* fd_scratch_vtable is the virtual function table implementing
649 : fd_valloc for fd_scratch. */
650 :
651 : extern const fd_valloc_vtable_t fd_scratch_vtable;
652 :
653 : /* fd_scratch_virtual returns an abstract handle to the fd_scratch join.
654 : Valid for lifetime of scratch frame. fd_valloc_t must be dropped
655 : before scratch frame changes or scratch detaches. */
656 :
657 : FD_FN_CONST static inline fd_valloc_t
658 0 : fd_scratch_virtual( void ) {
659 0 : fd_valloc_t valloc = { NULL, &fd_scratch_vtable };
660 0 : return valloc;
661 0 : }
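: /* E.g. passing scratch to code written against the fd_valloc
: abstraction (a sketch, assuming a scratch frame is current; for
: scratch, fd_valloc_free is effectively a no-op and the allocation is
: reclaimed when the frame is popped):
:
: fd_valloc_t valloc = fd_scratch_virtual();
: void * mem = fd_valloc_malloc( valloc, align, sz );
: ... use mem ...
: fd_valloc_free( valloc, mem );
: */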
662 :
663 : /* FD_SCRATCH_SCOPE_{BEGIN,END} create a `do { ... } while(0);` scope in
664 : which a temporary scratch frame is available. Nested scopes are
665 : permitted. This scratch frame is automatically destroyed when
666 : exiting the scope normally (e.g. by 'break', 'return', or reaching
667 : the end). Uses a dummy variable with a cleanup attribute under the
668 : hood. U.B. if scope is left abnormally (e.g. longjmp(), exception,
669 : abort(), etc.). Use as follows:
670 :
671 : FD_SCRATCH_SCOPE_BEGIN {
672 : ...
673 : fd_scratch_alloc( ... );
674 : ...
675 : }
676 : FD_SCRATCH_SCOPE_END; */
677 :
678 : FD_FN_UNUSED static inline void
679 428141 : fd_scratch_scoped_pop_private( void * _unused ) {
680 428141 : (void)_unused;
681 428141 : fd_scratch_pop();
682 428141 : }
683 :
684 428141 : #define FD_SCRATCH_SCOPE_BEGIN do { \
685 428141 : fd_scratch_push(); \
686 428141 : int __fd_scratch_guard_ ## __LINE__ \
687 428141 : __attribute__((cleanup(fd_scratch_scoped_pop_private))) \
688 428141 : __attribute__((unused)) = 0; \
689 428141 : do
690 :
691 428141 : #define FD_SCRATCH_SCOPE_END while(0); } while(0)
692 :
693 : /* fd_alloca is a variant of alloca that works like aligned_alloc. That
694 : is, it returns an allocation of sz bytes with an alignment of at
695 : least align. Like alloca, this allocation will be in the stack frame
696 : of the calling function, with a lifetime that lasts until the function
697 : returns. Stack overflow handling is likewise identical to alloca
698 : (stack overflows will overlap the top stack guard, typically
699 : triggering a seg fault when the overflow region is touched that will
700 : be caught and handled by the logger to terminate the calling thread
701 : group). As such, like alloca, these really should only be used for
702 : smallish (<< few KiB) quick allocations in bounded recursion depth
703 : circumstances.
704 :
705 : Like fd_scratch_alloc, align must be 0 or an integer
706 : power of 2. 0 will be treated as align_default. align smaller than
707 : align_min will be bumped up to align_min.
708 :
709 : The caller promises the request will not overflow the stack. This has to
710 : be implemented as a macro for linguistic reasons and align should be
711 : safe against multiple evaluation and, due to compiler limitations,
712 : must be a compile time constant. Returns non-NULL on success and
713 : NULL on failure (in most situations, can never fail from the caller's
714 : POV). sz==0 is okay (and will return non-NULL). */
715 :
716 : #if FD_HAS_ALLOCA
717 :
718 : /* Work around compiler limitations */
719 843612 : #define FD_SCRATCH_PRIVATE_TRUE_ALIGN( align ) ((align) ? (align) : FD_SCRATCH_ALIGN_DEFAULT)
720 :
721 837105 : #define fd_alloca(align,sz) __builtin_alloca_with_align( fd_ulong_max( (sz), 1UL ), \
722 837105 : 8UL*FD_SCRATCH_PRIVATE_TRUE_ALIGN( (align) ) /*bits*/ )
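: /* E.g. (a sketch): a small bounded lifetime temp with SSE friendly
: alignment, freed automatically when the calling function returns:
:
: float * tmp = (float *)fd_alloca( 16UL, cnt*sizeof(float) );
: ... use tmp; cnt assumed smallish and bounded ...
: */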
723 :
724 : /* fd_alloca_check does fd_alloca but it will FD_LOG_CRIT with a
725 : detailed message if the request would cause a stack overflow or leave
726 : so little available free stack that subsequent normal thread
727 : operations would be at risk.
728 :
729 : Note that returning NULL on failure is not an option as this would no
730 : longer be a drop-in instrumented replacement for fd_alloca (this
731 : would also require even more linguistic hacks to keep the fd_alloca
732 : at the appropriate scope). Likewise, testing the allocated region is
733 : within the stack post allocation is not an option as the FD_LOG_CRIT
734 : invocation would then try to use stack with the already overflowed
735 : allocation in it (there is no easy portable way to guarantee an
736 : alloca has been freed short of returning from the function in which
737 : the alloca was performed). Using FD_LOG_ERR instead of FD_LOG_CRIT
738 : is a potentially viable alternative error handling behavior though.
739 :
740 : This has to be implemented as a macro for linguistic reasons. It is
741 : recommended this only be used for development / debugging / testing
742 : purposes (e.g. if you are doing alloca in production that are large
743 : enough you are worried about stack overflow, you probably should be
744 : using fd_scratch, fd_alloc or fd_wksp depending on performance and
745 : persistence needs or, better still, architecting to not need any
746 : temporary memory allocations at all). If the caller's stack
747 : diagnostics could not be successfully initialized (this is logged),
748 : this will always FD_LOG_CRIT. */
749 :
750 : #if !FD_HAS_ASAN
751 :
752 : extern FD_TL ulong fd_alloca_check_private_sz;
753 :
754 : #define fd_alloca_check( align, sz ) \
755 6507 : ( fd_alloca_check_private_sz = (sz), \
756 6507 : (__extension__({ \
757 6507 : ulong _fd_alloca_check_private_pad_max = FD_SCRATCH_PRIVATE_TRUE_ALIGN( (align) ) - 1UL; \
758 6507 : ulong _fd_alloca_check_private_footprint = fd_alloca_check_private_sz + _fd_alloca_check_private_pad_max; \
759 6507 : if( FD_UNLIKELY( (_fd_alloca_check_private_footprint < _fd_alloca_check_private_pad_max ) | \
760 6507 : (_fd_alloca_check_private_footprint > (31UL*(fd_tile_stack_est_free() >> 5))) ) ) \
761 6507 : FD_LOG_CRIT(( "fd_alloca_check( " #align ", " #sz " ) stack overflow" )); \
762 6507 : })), \
763 6507 : fd_alloca( (align), fd_alloca_check_private_sz ) )
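: /* E.g. (a sketch): as fd_alloca but with instrumentation, aborting
: with a detailed message instead of risking a quiet stack overflow:
:
: uchar * buf = (uchar *)fd_alloca_check( 32UL, sz );
: ... use buf until the calling function returns ...
: */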
764 :
765 : #else /* FD_HAS_ASAN */
766 :
767 : /* AddressSanitizer provides its own alloca safety instrumentation
768 : which are more powerful than the above fd_alloca_check heuristics. */
769 :
770 : #define fd_alloca_check fd_alloca
771 :
772 : #endif /* FD_HAS_ASAN */
773 : #endif /* FD_HAS_ALLOCA */
774 :
775 : FD_PROTOTYPES_END
776 :
777 : #endif /* HEADER_fd_src_util_scratch_fd_scratch_h */
|