#ifndef HEADER_fd_src_util_scratch_fd_scratch_h
#define HEADER_fd_src_util_scratch_fd_scratch_h

/* APIs for high performance scratch pad memory allocation.  There
   are two allocators provided.  One is fd_alloca, which is an
   alignment aware equivalent of alloca.  It is meant for use anywhere
   alloca would normally be used.  This is only available if the build
   target has the FD_HAS_ALLOCA capability.  The second is
   fd_scratch_alloc.  It is meant for use in situations that have very
   complex and large temporary memory usage. */

#include "../tile/fd_tile.h"

/* FD_SCRATCH_USE_HANDHOLDING:  Define this to non-zero at compile time
   to turn on additional run-time checks. */

#ifndef FD_SCRATCH_USE_HANDHOLDING
#if FD_HAS_DEEPASAN
#define FD_SCRATCH_USE_HANDHOLDING 1
#else
#define FD_SCRATCH_USE_HANDHOLDING 0
#endif
#endif

/* FD_SCRATCH_ALIGN_DEFAULT is the default alignment to use for
   allocations.

   Default should be at least 16 for consistent cross platform behavior
   that is language conformant across a wide range of targets (i.e. the
   largest primitive type across all possible build targets ...
   practically sizeof(int128)).  This also naturally covers SSE natural
   alignment on x86.  8 could be used if features like int128 and so
   forth are excluded and the result would still be linguistically
   conformant (sizeof(ulong) here is the limit).  Likewise, 32, 64, 128
   could be used to guarantee all allocations will have natural
   AVX/AVX2, natural AVX-512 / cache-line,
   adjacent-cache-line-prefetch false sharing avoidance / natural GPU
   alignment properties.

   128 for default was picked as double x86 cache line for ACLPF false
   sharing avoidance and for consistency with GPU warp sizes ... i.e.
   the default allocation behaviors are naturally interthread
   communication false sharing resistant and GPU friendly.  This also
   naturally covers cases like SSE, AVX, AVX2 and AVX-512. */

#define FD_SCRATCH_ALIGN_DEFAULT (128UL) /* integer power-of-2 >=16 */

/* FD_SCRATCH_{SMEM,FMEM}_ALIGN give the alignment requirements for
   the memory regions used to back a scratch pad memory.  There are not
   many restrictions on the SMEM alignment practically other than it be
   a reasonable integer power of two.  128 was picked to harmonize with
   FD_SCRATCH_ALIGN_DEFAULT (which does have more technical motivations
   behind its choice) but this is not strictly required.
   FD_SCRATCH_FMEM_ALIGN is required to be sizeof(ulong). */

#define FD_SCRATCH_SMEM_ALIGN (128UL) /* integer power-of-2, harmonized with ALIGN_DEFAULT */
#define FD_SCRATCH_FMEM_ALIGN   (8UL) /* ==sizeof(ulong) but avoids bugs with some compilers */

FD_PROTOTYPES_BEGIN

/* Private APIs *******************************************************/

#if FD_SCRATCH_USE_HANDHOLDING
extern FD_TL int fd_scratch_in_prepare;
#endif

extern FD_TL ulong fd_scratch_private_start;
extern FD_TL ulong fd_scratch_private_free;
extern FD_TL ulong fd_scratch_private_stop;

extern FD_TL ulong * fd_scratch_private_frame;
extern FD_TL ulong   fd_scratch_private_frame_cnt;
extern FD_TL ulong   fd_scratch_private_frame_max;

FD_FN_CONST static inline int
fd_scratch_private_align_is_valid( ulong align ) {
  return !(align & (align-1UL)); /* returns true if power of 2 or zero, compile time typically */
}

FD_FN_CONST static inline ulong
fd_scratch_private_true_align( ulong align ) {
  return fd_ulong_if( !align, FD_SCRATCH_ALIGN_DEFAULT, align ); /* compile time typically */
}

/* Public APIs ********************************************************/

/* Constructor APIs */

/* fd_scratch_smem_{align,footprint} return the alignment and footprint
   of a memory region suitable for use as a scratch pad memory that can
   hold up to smax bytes.  There are very few restrictions on the
   nature of this memory.  It could even be just a flat address space
   that is not backed by an actual physical memory as far as scratch is
   concerned.  In typical use cases though, the scratch pad memory
   should point to a region of huge or gigantic page backed memory on
   the caller's numa node.

   A shared memory region is fine for smem.  This could be used for
   example to allow other threads / processes to access a scratch
   allocation from this thread for the lifetime of a scratch
   allocation.

   Even more generally, a shared memory region for both smem and fmem
   could make it theoretically possible to have a scratch pad memory
   that is shared across multiple threads / processes.  The API is not
   well designed for such though (the main reason to use fmem in shared
   memory would be convenience and/or adding hot swapping
   functionality).  In the common scratch scenario, every thread would
   attach to their local join of the shared smem and shared fmem.  But
   since the operations below are not designed to be thread safe, the
   threads would have to protect against concurrent use of push and pop
   (and attach would probably need to be tweaked to make it easier to
   attach to an already in use scratch pad).

   Compile time allocation is possible via the FD_SCRATCH_SMEM_ALIGN
   define.  E.g.:

     uchar my_smem[ MY_SMAX ] __attribute__((aligned(FD_SCRATCH_SMEM_ALIGN)));

   will be valid to use as a scratch smem with space for up to MY_SMAX
   bytes. */

FD_FN_CONST static inline ulong fd_scratch_smem_align( void ) { return FD_SCRATCH_SMEM_ALIGN; }

FD_FN_CONST static inline ulong
fd_scratch_smem_footprint( ulong smax ) {
  return fd_ulong_align_up( smax, FD_SCRATCH_SMEM_ALIGN );
}

/* fd_scratch_fmem_{align,footprint} return the alignment and footprint
   of a memory region suitable for holding the scratch pad memory
   metadata (typically very small).  The scratch pad memory will be
   capable of holding up to depth scratch frames.

   Compile time allocation is possible via the FD_SCRATCH_FMEM_ALIGN
   define.  E.g.

     ulong my_fmem[ MY_DEPTH ] __attribute__((aligned(FD_SCRATCH_FMEM_ALIGN)));

   or, even simpler:

     ulong my_fmem[ MY_DEPTH ];

   will be valid to use as a scratch fmem with space for up to depth
   frames.  The attribute variant is not strictly necessary, just for
   consistency with the smem above (where it is required). */

FD_FN_CONST static inline ulong fd_scratch_fmem_align    ( void )        { return sizeof(ulong); }
FD_FN_CONST static inline ulong fd_scratch_fmem_footprint( ulong depth ) { return sizeof(ulong)*depth; }
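
/* E.g. a minimal runtime setup sketch (assumes aligned_alloc from
   stdlib.h is an acceptable backing store and that the smax / depth
   values below are hypothetical ... production uses typically back
   smem with huge page workspace memory instead):

     ulong   smax  = 1UL<<20;        ... room for 1 MiB of allocations
     ulong   depth = 4UL;            ... up to 4 frames
     uchar * smem  = aligned_alloc( fd_scratch_smem_align(), fd_scratch_smem_footprint( smax  ) );
     ulong * fmem  = aligned_alloc( fd_scratch_fmem_align(), fd_scratch_fmem_footprint( depth ) );

   smem / fmem are then suitable to pass to fd_scratch_attach below. */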

/* fd_scratch_attach attaches the calling thread to memory regions
   sufficient to hold up to smax (positive) bytes and with up to depth
   (positive) frames.  smem/fmem should be non-NULL and have the
   required alignment and footprint specified for smax/depth from the
   above.  The caller has a read/write interest in these regions while
   attached (and thus the local lifetime of these regions must cover
   the lifetime of the attachment).  Only one scratch pad memory may be
   attached to a caller at a time.  This cannot fail from the caller's
   point of view (if handholding is enabled, it will abort the caller
   with a descriptive error message if used obviously in error). */

static inline void
fd_scratch_attach( void * smem,
                   void * fmem,
                   ulong  smax,
                   ulong  depth ) {

# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "already attached" ));
  if( FD_UNLIKELY( !smem                        ) ) FD_LOG_ERR(( "bad smem"  ));
  if( FD_UNLIKELY( !fmem                        ) ) FD_LOG_ERR(( "bad fmem"  ));
  if( FD_UNLIKELY( !smax                        ) ) FD_LOG_ERR(( "bad smax"  ));
  if( FD_UNLIKELY( !depth                       ) ) FD_LOG_ERR(( "bad depth" ));
  fd_scratch_in_prepare = 0;
# endif

  fd_scratch_private_start = (ulong)smem;
  fd_scratch_private_free  = fd_scratch_private_start;
  fd_scratch_private_stop  = fd_scratch_private_start + smax;

  fd_scratch_private_frame     = (ulong *)fmem;
  fd_scratch_private_frame_cnt = 0UL;
  fd_scratch_private_frame_max = depth;

# if FD_HAS_DEEPASAN
  /* Poison the entire smem region.  Underpoison the boundaries to
     respect alignment requirements. */
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
  ulong aligned_end   = fd_ulong_align_dn( fd_scratch_private_stop,  FD_ASAN_ALIGN );
  fd_asan_poison( (void*)aligned_start, aligned_end - aligned_start );
# endif
# if FD_HAS_MSAN
  /* Mark the entire smem region as uninitialized. */
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_MSAN_ALIGN );
  ulong aligned_end   = fd_ulong_align_dn( fd_scratch_private_stop,  FD_MSAN_ALIGN );
  fd_msan_poison( (void*)aligned_start, aligned_end - aligned_start );
# endif
}

/* fd_scratch_detach detaches the calling thread from its current
   attachment.  Returns smem used on attach and, if opt_fmem is
   non-NULL, opt_fmem[0] will contain the fmem used on attach on
   return.

   This relinquishes the calling thread's read/write interest on these
   memory regions.  All the caller's scratch frames are popped, any
   prepare in progress is canceled and all the caller's scratch
   allocations are freed implicitly by this.

   This cannot fail from the caller's point of view (if handholding is
   enabled, it will abort the caller with a descriptive error message
   if used obviously in error). */

static inline void *
fd_scratch_detach( void ** _opt_fmem ) {

# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached" ));
  fd_scratch_in_prepare = 0;
# endif

# if FD_HAS_DEEPASAN
  /* Unpoison the entire scratch space.  There should now be an
     underlying allocation which has not been poisoned. */
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
  ulong aligned_stop  = fd_ulong_align_dn( fd_scratch_private_stop,  FD_ASAN_ALIGN );
  fd_asan_unpoison( (void*)aligned_start, aligned_stop - aligned_start );
# endif

  void * smem = (void *)fd_scratch_private_start;
  void * fmem = (void *)fd_scratch_private_frame;

  fd_scratch_private_start = 0UL;
  fd_scratch_private_free  = 0UL;
  fd_scratch_private_stop  = 0UL;

  fd_scratch_private_frame     = NULL;
  fd_scratch_private_frame_cnt = 0UL;
  fd_scratch_private_frame_max = 0UL;

  if( _opt_fmem ) _opt_fmem[0] = fmem;
  return smem;
}
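
/* E.g. a typical single threaded lifecycle (a minimal sketch ... smem,
   fmem, smax and depth as per fd_scratch_attach above):

     fd_scratch_attach( smem, fmem, smax, depth );

     fd_scratch_push();
     void * tmp = fd_scratch_alloc( 0UL, 1024UL );   ... default align
     ... use tmp here ...
     fd_scratch_pop();                               ... tmp freed here

     void * fmem_out;
     void * smem_out = fd_scratch_detach( &fmem_out );
     ... smem_out==smem and fmem_out==fmem at this point ... */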

/* User APIs */

/* fd_scratch_{used,free} returns the number of bytes used/free in the
   caller's scratch.  Returns 0 if not attached.  Because of alignment
   overheads, an allocation is guaranteed to succeed if
   free>=sz+align-1 where align is the actual alignment required for
   the allocation (e.g. align==0 -> default, align<min -> min).  It is
   guaranteed to fail if free<sz.  It might succeed or fail in between
   depending on the alignments of previous allocations.  These are
   freaky fast (O(3) fast asm operations under the hood). */

static inline ulong fd_scratch_used( void ) { return fd_scratch_private_free - fd_scratch_private_start; }
static inline ulong fd_scratch_free( void ) { return fd_scratch_private_stop - fd_scratch_private_free;  }
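
/* E.g. to guarantee an upcoming fd_scratch_alloc( align, sz ) in the
   current frame cannot fail (a sketch ... align and sz here are
   hypothetical):

     if( FD_LIKELY( fd_scratch_free() >= sz+align-1UL ) ) {
       void * p = fd_scratch_alloc( align, sz );
       ... p here is guaranteed to be non-NULL ...
     }

   The align-1 slack covers the worst case alignment padding. */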

/* fd_scratch_frame_{used,free} returns the number of scratch frames
   used/free in the caller's scratch.  Returns 0 if not attached.  push
   is guaranteed to succeed if free is non-zero and guaranteed to fail
   otherwise.  pop is guaranteed to succeed if used is non-zero and
   guaranteed to fail otherwise.  These are freaky fast (O(1-3) fast
   asm operations under the hood). */

static inline ulong fd_scratch_frame_used( void ) { return fd_scratch_private_frame_cnt; }
static inline ulong fd_scratch_frame_free( void ) { return fd_scratch_private_frame_max - fd_scratch_private_frame_cnt; }

/* fd_scratch_reset frees all allocations (if any) and pops all scratch
   frames (if any) such that the caller's scratch will be in the same
   state it was immediately after attach.  The caller must be attached
   to a scratch memory to use.  This cannot fail from the caller's
   point of view (if handholding is enabled, it will abort the caller
   with a descriptive error message if used obviously in error).  This
   is freaky fast (O(3) fast asm operations under the hood). */

static inline void
fd_scratch_reset( void ) {
# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached" ));
  fd_scratch_in_prepare = 0;
# endif
  fd_scratch_private_free      = fd_scratch_private_start;
  fd_scratch_private_frame_cnt = 0UL;

  /* Poison entire scratch space again. */
# if FD_HAS_DEEPASAN
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
  ulong aligned_stop  = fd_ulong_align_dn( fd_scratch_private_stop,  FD_ASAN_ALIGN );
  fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
# endif
# if FD_HAS_MSAN
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_MSAN_ALIGN );
  ulong aligned_stop  = fd_ulong_align_dn( fd_scratch_private_stop,  FD_MSAN_ALIGN );
  fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
# endif
}

/* fd_scratch_push creates a new scratch frame and makes it the current
   frame.  Assumes caller is attached to a scratch with space for a new
   frame.  This cannot fail from the caller's point of view (if
   handholding is enabled, it will abort the caller with a descriptive
   error message if used obviously in error).  This is freaky fast
   (O(5) fast asm operations under the hood). */

FD_FN_UNUSED static void /* Work around -Winline */
fd_scratch_push( void ) {
# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) {
    FD_LOG_ERR(( "not attached" ));
  }
  if( FD_UNLIKELY( fd_scratch_private_frame_cnt>=fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "too many frames" ));
  fd_scratch_in_prepare = 0;
# endif
  fd_scratch_private_frame[ fd_scratch_private_frame_cnt++ ] = fd_scratch_private_free;

  /* Poison to end of scratch region to account for the case of an
     in-prep allocation getting implicitly cancelled. */
# if FD_HAS_DEEPASAN
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_ASAN_ALIGN );
  ulong aligned_stop  = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
  fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
# endif
# if FD_HAS_MSAN
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_MSAN_ALIGN );
  ulong aligned_stop  = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
  fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
# endif
}

/* fd_scratch_pop frees all allocations in the current scratch frame,
   destroys the current scratch frame and makes the previous frame (if
   there is one) the current scratch frame (and leaves the caller
   without a current frame if there is not one).  Assumes the caller is
   attached to a scratch memory with at least one frame in use.  This
   cannot fail from the caller's point of view (if handholding is
   enabled, it will abort the caller with a descriptive error message
   if used obviously in error).  This is freaky fast (O(5) fast asm
   operations under the hood). */

FD_FN_UNUSED static void /* Work around -Winline */
fd_scratch_pop( void ) {
# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached"  ));
  if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) FD_LOG_ERR(( "unmatched pop" ));
  fd_scratch_in_prepare = 0;
# endif
  fd_scratch_private_free = fd_scratch_private_frame[ --fd_scratch_private_frame_cnt ];

# if FD_HAS_DEEPASAN
  /* On a pop() operation, the entire range from fd_scratch_private_free
     to the end of the scratch space can be safely poisoned.  The region
     must be aligned to accommodate asan manual poisoning requirements. */
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_ASAN_ALIGN );
  ulong aligned_stop  = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
  fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
# endif
# if FD_HAS_MSAN
  ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_MSAN_ALIGN );
  ulong aligned_stop  = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
  fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
# endif
}
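
/* Frames nest LIFO.  E.g. (a minimal sketch, assuming the caller is
   attached with at least two frames free):

     fd_scratch_push();                            ... open frame A
     void * a = fd_scratch_alloc( 0UL, 128UL );
     fd_scratch_push();                            ... open frame B
     void * b = fd_scratch_alloc( 0UL, 256UL );
     fd_scratch_pop();                             ... b freed, a still valid
     fd_scratch_pop();                             ... a freed */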

/* fd_scratch_prepare starts an allocation of unknown size and known
   alignment align (0 means use default alignment) in the caller's
   current scratch frame.  Returns a pointer in the caller's address
   space with alignment align to the first byte of a region with
   fd_scratch_free() (as observed after this function returns) bytes
   available.  The caller is free to clobber any bytes in this region.

   fd_scratch_publish finishes an in-progress allocation.  end points
   at the first byte after the final allocation.  Assumes there is a
   matching prepare.  A published allocation can be subsequently
   trimmed.

   fd_scratch_cancel cancels an in-progress allocation.  This is a
   no-op if there is no matching prepare.  If the prepare had alignment
   other than 1, it is possible that some alignment padding needed for
   the allocation will still be used in the caller's current scratch
   frame.  If this is not acceptable, the prepare should use an
   alignment of 1 and manually align the return.

   This allows idioms like:

     uchar * p = (uchar *)fd_scratch_prepare( align );

     if( FD_UNLIKELY( fd_scratch_free() < app_max_sz ) ) {

       fd_scratch_cancel();

       ... handle too little scratch space to handle application
       ... worst case needs here

     } else {

       ... populate sz bytes to p where sz is in [0,app_max_sz]
       p += sz;

       fd_scratch_publish( p );

       ... at this point, scratch is as though
       ... fd_scratch_alloc( align, sz ) was called above

     }

   Ideally every prepare should be matched with a publish or a cancel,
   only one prepare can be in progress at a time on a thread and
   prepares cannot be nested.  As such, virtually all other scratch
   operations will implicitly cancel any in-progress prepare, including
   attach / detach / push / pop / prepare / alloc / trim. */

FD_FN_UNUSED static void * /* Work around -Winline */
fd_scratch_prepare( ulong align ) {

# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_private_frame_cnt               ) ) FD_LOG_ERR(( "unmatched push" ));
  if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) FD_LOG_ERR(( "bad align (%lu)", align ));
# endif

# if FD_HAS_DEEPASAN
  /* Need 8 byte alignment. */
  align = fd_ulong_align_up( align, FD_ASAN_ALIGN );
# endif
  ulong true_align = fd_scratch_private_true_align( align );
  ulong smem       = fd_ulong_align_up( fd_scratch_private_free, true_align );

# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) FD_LOG_ERR(( "prepare align (%lu) overflow", true_align ));
  if( FD_UNLIKELY( smem > fd_scratch_private_stop ) ) FD_LOG_ERR(( "prepare align (%lu) needs %lu additional scratch",
                                                                   align, smem - fd_scratch_private_stop ));
  fd_scratch_in_prepare = 1;
# endif

# if FD_HAS_DEEPASAN
  /* At this point the user is able to clobber any bytes in the region.
     smem is always going to be at least 8 byte aligned. */
  ulong aligned_sz = fd_ulong_align_up( fd_scratch_private_stop - smem, FD_ASAN_ALIGN );
  fd_asan_unpoison( (void*)smem, aligned_sz );
# endif

  fd_scratch_private_free = smem;
  return (void *)smem;
}

static inline void
fd_scratch_publish( void * _end ) {
  ulong end = (ulong)_end;

# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_in_prepare        ) ) FD_LOG_ERR(( "unmatched prepare" ));
  if( FD_UNLIKELY( end < fd_scratch_private_free ) ) FD_LOG_ERR(( "publish underflow" ));
  if( FD_UNLIKELY( end > fd_scratch_private_stop ) )
    FD_LOG_ERR(( "publish needs %lu additional scratch", end-fd_scratch_private_stop ));
  fd_scratch_in_prepare = 0;
# endif

  /* Poison everything that is trimmed off.  Conservatively poison
     potentially less than the region that is trimmed to respect
     alignment requirements. */
# if FD_HAS_DEEPASAN
  ulong aligned_free = fd_ulong_align_dn( fd_scratch_private_free, FD_ASAN_ALIGN );
  ulong aligned_end  = fd_ulong_align_up( end,                     FD_ASAN_ALIGN );
  ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
  fd_asan_poison  ( (void*)aligned_end,  aligned_stop - aligned_end  );
  fd_asan_unpoison( (void*)aligned_free, aligned_end  - aligned_free );
# endif
# if FD_HAS_MSAN
  ulong aligned_free = fd_ulong_align_dn( fd_scratch_private_free, FD_MSAN_ALIGN );
  ulong aligned_end  = fd_ulong_align_up( end,                     FD_MSAN_ALIGN );
  ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
  fd_msan_poison  ( (void*)aligned_end,  aligned_stop - aligned_end  );
  fd_msan_unpoison( (void*)aligned_free, aligned_end  - aligned_free );
# endif

  fd_scratch_private_free = end;
}

static inline void
fd_scratch_cancel( void ) {

# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) FD_LOG_ERR(( "unmatched prepare" ));
  fd_scratch_in_prepare = 0;
# endif

}

/* fd_scratch_alloc allocates sz bytes with alignment align in the
   caller's current scratch frame.  There should be no prepare in
   progress.  Note that this has the same function signature as
   aligned_alloc (and not by accident).  It does have some less
   restrictive behaviors though.

   align must be 0 or an integer power of 2.  0 will be treated as
   FD_SCRATCH_ALIGN_DEFAULT.

   sz need not be a multiple of align.  Further, the underlying
   allocator does not implicitly round up sz to an align multiple (as
   such, scratch can allocate additional items in any tail padding that
   might have been implicitly reserved had it rounded up).  That is, if
   you really want to round up allocations to a multiple of align, then
   manually align up sz ... e.g. pass fd_ulong_align_up(sz,align) when
   align is non-zero to this call (this could be implemented as a
   compile time mode with some small extra overhead if desirable).

   sz 0 is fine.  This will currently return a properly aligned
   non-NULL pointer (the allocator might do some allocation under the
   hood to get the desired alignment and it is possible this might fail
   ... there is a case for returning NULL or an arbitrary but
   appropriately aligned non-NULL and this could be implemented as a
   compile time mode with some small extra overhead if desirable).

   This cannot fail from the caller's point of view (if handholding is
   enabled, it will abort the caller with a descriptive error message
   if used obviously in error).

   This is freaky fast (O(5) fast asm operations under the hood). */

FD_FN_UNUSED static void * /* Work around -Winline */
fd_scratch_alloc( ulong align,
                  ulong sz ) {
  ulong smem = (ulong)fd_scratch_prepare( align );
  ulong end  = smem + sz;

# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( (end < smem) | (end > fd_scratch_private_stop) ) ) FD_LOG_ERR(( "sz (%lu) overflow", sz ));
# endif

  fd_scratch_publish( (void *)end );
  return (void *)smem;
}
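
/* E.g. to emulate aligned_alloc's requirement that the allocation size
   be a multiple of the alignment, manually round sz up as described
   above (a sketch ... align here is a hypothetical non-zero power of
   2):

     void * p = fd_scratch_alloc( align, fd_ulong_align_up( sz, align ) );

   And a common idiom for allocating a structure with its natural
   alignment (my_struct_t is hypothetical):

     my_struct_t * s = (my_struct_t *)fd_scratch_alloc( alignof(my_struct_t), sizeof(my_struct_t) ); */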

/* fd_scratch_trim trims the size of the most recent scratch allocation
   in the current scratch frame (technically it can be used to trim the
   size of the entire current scratch frame but doing more than the
   most recent scratch allocation is strongly discouraged).  Assumes
   there is a current scratch frame and the caller is not in a prepare.
   end points at the first byte to free in the most recent scratch
   allocation (or the first byte after the most recent scratch
   allocation).  This allows idioms like:

     uchar * p = (uchar *)fd_scratch_alloc( align, max_sz );

     ... populate sz bytes of p where sz is in [0,max_sz]
     p += sz;

     fd_scratch_trim( p );

     ... now the thread's scratch is as though original call was
     ... p = fd_scratch_alloc( align, sz );

   This cannot fail from the caller's point of view (if handholding is
   enabled, this will abort the caller with a descriptive error message
   if used obviously in error).

   Note that an allocation can be repeatedly trimmed.

   Note also that trim can nest.  E.g. a thread can call a function
   that uses scratch with its own properly matched scratch pushes and
   pops.  On function return, trim will still work on the most recent
   scratch alloc in that frame by the caller.

   This is freaky fast (O(1) fast asm operations under the hood). */

static inline void
fd_scratch_trim( void * _end ) {
  ulong end = (ulong)_end;

# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_private_frame_cnt                                      ) ) FD_LOG_ERR(( "unmatched push" ));
  if( FD_UNLIKELY( end < fd_scratch_private_frame[ fd_scratch_private_frame_cnt-1UL ] ) ) FD_LOG_ERR(( "trim underflow" ));
  if( FD_UNLIKELY( end > fd_scratch_private_free                                      ) ) FD_LOG_ERR(( "trim overflow"  ));
  fd_scratch_in_prepare = 0;
# endif

# if FD_HAS_DEEPASAN
  /* The region to poison should be from _end to the end of the
     scratch's region.  The same alignment considerations need to be
     taken into account. */
  ulong aligned_end  = fd_ulong_align_up( end,                     FD_ASAN_ALIGN );
  ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
  fd_asan_poison( (void*)aligned_end, aligned_stop - aligned_end );
# endif
# if FD_HAS_MSAN
  ulong aligned_end  = fd_ulong_align_up( end,                     FD_MSAN_ALIGN );
  ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
  fd_msan_poison( (void*)aligned_end, aligned_stop - aligned_end );
# endif

  fd_scratch_private_free = end;
}

/* fd_scratch_*_is_safe returns false (0) if the operation is obviously
   unsafe to do at the time of the call or true otherwise.
   Specifically:

   fd_scratch_attach_is_safe() returns 1 if the calling thread is not
   already attached to scratch.

   fd_scratch_detach_is_safe() returns 1 if the calling thread is
   already attached to scratch.

   fd_scratch_reset_is_safe() returns 1 if the calling thread is
   already attached to scratch.

   fd_scratch_push_is_safe() returns 1 if there is at least one frame
   available and 0 otherwise.

   fd_scratch_pop_is_safe() returns 1 if there is at least one frame
   in use and 0 otherwise.

   fd_scratch_prepare_is_safe( align ) returns 1 if there is a current
   frame for the allocation and enough scratch pad memory to start
   preparing an allocation with alignment align.

   fd_scratch_publish_is_safe( end ) returns 1 if end is a valid
   location to complete an allocation in preparation.  If handholding
   is enabled, will additionally check that there is a prepare already
   in progress.

   fd_scratch_cancel_is_safe() returns 1.

   fd_scratch_alloc_is_safe( align, sz ) returns 1 if there is a
   current frame for the allocation and enough scratch pad memory for
   an allocation with alignment align and size sz.

   fd_scratch_trim_is_safe( end ) returns 1 if there is a current frame
   and that current frame can be trimmed to end safely.

   These are safe to call at any time and are also freaky fast (a
   handful of assembly operations under the hood). */

FD_FN_PURE static inline int fd_scratch_attach_is_safe( void ) { return !fd_scratch_private_frame_max;  }
FD_FN_PURE static inline int fd_scratch_detach_is_safe( void ) { return !!fd_scratch_private_frame_max; }
FD_FN_PURE static inline int fd_scratch_reset_is_safe ( void ) { return !!fd_scratch_private_frame_max; }
FD_FN_PURE static inline int fd_scratch_push_is_safe  ( void ) { return fd_scratch_private_frame_cnt<fd_scratch_private_frame_max; }
FD_FN_PURE static inline int fd_scratch_pop_is_safe   ( void ) { return !!fd_scratch_private_frame_cnt; }

FD_FN_PURE static inline int
fd_scratch_prepare_is_safe( ulong align ) {
  if( FD_UNLIKELY( !fd_scratch_private_frame_cnt               ) ) return 0; /* No current frame */
  if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) return 0; /* Bad alignment, compile time typically */
  ulong true_align = fd_scratch_private_true_align( align ); /* compile time typically */
  ulong smem       = fd_ulong_align_up( fd_scratch_private_free, true_align );
  if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) return 0; /* alignment overflow */
  if( FD_UNLIKELY( smem > fd_scratch_private_stop ) ) return 0; /* insufficient scratch */
  return 1;
}

FD_FN_PURE static inline int
fd_scratch_publish_is_safe( void * _end ) {
  ulong end = (ulong)_end;
# if FD_SCRATCH_USE_HANDHOLDING
  if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) return 0; /* Not in prepare */
# endif
  if( FD_UNLIKELY( end < fd_scratch_private_free ) ) return 0; /* Backward */
  if( FD_UNLIKELY( end > fd_scratch_private_stop ) ) return 0; /* Out of bounds */
  return 1;
}

FD_FN_CONST static inline int
fd_scratch_cancel_is_safe( void ) {
  return 1;
}

FD_FN_PURE static inline int
fd_scratch_alloc_is_safe( ulong align,
                          ulong sz ) {
  if( FD_UNLIKELY( !fd_scratch_private_frame_cnt               ) ) return 0; /* No current frame */
  if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) return 0; /* Bad align, compile time typically */
  ulong true_align = fd_scratch_private_true_align( align ); /* compile time typically */
  ulong smem       = fd_ulong_align_up( fd_scratch_private_free, true_align );
  if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) return 0; /* align overflow */
  ulong free = smem + sz;
  if( FD_UNLIKELY( free < smem                    ) ) return 0; /* sz overflow */
  if( FD_UNLIKELY( free > fd_scratch_private_stop ) ) return 0; /* too little space */
  return 1;
}

FD_FN_PURE static inline int
fd_scratch_trim_is_safe( void * _end ) {
  ulong end = (ulong)_end;
  if( FD_UNLIKELY( !fd_scratch_private_frame_cnt                                      ) ) return 0; /* No current frame */
  if( FD_UNLIKELY( end < fd_scratch_private_frame[ fd_scratch_private_frame_cnt-1UL ] ) ) return 0; /* Trim underflow */
  if( FD_UNLIKELY( end > fd_scratch_private_free                                      ) ) return 0; /* Trim overflow */
  return 1;
}
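
/* E.g. code that degrades gracefully instead of aborting when scratch
   is tight (a minimal sketch ... align and sz are hypothetical and
   fallback_alloc is a stand-in for whatever slow path the application
   has):

     void * p;
     if( FD_LIKELY( fd_scratch_alloc_is_safe( align, sz ) ) ) p = fd_scratch_alloc( align, sz );
     else                                                     p = fallback_alloc ( align, sz ); */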

/* FD_SCRATCH_SCOPE_{BEGIN,END} create a `do { ... } while(0);` scope in
   which a temporary scratch frame is available.  Nested scopes are
   permitted.  This scratch frame is automatically destroyed when
   exiting the scope normally (e.g. by 'break', 'return', or reaching
   the end).  Uses a dummy variable with a cleanup attribute under the
   hood.  U.B. if scope is left abnormally (e.g. longjmp(), exception,
   abort(), etc.).  Use as follows:

     FD_SCRATCH_SCOPE_BEGIN {
       ...
       fd_scratch_alloc( ... );
       ...
     }
     FD_SCRATCH_SCOPE_END; */

FD_FN_UNUSED static inline void
fd_scratch_scoped_pop_private( void * _unused ) {
  (void)_unused;
  fd_scratch_pop();
}

#define FD_SCRATCH_SCOPE_BEGIN do {                           \
    fd_scratch_push();                                        \
    int __fd_scratch_guard_ ## __LINE__                       \
      __attribute__((cleanup(fd_scratch_scoped_pop_private))) \
      __attribute__((unused)) = 0;                            \
    do

#define FD_SCRATCH_SCOPE_END while(0); } while(0)

/* fd_alloca is a variant of alloca that works like aligned_alloc.
   That is, it returns an allocation of sz bytes with an alignment of
   at least align.  Like alloca, this allocation will be in the stack
   frame of the calling function with a lifetime of until the calling
   function returns.  Stack overflow handling is likewise identical to
   alloca (stack overflows will overlap the top stack guard, typically
   triggering a seg fault when the overflow region is touched that will
   be caught and handled by the logger to terminate the calling thread
   group).  As such, like alloca, these really should only be used for
   smallish (<< few KiB) quick allocations in bounded recursion depth
   circumstances.

   Like fd_scratch_alloc, align must be 0 or an integer power of 2.  0
   will be treated as align_default.  align smaller than align_min will
   be bumped up to align_min.

   The caller promises the request will not overflow the stack.  This
   has to be implemented as a macro for linguistic reasons and align
   should be safe against multiple evaluation and, due to compiler
   limitations, must be a compile time constant.  Returns non-NULL on
   success and NULL on failure (in most situations, can never fail from
   the caller's POV).  sz==0 is okay (and will return non-NULL). */

#if FD_HAS_ALLOCA

/* Work around compiler limitations */
#define FD_SCRATCH_PRIVATE_TRUE_ALIGN( align ) ((align) ? (align) : FD_SCRATCH_ALIGN_DEFAULT)

#define fd_alloca(align,sz) __builtin_alloca_with_align( fd_ulong_max( (sz), 1UL ), \
                                                         8UL*FD_SCRATCH_PRIVATE_TRUE_ALIGN( (align) ) /*bits*/ )
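
/* E.g. (a minimal sketch ... n is a hypothetical smallish element
   count the caller knows fits comfortably within its stack):

     float * tmp = (float *)fd_alloca( alignof(float), n*sizeof(float) );
     ... use tmp here ...
     ... tmp is freed implicitly when the calling function returns ... */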

/* fd_alloca_check does fd_alloca but it will FD_LOG_CRIT with a
   detailed message if the request would cause a stack overflow or
   leave so little available free stack that subsequent normal thread
   operations would be at risk.

   Note that returning NULL on failure is not an option as this would
   no longer be a drop-in instrumented replacement for fd_alloca (this
   would also require even more linguistic hacks to keep the fd_alloca
   at the appropriate scope).  Likewise, testing the allocated region
   is within the stack post allocation is not an option as the
   FD_LOG_CRIT invocation would then try to use stack with the already
   overflowed allocation in it (there is no easy portable way to
   guarantee an alloca has been freed short of returning from the
   function in which the alloca was performed).  Using FD_LOG_ERR
   instead of FD_LOG_CRIT is a potentially viable alternative error
   handling behavior though.

   This has to be implemented as a macro for linguistic reasons.  It is
   recommended this only be used for development / debugging / testing
   purposes (e.g. if you are doing alloca in production that are large
   enough you are worried about stack overflow, you probably should be
   using fd_scratch, fd_alloc or fd_wksp depending on performance and
   persistence needs or, better still, architecting to not need any
   temporary memory allocations at all).  If the caller's stack
   diagnostics could not be successfully initialized (this is logged),
   this will always FD_LOG_CRIT. */

#if !FD_HAS_ASAN

extern FD_TL ulong fd_alloca_check_private_sz;

#define fd_alloca_check( align, sz )                                                                             \
  ( fd_alloca_check_private_sz = (sz),                                                                           \
    (__extension__({                                                                                             \
      ulong _fd_alloca_check_private_pad_max   = FD_SCRATCH_PRIVATE_TRUE_ALIGN( (align) ) - 1UL;                 \
      ulong _fd_alloca_check_private_footprint = fd_alloca_check_private_sz + _fd_alloca_check_private_pad_max;  \
      if( FD_UNLIKELY( (_fd_alloca_check_private_footprint < _fd_alloca_check_private_pad_max       ) |          \
                       (_fd_alloca_check_private_footprint > (31UL*(fd_tile_stack_est_free() >> 5))) ) )         \
        FD_LOG_CRIT(( "fd_alloca_check( " #align ", " #sz " ) stack overflow" ));                                \
    })),                                                                                                         \
    fd_alloca( (align), fd_alloca_check_private_sz ) )
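
/* fd_alloca_check usage is identical to fd_alloca.  E.g. (a sketch,
   n as above):

     float * tmp = (float *)fd_alloca_check( alignof(float), n*sizeof(float) );

   This aborts with a detailed message instead of risking a hard to
   diagnose stack overflow if n turns out to be unexpectedly large. */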

#else /* FD_HAS_ASAN */

/* AddressSanitizer provides its own alloca safety instrumentation
   which is more powerful than the above fd_alloca_check heuristics. */

#define fd_alloca_check fd_alloca

#endif /* FD_HAS_ASAN */
#endif /* FD_HAS_ALLOCA */

FD_PROTOTYPES_END

#endif /* HEADER_fd_src_util_scratch_fd_scratch_h */
|