1 : #ifndef HEADER_fd_src_util_scratch_fd_scratch_h
2 : #define HEADER_fd_src_util_scratch_fd_scratch_h
3 :
4 : /* APIs for high performance scratch pad memory allocation. There
5 : are two allocators provided. One is fd_alloca, which is an alignment
6 : aware equivalent of alloca. It is meant for use anywhere alloca
7 : would normally be used. This is only available if the build target
8 : has the FD_HAS_ALLOCA capability. The second is fd_scratch_alloc.
9 : It is meant for use in situations that have very complex and large
10 : temporary memory usage. */
11 :
12 : #include "../tile/fd_tile.h"
13 : #include "../valloc/fd_valloc.h"
14 :
15 : /* FD_SCRATCH_USE_HANDHOLDING: Define this to non-zero at compile time
16 : to turn on additional run-time checks. */
17 :
18 : #ifndef FD_SCRATCH_USE_HANDHOLDING
19 : #if FD_HAS_DEEPASAN
20 : #define FD_SCRATCH_USE_HANDHOLDING 1
21 : #else
22 : #define FD_SCRATCH_USE_HANDHOLDING 0
23 : #endif
24 : #endif
25 :
26 : /* FD_SCRATCH_ALIGN_DEFAULT is the default alignment to use for
27 : allocations.
28 :
29 : Default should be at least 16 for consistent cross platform behavior
30 : that is language conformant across a wide range of targets (i.e. the
31 : largest primitive type across all possible builds ... practically
32 : sizeof(int128)). This also naturally covers SSE natural alignment on
33 : x86. 8 could be used if features like int128 and so forth are not
34 : used and still be linguistically conformant (sizeof(ulong) here is the limit).
35 : Likewise, 32, 64, 128 could be used to guarantee all allocations will
36 : have natural AVX/AVX2, natural AVX-512 / cache-line,
37 : adjacent-cache-line-prefetch false sharing avoidance / natural GPU
38 : alignment properties.
39 :
40 : 128 for default was picked as double x86 cache line for ACLPF false
41 : sharing avoidance and for consistency with GPU warp sizes ... i.e.
42 : the default allocation behaviors are naturally interthread
43 : communication false sharing resistant and GPU friendly. This also
44 : naturally covers cases like SSE, AVX, AVX2 and AVX-512. */
45 :
46 4301224 : #define FD_SCRATCH_ALIGN_DEFAULT (128UL) /* integer power-of-2 >=16 */
47 :
48 : /* FD_SCRATCH_{SMEM,FMEM}_ALIGN give the alignment requirements for
49 : the memory regions used to back a scratch pad memory. There are not many
50 : restrictions on the SMEM alignment practically other than it be a
51 : reasonable integer power of two. 128 was picked to harmonize with
52 : FD_SCRATCH_ALIGN_DEFAULT (which does have more technical motivations
53 : behind its choice) but this is not strictly required.
54 : FD_SCRATCH_FMEM_ALIGN is required to be sizeof(ulong). */
55 :
56 98313 : #define FD_SCRATCH_SMEM_ALIGN (128UL) /* integer power-of-2, harmonized with ALIGN_DEFAULT */
57 : #define FD_SCRATCH_FMEM_ALIGN (8UL) /* ==sizeof(ulong) but avoids bugs with some compilers */
58 :
59 : FD_PROTOTYPES_BEGIN
60 :
61 : /* Private APIs *******************************************************/
62 :
63 : #if FD_SCRATCH_USE_HANDHOLDING
64 : extern FD_TL int fd_scratch_in_prepare;
65 : #endif
66 :
67 : extern FD_TL ulong fd_scratch_private_start;
68 : extern FD_TL ulong fd_scratch_private_free;
69 : extern FD_TL ulong fd_scratch_private_stop;
70 :
71 : extern FD_TL ulong * fd_scratch_private_frame;
72 : extern FD_TL ulong fd_scratch_private_frame_cnt;
73 : extern FD_TL ulong fd_scratch_private_frame_max;
74 :
75 : FD_FN_CONST static inline int
76 2596208 : fd_scratch_private_align_is_valid( ulong align ) {
77 2596208 : return !(align & (align-1UL)); /* returns true if power of 2 or zero, compile time typically */
78 2596208 : }
79 :
80 : FD_FN_CONST static inline ulong
81 3543614 : fd_scratch_private_true_align( ulong align ) {
82 3543614 : return fd_ulong_if( !align, FD_SCRATCH_ALIGN_DEFAULT, align ); /* compile time typically */
83 3543614 : }
84 :
85 : /* Public APIs ********************************************************/
86 :
87 : /* Constructor APIs */
88 :
89 : /* fd_scratch_smem_{align,footprint} return the alignment and footprint
90 : of a memory region suitable for use as a scratch pad memory that can
91 : hold up to smax bytes. There are very few restrictions on the nature
92 : of this memory. It could even be just a flat address space that is
93 : not backed by an actual physical memory as far as scratch is
94 : concerned. In typical use cases though, the scratch pad memory
95 : should point to a region of huge or gigantic page backed memory on
96 : the caller's numa node.
97 :
98 : A shared memory region is fine for smem. This could be used
99 : for example to allow other threads / processes to access a scratch
100 : allocation from this thread for the lifetime of a scratch allocation.
101 :
102 : Even more generally, a shared memory region for both smem and fmem
103 : makes it theoretically possible to have a scratch pad memory
104 : that is shared across multiple threads / processes. The API is not
105 : well designed for such though (the main reason to use fmem in shared
106 : memory would be convenience and/or adding hot swapping
107 : functionality). In the common scratch scenario, every thread would
108 : attach to their local join of the shared smem and shared fmem. But
109 : since the operations below are not designed to be thread safe, the
110 : threads would have to protect against concurrent use of push and pop
111 : (and attach would probably need to be tweaked to make it easier to
112 : attach to an already in use scratch pad).
113 :
114 : Compile time allocation is possible via the FD_SCRATCH_SMEM_ALIGN
115 : define. E.g.:
116 :
117 : uchar my_smem[ MY_SMAX ] __attribute__((aligned(FD_SCRATCH_SMEM_ALIGN)));
118 :
119 : will be valid to use as a scratch smem with space for up to MY_SMAX
120 : bytes. */
121 :
122 49158 : FD_FN_CONST static inline ulong fd_scratch_smem_align( void ) { return FD_SCRATCH_SMEM_ALIGN; }
123 :
124 : FD_FN_CONST static inline ulong
125 49155 : fd_scratch_smem_footprint( ulong smax ) {
126 49155 : return fd_ulong_align_up( smax, FD_SCRATCH_SMEM_ALIGN );
127 49155 : }
128 :
129 : /* fd_scratch_fmem_{align,footprint} return the alignment and footprint
130 : of a memory region suitable for holding the scratch pad memory
131 : metadata (typically very small). The scratch pad memory will be
132 : capable of holding up to depth scratch frames.
133 :
134 : Compile time allocation is possible via the FD_SCRATCH_FMEM_ALIGN
135 : define. E.g.
136 :
137 : ulong my_fmem[ MY_DEPTH ] __attribute__((aligned(FD_SCRATCH_FMEM_ALIGN)));
138 :
139 : or, even simpler:
140 :
141 : ulong my_fmem[ MY_DEPTH ];
142 :
143 : will be valid to use as a scratch fmem with space for up to depth
144 : frames. The attribute variant is not strictly necessary, just for
145 : consistency with the smem above (where it is required). */
146 :
147 9 : FD_FN_CONST static inline ulong fd_scratch_fmem_align ( void ) { return sizeof(ulong); }
148 51 : FD_FN_CONST static inline ulong fd_scratch_fmem_footprint( ulong depth ) { return sizeof(ulong)*depth; }
149 :
150 : /* fd_scratch_attach attaches the calling thread to memory regions
151 : sufficient to hold up to smax (positive) bytes and with up to depth
152 : (positive) frames. smem/fmem should have the required alignment and
153 : footprint specified for smax/depth from the above and be non-NULL.
154 : The caller has a read/write interest in these regions while attached
155 : (and thus the local lifetime of these regions must cover the lifetime
156 : of the attachment). Only one scratch pad memory may be attached to a
157 : caller at a time. This cannot fail from the caller's point of view
158 : (if handholding is enabled, it will abort the caller with a
159 : descriptive error message if used obviously in error). */
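
/* E.g., a minimal single threaded attach / use / detach sketch (MY_SMAX
   and MY_DEPTH are hypothetical compile time constants sized for the
   application's worst case temporary memory needs):

     static uchar my_smem[ MY_SMAX ] __attribute__((aligned(FD_SCRATCH_SMEM_ALIGN)));
     static ulong my_fmem[ MY_DEPTH ];

     fd_scratch_attach( my_smem, my_fmem, MY_SMAX, MY_DEPTH );

     ... arbitrary scratch push / alloc / pop usage here ...

     void * fmem;
     void * smem = fd_scratch_detach( &fmem ); ... smem==my_smem, fmem==my_fmem ...
*/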
160 :
161 : static inline void
162 : fd_scratch_attach( void * smem,
163 : void * fmem,
164 : ulong smax,
165 18 : ulong depth ) {
166 :
167 : # if FD_SCRATCH_USE_HANDHOLDING
168 0 : if( FD_UNLIKELY( fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "already attached" ));
169 0 : if( FD_UNLIKELY( !smem ) ) FD_LOG_ERR(( "bad smem" ));
170 0 : if( FD_UNLIKELY( !fmem ) ) FD_LOG_ERR(( "bad fmem" ));
171 0 : if( FD_UNLIKELY( !smax ) ) FD_LOG_ERR(( "bad smax" ));
172 0 : if( FD_UNLIKELY( !depth ) ) FD_LOG_ERR(( "bad depth" ));
173 0 : fd_scratch_in_prepare = 0;
174 0 : # endif
175 :
176 0 : fd_scratch_private_start = (ulong)smem;
177 0 : fd_scratch_private_free = fd_scratch_private_start;
178 0 : fd_scratch_private_stop = fd_scratch_private_start + smax;
179 :
180 0 : fd_scratch_private_frame = (ulong *)fmem;
181 0 : fd_scratch_private_frame_cnt = 0UL;
182 0 : fd_scratch_private_frame_max = depth;
183 :
184 : # if FD_HAS_DEEPASAN
185 : /* Poison the entire smem region. Underpoison the boundaries to respect
186 : alignment requirements. */
187 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
188 : ulong aligned_end = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
189 : fd_asan_poison( (void*)aligned_start, aligned_end - aligned_start );
190 : # endif
191 : #if FD_HAS_MSAN
192 : /* Mark the entire smem region as uninitialized. */
193 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_MSAN_ALIGN );
194 : ulong aligned_end = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
195 : fd_msan_poison( (void*)aligned_start, aligned_end - aligned_start );
196 : #endif
197 0 : }
198 :
199 : /* fd_scratch_detach detaches the calling thread from its current
200 : attachment. Returns smem used on attach and, if opt_fmem is
201 : non-NULL, opt_fmem[0] will contain the fmem used on attach on return.
202 :
203 : This relinquishes the calling thread's read/write interest on these
204 : memory regions. All the caller's scratch frames are popped, any
205 : prepare in progress is canceled and all the caller's scratch
206 : allocations are freed implicitly by this.
207 :
208 : This cannot fail from the caller's point of view (if handholding is
209 : enabled, it will abort the caller with a descriptive error message if
210 : used obviously in error). */
211 :
212 : static inline void *
213 18 : fd_scratch_detach( void ** _opt_fmem ) {
214 :
215 : # if FD_SCRATCH_USE_HANDHOLDING
216 0 : if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached" ));
217 0 : fd_scratch_in_prepare = 0;
218 0 : # endif
219 :
220 : # if FD_HAS_DEEPASAN
221 : /* Unpoison the entire scratch space. There should now be an underlying
222 : allocation which has not been poisoned. */
223 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
224 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
225 : fd_asan_unpoison( (void*)aligned_start, aligned_stop - aligned_start );
226 : # endif
227 :
228 0 : void * smem = (void *)fd_scratch_private_start;
229 0 : void * fmem = (void *)fd_scratch_private_frame;
230 :
231 0 : fd_scratch_private_start = 0UL;
232 0 : fd_scratch_private_free = 0UL;
233 0 : fd_scratch_private_stop = 0UL;
234 :
235 0 : fd_scratch_private_frame = NULL;
236 0 : fd_scratch_private_frame_cnt = 0UL;
237 0 : fd_scratch_private_frame_max = 0UL;
238 :
239 18 : if( _opt_fmem ) _opt_fmem[0] = fmem;
240 0 : return smem;
241 18 : }
242 :
243 : /* User APIs */
244 :
245 : /* fd_scratch_{used,free} returns the number of bytes used/free in the
246 : caller's scratch. Returns 0 if not attached. Because of alignment
247 : overheads, an allocation is guaranteed to succeed if free>=sz+align-1
248 : where align is the actual alignment required for the allocation (e.g.
249 : align==0 -> default, align<min -> min). It is guaranteed to fail if
250 : free<sz. It might succeed or fail in between depending on the
251 : alignments of previous allocations. These are freaky fast (O(3)
252 : fast asm operations under the hood). */
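
/* E.g., given the actual alignment align (128UL when passing 0) and a
   size sz, this conservative test guarantees the subsequent allocation
   succeeds:

     if( FD_LIKELY( fd_scratch_free() >= sz+align-1UL ) ) {
       void * p = fd_scratch_alloc( align, sz ); ... cannot fail ...
     }
*/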
253 :
254 9 : static inline ulong fd_scratch_used( void ) { return fd_scratch_private_free - fd_scratch_private_start; }
255 9 : static inline ulong fd_scratch_free( void ) { return fd_scratch_private_stop - fd_scratch_private_free; }
256 :
257 : /* fd_scratch_frame_{used,free} returns the number of scratch frames
258 : used/free in the caller's scratch. Returns 0 if not attached. push
259 : is guaranteed to succeed if free is non-zero and guaranteed to fail
260 : otherwise. pop is guaranteed to succeed if used is non-zero and
261 : guaranteed to fail otherwise. These are freaky fast (O(1-3) fast asm
262 : operations under the hood). */
263 :
264 2954121 : static inline ulong fd_scratch_frame_used( void ) { return fd_scratch_private_frame_cnt; }
265 2999377 : static inline ulong fd_scratch_frame_free( void ) { return fd_scratch_private_frame_max - fd_scratch_private_frame_cnt; }
266 :
267 : /* fd_scratch_reset frees all allocations (if any) and pops all scratch
268 : frames (if any) such that the caller's scratch will be in the same
269 : state it was immediately after attach. The caller must be attached
270 : to a scratch memory to use. This cannot fail from the caller's point
271 : of view (if handholding is enabled, it will abort the caller with a
272 : descriptive error message if used obviously in error). This is
273 : freaky fast (O(3) fast asm operations under the hood). */
274 :
275 : static inline void
276 734 : fd_scratch_reset( void ) {
277 : # if FD_SCRATCH_USE_HANDHOLDING
278 : if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached" ));
279 : fd_scratch_in_prepare = 0;
280 : # endif
281 734 : fd_scratch_private_free = fd_scratch_private_start;
282 734 : fd_scratch_private_frame_cnt = 0UL;
283 :
284 : /* Poison entire scratch space again. */
285 : # if FD_HAS_DEEPASAN
286 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
287 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
288 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
289 : # endif
290 : # if FD_HAS_MSAN
291 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_MSAN_ALIGN );
292 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
293 : fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
294 : # endif
295 734 : }
296 :
297 : /* fd_scratch_push creates a new scratch frame and makes it the current
298 : frame. Assumes caller is attached to a scratch with space for a new
299 : frame. This cannot fail from the caller's point of view (if
300 : handholding is enabled, it will abort the caller with a descriptive
301 : error message if used obviously in error). This is freaky fast (O(5)
302 : fast asm operations under the hood). */
303 :
304 : FD_FN_UNUSED static void /* Work around -Winline */
305 45537 : fd_scratch_push( void ) {
306 : # if FD_SCRATCH_USE_HANDHOLDING
307 24 : if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) {
308 0 : FD_LOG_ERR(( "not attached" ));
309 0 : }
310 24 : if( FD_UNLIKELY( fd_scratch_private_frame_cnt>=fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "too many frames" ));
311 24 : fd_scratch_in_prepare = 0;
312 24 : # endif
313 24 : fd_scratch_private_frame[ fd_scratch_private_frame_cnt++ ] = fd_scratch_private_free;
314 :
315 : /* Poison to end of scratch region to account for case of in-prep allocation
316 : getting implicitly cancelled. */
317 : # if FD_HAS_DEEPASAN
318 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_ASAN_ALIGN );
319 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
320 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
321 : # endif
322 : #if FD_HAS_MSAN
323 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_MSAN_ALIGN );
324 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
325 : fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
326 : #endif
327 24 : }
328 :
329 : /* fd_scratch_pop frees all allocations in the current scratch frame,
330 : destroys the current scratch frame and makes the previous frame (if
331 : there is one) the current frame (and leaves the caller without
332 : a current frame if there is not one). Assumes the caller is attached
333 : to a scratch memory with at least one frame in use. This cannot fail
334 : from the caller's point of view (if handholding is enabled, it will
335 : abort the caller with a descriptive error message if used obviously
336 : in error). This is freaky fast (O(5) fast asm operations under the
337 : hood). */
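
/* E.g., a typical frame bracketed usage (n here is a hypothetical
   bounded run time value):

     fd_scratch_push();                              ... new frame
     ulong * tmp = (ulong *)fd_scratch_alloc( 8UL, n*sizeof(ulong) );
     ... use tmp; no per-allocation free is needed ...
     fd_scratch_pop();                               ... all frame allocations freed
*/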
338 :
339 : FD_FN_UNUSED static void /* Work around -Winline */
340 40825 : fd_scratch_pop( void ) {
341 : # if FD_SCRATCH_USE_HANDHOLDING
342 24 : if( FD_UNLIKELY( !fd_scratch_private_frame_max ) ) FD_LOG_ERR(( "not attached" ));
343 24 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) FD_LOG_ERR(( "unmatched pop" ));
344 24 : fd_scratch_in_prepare = 0;
345 24 : # endif
346 24 : fd_scratch_private_free = fd_scratch_private_frame[ --fd_scratch_private_frame_cnt ];
347 :
348 : # if FD_HAS_DEEPASAN
349 : /* On a pop() operation, the entire range from fd_scratch_private_free to the
350 : end of the scratch space can be safely poisoned. The region must be aligned
351 : to accommodate asan manual poisoning requirements. */
352 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_ASAN_ALIGN );
353 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
354 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
355 : # endif
356 : #if FD_HAS_MSAN
357 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_MSAN_ALIGN );
358 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
359 : fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
360 : #endif
361 24 : }
362 :
363 : /* fd_scratch_prepare starts an allocation of unknown size and known
364 : alignment align (0 means use default alignment) in the caller's
365 : current scratch frame. Returns a pointer in the caller's address
366 : space with alignment align to the first byte of a region with
367 : fd_scratch_free() (as observed after this function returns) bytes
368 : available. The caller is free to clobber any bytes in this region.
369 :
370 : fd_scratch_publish finishes an in-progress allocation. end points at
371 : the first byte after the final allocation. Assumes there is a
372 : matching prepare. A published allocation can be subsequently
373 : trimmed.
374 :
375 : fd_scratch_cancel cancels an in-progress allocation. This is a no-op
376 : if there is no matching prepare. If the prepare had alignment other
377 : than 1, it is possible that some alignment padding needed for the
378 : allocation will still be used in the caller's current scratch frame.
379 : If this is not acceptable, the prepare should use an alignment of 1
380 : and manually align the return.
381 :
382 : This allows idioms like:
383 :
384 : uchar * p = (uchar *)fd_scratch_prepare( align );
385 :
386 : if( FD_UNLIKELY( fd_scratch_free() < app_max_sz ) ) {
387 :
388 : fd_scratch_cancel();
389 :
390 : ... handle too little scratch space to handle application
391 : ... worst case needs here
392 :
393 : } else {
394 :
395 : ... populate sz bytes to p where sz is in [0,app_max_sz]
396 : p += sz;
397 :
398 : fd_scratch_publish( p );
399 :
400 : ... at this point, scratch is as though
401 : ... fd_scratch_alloc( align, sz ) was called above
402 :
403 : }
404 :
405 : Ideally, every prepare should be matched with a publish or a cancel;
406 : only one prepare can be in progress at a time on a thread and prepares
407 : cannot be nested. As such virtually all other scratch operations
408 : will implicitly cancel any in-progress prepare, including attach /
409 : detach / push / pop / prepare / alloc / trim. */
410 :
411 : FD_FN_UNUSED static void * /* Work around -Winline */
412 947454 : fd_scratch_prepare( ulong align ) {
413 :
414 : # if FD_SCRATCH_USE_HANDHOLDING
415 48 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) FD_LOG_ERR(( "unmatched push" ));
416 48 : if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) FD_LOG_ERR(( "bad align (%lu)", align ));
417 48 : # endif
418 :
419 : # if FD_HAS_DEEPASAN
420 : /* Need 8 byte alignment. */
421 : align = fd_ulong_align_up( align, FD_ASAN_ALIGN );
422 : # endif
423 48 : ulong true_align = fd_scratch_private_true_align( align );
424 48 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align );
425 :
426 : # if FD_SCRATCH_USE_HANDHOLDING
427 48 : if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) FD_LOG_ERR(( "prepare align (%lu) overflow", true_align ));
428 48 : if( FD_UNLIKELY( smem > fd_scratch_private_stop ) ) FD_LOG_ERR(( "prepare align (%lu) needs %lu additional scratch",
429 48 : align, smem - fd_scratch_private_stop ));
430 48 : fd_scratch_in_prepare = 1;
431 48 : # endif
432 :
433 : # if FD_HAS_DEEPASAN
434 : /* At this point the user is able to clobber any bytes in the region. smem is
435 : always going to be at least 8 byte aligned. */
436 : ulong aligned_sz = fd_ulong_align_up( fd_scratch_private_stop - smem, FD_ASAN_ALIGN );
437 : fd_asan_unpoison( (void*)smem, aligned_sz );
438 : # endif
439 :
440 48 : fd_scratch_private_free = smem;
441 48 : return (void *)smem;
442 48 : }
443 :
444 : static inline void
445 757868 : fd_scratch_publish( void * _end ) {
446 757868 : ulong end = (ulong)_end;
447 :
448 : # if FD_SCRATCH_USE_HANDHOLDING
449 48 : if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) FD_LOG_ERR(( "unmatched prepare" ));
450 48 : if( FD_UNLIKELY( end < fd_scratch_private_free ) ) FD_LOG_ERR(( "publish underflow" ));
451 48 : if( FD_UNLIKELY( end > fd_scratch_private_stop ) )
452 0 : FD_LOG_ERR(( "publish needs %lu additional scratch", end-fd_scratch_private_stop ));
453 48 : fd_scratch_in_prepare = 0;
454 48 : # endif
455 :
456 : /* Poison everything that is trimmed off. Conservatively poison potentially
457 : less than the region that is trimmed to respect alignment requirements. */
458 : # if FD_HAS_DEEPASAN
459 : ulong aligned_free = fd_ulong_align_dn( fd_scratch_private_free, FD_ASAN_ALIGN );
460 : ulong aligned_end = fd_ulong_align_up( end, FD_ASAN_ALIGN );
461 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
462 : fd_asan_poison( (void*)aligned_end, aligned_stop - aligned_end );
463 : fd_asan_unpoison( (void*)aligned_free, aligned_end - aligned_free );
464 : # endif
465 : # if FD_HAS_MSAN
466 : ulong aligned_free = fd_ulong_align_dn( fd_scratch_private_free, FD_MSAN_ALIGN );
467 : ulong aligned_end = fd_ulong_align_up( end, FD_MSAN_ALIGN );
468 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
469 : fd_msan_poison( (void*)aligned_end, aligned_stop - aligned_end );
470 : fd_msan_unpoison( (void*)aligned_free, aligned_end - aligned_free );
471 : # endif
472 :
473 48 : fd_scratch_private_free = end;
474 48 : }
475 :
476 : static inline void
477 189586 : fd_scratch_cancel( void ) {
478 :
479 : # if FD_SCRATCH_USE_HANDHOLDING
480 : if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) FD_LOG_ERR(( "unmatched prepare" ));
481 : fd_scratch_in_prepare = 0;
482 : # endif
483 :
484 189586 : }
485 :
486 : /* fd_scratch_alloc allocates sz bytes with alignment align in the
487 : caller's current scratch frame. There should be no prepare in
488 : progress. Note that this has the same function signature as
489 : aligned_alloc (and not by accident). It does have some less
490 : restrictive behaviors though.
491 :
492 : align must be 0 or an integer power of 2. 0 will be treated as
493 : FD_SCRATCH_ALIGN_DEFAULT.
494 :
495 : sz need not be a multiple of align. Further, the underlying
496 : allocator does not implicitly round up sz to an align multiple (as
497 : such, scratch can allocate additional items in any tail padding that
498 : might have been implicitly reserved had it rounded up). That is, if
499 : you really want to round up allocations to a multiple of align, then
500 : manually align up sz ... e.g. pass fd_ulong_align_up(sz,align) when
501 : align is non-zero to this call (this could be implemented as a
502 : compile time mode with some small extra overhead if desirable).
503 :
504 : sz 0 is fine. This will currently return a properly aligned non-NULL
505 : pointer (the allocator might do some allocation under the hood to get
506 : the desired alignment and it is possible this might fail ... there is
507 : a case for returning NULL or an arbitrary but appropriately aligned
508 : non-NULL and this could be implemented as a compile time mode with
509 : some small extra overhead if desirable).
510 :
511 : This cannot fail from the caller's point of view (if handholding is
512 : enabled, it will abort the caller with a descriptive error message if
513 : used obviously in error).
514 :
515 : This is freaky fast (O(5) fast asm operations under the hood). */
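
/* E.g., because sz is not implicitly rounded up, a caller that wants
   aligned_alloc style tail padding can round up manually (the 32 byte
   alignment here is arbitrary for illustration):

     void * p = fd_scratch_alloc( 32UL, fd_ulong_align_up( sz, 32UL ) );
*/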
516 :
517 : FD_FN_UNUSED static void * /* Work around -Winline */
518 : fd_scratch_alloc( ulong align,
519 378827 : ulong sz ) {
520 378827 : ulong smem = (ulong)fd_scratch_prepare( align );
521 378827 : ulong end = smem + sz;
522 :
523 : # if FD_SCRATCH_USE_HANDHOLDING
524 48 : if( FD_UNLIKELY( (end < smem) | (end > fd_scratch_private_stop) ) ) FD_LOG_ERR(( "sz (%lu) overflow", sz ));
525 48 : # endif
526 :
527 48 : fd_scratch_publish( (void *)end );
528 48 : return (void *)smem;
529 378827 : }
530 :
531 : /* fd_scratch_trim trims the size of the most recent scratch allocation
532 : in the current scratch frame (technically it can be used to trim the
533 : size of the entire current scratch frame but doing more than the most
534 : recent scratch allocation is strongly discouraged). Assumes there is
535 : a current scratch frame and the caller is not in a prepare. end
536 : points at the first byte to free in the most recent scratch
537 : allocation (or the first byte after the most recent scratch
538 : allocation). This allows idioms like:
539 :
540 : uchar * p = (uchar *)fd_scratch_alloc( align, max_sz );
541 :
542 : ... populate sz bytes of p where sz is in [0,max_sz]
543 : p += sz;
544 :
545 : fd_scratch_trim( p );
546 :
547 : ... now the thread's scratch is as though original call was
548 : ... p = fd_scratch_alloc( align, sz );
549 :
550 : This cannot fail from the caller's point of view (if handholding is
551 : enabled, this will abort the caller with a descriptive error message
552 : if used obviously in error).
553 :
554 : Note that an allocation can be repeatedly trimmed.
555 :
556 : Note also that trim can nest. E.g. a thread can call a function that
557 : uses scratch with its own properly matched scratch pushes and pops.
558 : On function return, trim will still work on the most recent scratch
559 : alloc in that frame by the caller.
560 :
561 : This is freaky fast (O(1) fast asm operations under the hood). */
562 :
563 : static inline void
564 757610 : fd_scratch_trim( void * _end ) {
565 757610 : ulong end = (ulong)_end;
566 :
567 : # if FD_SCRATCH_USE_HANDHOLDING
568 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) FD_LOG_ERR(( "unmatched push" ));
569 : if( FD_UNLIKELY( end < fd_scratch_private_frame[ fd_scratch_private_frame_cnt-1UL ] ) ) FD_LOG_ERR(( "trim underflow" ));
570 : if( FD_UNLIKELY( end > fd_scratch_private_free ) ) FD_LOG_ERR(( "trim overflow" ));
571 : fd_scratch_in_prepare = 0;
572 : # endif
573 :
574 : # if FD_HAS_DEEPASAN
575 : /* The region to poison should be from _end to the end of the scratch's region.
576 : The same alignment considerations need to be taken into account. */
577 : ulong aligned_end = fd_ulong_align_up( end, FD_ASAN_ALIGN );
578 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
579 : fd_asan_poison( (void*)aligned_end, aligned_stop - aligned_end );
580 : # endif
581 : # if FD_HAS_MSAN
582 : ulong aligned_end = fd_ulong_align_up( end, FD_MSAN_ALIGN );
583 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
584 : fd_msan_poison( (void*)aligned_end, aligned_stop - aligned_end );
585 : # endif
586 :
587 757610 : fd_scratch_private_free = end;
588 757610 : }
589 :
590 : /* fd_scratch_*_is_safe returns false (0) if the operation is obviously
591 : unsafe to do at the time of the call or true otherwise.
592 : Specifically:
593 :
594 : fd_scratch_attach_is_safe() returns 1 if the calling thread is not
595 : already attached to scratch.
596 :
597 : fd_scratch_detach_is_safe() returns 1 if the calling thread is
598 : already attached to scratch.
599 :
600 : fd_scratch_reset_is_safe() returns 1 if the calling thread is already
601 : attached to scratch.
602 :
603 : fd_scratch_push_is_safe() returns 1 if there is at least one frame
604 : available and 0 otherwise.
605 :
606 : fd_scratch_pop_is_safe() returns 1 if there is at least one frame
607 : in use and 0 otherwise.
608 :
609 : fd_scratch_prepare_is_safe( align ) returns 1 if there is a current
610 : frame for the allocation and enough scratch pad memory to start
611 : preparing an allocation with alignment align.
612 :
613 : fd_scratch_publish_is_safe( end ) returns 1 if end is a valid
614 : location to complete an allocation in preparation. If handholding is
615 : enabled, will additionally check that there is a prepare already in
616 : progress.
617 :
618 : fd_scratch_cancel_is_safe() returns 1.
619 :
620 : fd_scratch_alloc_is_safe( align, sz ) returns 1 if there is a current
621 : frame for the allocation and enough scratch pad memory for an
622 : allocation with alignment align and size sz.
623 :
624 : fd_scratch_trim_is_safe( end ) returns 1 if there is a current frame
625 : and that current frame can be trimmed to end safely.
626 :
627 : These are safe to call at any time and are also freaky fast (a
628 : handful of assembly operations under the hood). */
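
/* E.g., a caller that prefers a fallback path over aborting when
   scratch is low might do (fallback_alloc is hypothetical):

     void * p = fd_scratch_alloc_is_safe( align, sz )
              ? fd_scratch_alloc( align, sz )
              : fallback_alloc ( align, sz );
*/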
629 :
630 0 : FD_FN_PURE static inline int fd_scratch_attach_is_safe( void ) { return !fd_scratch_private_frame_max; }
631 0 : FD_FN_PURE static inline int fd_scratch_detach_is_safe( void ) { return !!fd_scratch_private_frame_max; }
632 0 : FD_FN_PURE static inline int fd_scratch_reset_is_safe ( void ) { return !!fd_scratch_private_frame_max; }
633 5998538 : FD_FN_PURE static inline int fd_scratch_push_is_safe ( void ) { return fd_scratch_private_frame_cnt<fd_scratch_private_frame_max; }
634 5907916 : FD_FN_PURE static inline int fd_scratch_pop_is_safe ( void ) { return !!fd_scratch_private_frame_cnt; }
635 :
636 : FD_FN_PURE static inline int
637 45 : fd_scratch_prepare_is_safe( ulong align ) {
638 45 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
639 45 : if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) return 0; /* Bad alignment, compile time typically */
640 45 : ulong true_align = fd_scratch_private_true_align( align ); /* compile time typically */
641 45 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align );
642 45 : if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) return 0; /* alignment overflow */
643 45 : if( FD_UNLIKELY( smem > fd_scratch_private_stop ) ) return 0; /* insufficient scratch */
644 45 : return 1;
645 45 : }
646 :
647 : FD_FN_PURE static inline int
648 45 : fd_scratch_publish_is_safe( void * _end ) {
649 45 : ulong end = (ulong)_end;
650 : # if FD_SCRATCH_USE_HANDHOLDING
651 : if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) return 0; /* Not in prepare */
652 : # endif
653 45 : if( FD_UNLIKELY( end < fd_scratch_private_free ) ) return 0; /* Backward */
654 45 : if( FD_UNLIKELY( end > fd_scratch_private_stop ) ) return 0; /* Out of bounds */
655 45 : return 1;
656 45 : }
657 :
658 : FD_FN_CONST static inline int
659 0 : fd_scratch_cancel_is_safe( void ) {
660 0 : return 1;
661 0 : }
662 :
663 : FD_FN_PURE static inline int
664 : fd_scratch_alloc_is_safe( ulong align,
665 2913362 : ulong sz ) {
666 2913362 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
667 2596115 : if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) return 0; /* Bad align, compile time typically */
668 2596115 : ulong true_align = fd_scratch_private_true_align( align ); /* compile time typically */
669 2596115 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align );
670 2596115 : if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) return 0; /* align overflow */
671 2596115 : ulong free = smem + sz;
672 2596115 : if( FD_UNLIKELY( free < smem ) ) return 0; /* sz overflow */
673 2596115 : if( FD_UNLIKELY( free > fd_scratch_private_stop ) ) return 0; /* too little space */
674 757616 : return 1;
675 2596115 : }
676 :
677 : FD_FN_PURE static inline int
678 0 : fd_scratch_trim_is_safe( void * _end ) {
679 0 : ulong end = (ulong)_end;
680 0 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
681 0 : if( FD_UNLIKELY( end < fd_scratch_private_frame[ fd_scratch_private_frame_cnt-1UL ] ) ) return 0; /* Trim underflow */
682 0 : if( FD_UNLIKELY( end > fd_scratch_private_free ) ) return 0; /* Trim overflow */
683 0 : return 1;
684 0 : }
685 :
686 : /* fd_scratch_vtable is the virtual function table implementing
687 : fd_valloc for fd_scratch. */
688 :
689 : extern const fd_valloc_vtable_t fd_scratch_vtable;
690 :
691 : /* fd_scratch_virtual returns an abstract handle to the fd_scratch join.
692 : Valid for lifetime of scratch frame. fd_valloc_t must be dropped
693 : before scratch frame changes or scratch detaches. */
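
/* E.g., handing scratch to an API that takes a fd_valloc_t (this
   sketch assumes the fd_valloc_malloc wrapper declared in
   fd_valloc.h):

     fd_valloc_t valloc = fd_scratch_virtual();
     void * p = fd_valloc_malloc( valloc, align, sz );
     ... individual frees are not required; the allocation is
     ... reclaimed when the scratch frame pops
*/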
694 :
695 : FD_FN_CONST static inline fd_valloc_t
696 3 : fd_scratch_virtual( void ) {
697 3 : fd_valloc_t valloc = { NULL, &fd_scratch_vtable };
698 3 : return valloc;
699 3 : }
700 :
701 : /* FD_SCRATCH_SCOPE_{BEGIN,END} create a `do { ... } while(0);` scope in
702 : which a temporary scratch frame is available. Nested scopes are
703 : permitted. This scratch frame is automatically destroyed when
704 : exiting the scope normally (e.g. by 'break', 'return', or reaching
705 : the end). Uses a dummy variable with a cleanup attribute under the
706 : hood. U.B. if scope is left abnormally (e.g. longjmp(), exception,
707 : abort(), etc.). Use as follows:
708 :
709 : FD_SCRATCH_SCOPE_BEGIN {
710 : ...
711 : fd_scratch_alloc( ... );
712 : ...
713 : }
714 : FD_SCRATCH_SCOPE_END; */
715 :
716 : FD_FN_UNUSED static inline void
717 79 : fd_scratch_scoped_pop_private( void * _unused ) {
718 79 : (void)_unused;
719 79 : fd_scratch_pop();
720 79 : }
721 :
722 79 : #define FD_SCRATCH_SCOPE_BEGIN do { \
723 79 : fd_scratch_push(); \
724 79 : int __fd_scratch_guard_ ## __LINE__ \
725 79 : __attribute__((cleanup(fd_scratch_scoped_pop_private))) \
726 79 : __attribute__((unused)) = 0; \
727 79 : do
728 :
729 79 : #define FD_SCRATCH_SCOPE_END while(0); } while(0)
730 :
731 : /* fd_alloca is a variant of alloca that works like aligned_alloc. That
732 : is, it returns an allocation of sz bytes with an alignment of at
733 : least align. Like alloca, this allocation will be in the stack frame
734 : of the calling function with a lifetime of until the calling function
735 : returns. Stack overflow handling is likewise identical to alloca
736 : (stack overflows will overlap the top stack guard, typically
737 : triggering a seg fault when the overflow region is touched that will
738 : be caught and handled by the logger to terminate the calling thread
739 : group). As such, like alloca, these really should only be used for
740 : smallish (<< few KiB) quick allocations in bounded recursion depth
741 : circumstances.
742 :
743 : Like fd_scratch_alloc, align must be 0 or an integer power of 2.
744 : 0 will be treated as align_default. align smaller than
745 : align_min will be bumped up to align_min.
746 :
747 : The caller promises the request will not overflow the stack. This has to
748 : be implemented as a macro for linguistic reasons and align should be
749 : safe against multiple evaluation and, due to compiler limitations,
750 : must be a compile time constant. Returns non-NULL on success and
751 : NULL on failure (in most situations, can never fail from the caller's
752 : POV). sz==0 is okay (and will return non-NULL). */
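
/* E.g. (n is a hypothetical smallish bounded run time value):

     float * tmp = (float *)fd_alloca( 4UL, n*sizeof(float) );
     ... use tmp until the calling function returns ...
*/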
753 :
754 : #if FD_HAS_ALLOCA
755 :
756 : /* Work around compiler limitations */
757 33 : #define FD_SCRATCH_PRIVATE_TRUE_ALIGN( align ) ((align) ? (align) : FD_SCRATCH_ALIGN_DEFAULT)
758 :
759 18 : #define fd_alloca(align,sz) __builtin_alloca_with_align( fd_ulong_max( (sz), 1UL ), \
760 18 : 8UL*FD_SCRATCH_PRIVATE_TRUE_ALIGN( (align) ) /*bits*/ )
761 :
762 : /* fd_alloca_check does fd_alloca but it will FD_LOG_CRIT with a
763 : detailed message if the request would cause a stack overflow or leave
764 : so little available free stack that subsequent normal thread
765 : operations would be at risk.
766 :
767 : Note that returning NULL on failure is not an option as this would no
768 : longer be a drop-in instrumented replacement for fd_alloca (this
769 : would also require even more linguistic hacks to keep the fd_alloca
770 : at the appropriate scope). Likewise, testing the allocated region is
771 : within the stack post allocation is not an option as the FD_LOG_CRIT
772 : invocation would then try to use stack with the already overflowed
773 : allocation in it (there is no easy portable way to guarantee an
774 : alloca has been freed short of returning from the function in which
775 : the alloca was performed). Using FD_LOG_ERR instead of FD_LOG_CRIT
776 : is a potentially viable alternative error handling behavior though.
777 :
778 : This has to be implemented as a macro for linguistic reasons. It is
779 : recommended this only be used for development / debugging / testing
780 : purposes (e.g. if you are doing alloca in production that are large
781 : enough you are worried about stack overflow, you probably should be
782 : using fd_scratch, fd_alloc or fd_wksp depending on performance and
783 : persistence needs or, better still, architecting to not need any
784 : temporary memory allocations at all). If the caller's stack
785 : diagnostics could not be successfully initialized (this is logged),
786 : this will always FD_LOG_CRIT. */
787 :
788 : #if !FD_HAS_ASAN
789 :
790 : extern FD_TL ulong fd_alloca_check_private_sz;
791 :
792 : #define fd_alloca_check( align, sz ) \
793 15 : ( fd_alloca_check_private_sz = (sz), \
794 15 : (__extension__({ \
795 15 : ulong _fd_alloca_check_private_pad_max = FD_SCRATCH_PRIVATE_TRUE_ALIGN( (align) ) - 1UL; \
796 15 : ulong _fd_alloca_check_private_footprint = fd_alloca_check_private_sz + _fd_alloca_check_private_pad_max; \
797 15 : if( FD_UNLIKELY( (_fd_alloca_check_private_footprint < _fd_alloca_check_private_pad_max ) | \
798 15 : (_fd_alloca_check_private_footprint > (31UL*(fd_tile_stack_est_free() >> 5))) ) ) \
799 15 : FD_LOG_CRIT(( "fd_alloca_check( " #align ", " #sz " ) stack overflow" )); \
800 15 : })), \
801 15 : fd_alloca( (align), fd_alloca_check_private_sz ) )
802 :
803 : #else /* FD_HAS_ASAN */
804 :
805 : /* AddressSanitizer provides its own alloca safety instrumentation
806 : which are more powerful than the above fd_alloca_check heuristics. */
807 :
808 : #define fd_alloca_check fd_alloca
809 :
810 : #endif /* FD_HAS_ASAN */
811 : #endif /* FD_HAS_ALLOCA */
812 :
813 : FD_PROTOTYPES_END
814 :
815 : #endif /* HEADER_fd_src_util_scratch_fd_scratch_h */
|