Line data Source code
1 : #ifndef HEADER_fd_src_util_scratch_fd_scratch_h
2 : #define HEADER_fd_src_util_scratch_fd_scratch_h
3 :
4 : /* APIs for high performance scratch pad memory allocation. There
5 : are two allocators provided. One is fd_alloca, which is an alignment
6 : aware equivalent of alloca. It is meant for use anywhere alloca
7 : would normally be used. This is only available if the built target
8 : has the FD_HAS_ALLOCA capability. The second is fd_scratch_alloc.
9 : It is meant for use in situations that have very complex and large
10 : temporary memory usage. */
11 :
12 : #include "../tile/fd_tile.h"
13 :
14 : /* FD_SCRATCH_ALIGN_DEFAULT is the default alignment to use for
15 : allocations.
16 :
17 : Default should be at least 16 for consistent cross platform behavior
18 : that is language conformant across a wide range of targets (i.e. the
19 : largest primitive type across all possible build ... practically
20 : sizeof(int128)). This also naturally covers SSE natural alignment on
21 : x86. 8 could be used if features like int128 and so forth are not needed and still
22 : be linguistically conformant (sizeof(ulong) here is the limit).
23 : Likewise, 32, 64, 128 could be used to guarantee all allocations will
24 : have natural AVX/AVX2, natural AVX-512 / cache-line,
25 : adjacent-cache-line-prefetch false sharing avoidance / natural GPU
26 : alignment properties.
27 :
28 : 128 for default was picked as double x86 cache line for ACLPF false
29 : sharing avoidance and for consistency with GPU warp sizes ... i.e.
30 : the default allocation behaviors are naturally interthread
31 : communication false sharing resistant and GPU friendly. This also
32 : naturally covers cases like SSE, AVX, AVX2 and AVX-512. */
33 :
34 4303890 : #define FD_SCRATCH_ALIGN_DEFAULT (128UL) /* integer power-of-2 >=16 */
35 :
36 : /* FD_SCRATCH_{SMEM,FMEM}_ALIGN give the alignment requirements for
37 : the memory regions used for a scratch pad memory. There are not many
38 : restrictions on the SMEM alignment practically other than it be a
39 : reasonable integer power of two. 128 was picked to harmonize with
40 : FD_SCRATCH_ALIGN_DEFAULT (which does have more technical motivations
41 : behind its choice) but this is not strictly required.
42 : FD_SCRATCH_FMEM_ALIGN is required to be sizeof(ulong). */
43 :
44 98313 : #define FD_SCRATCH_SMEM_ALIGN (128UL) /* integer power-of-2, harmonized with ALIGN_DEFAULT */
45 : #define FD_SCRATCH_FMEM_ALIGN (8UL) /* ==sizeof(ulong) but avoids bugs with some compilers */
46 :
47 : FD_PROTOTYPES_BEGIN
48 :
49 : /* Private APIs *******************************************************/
50 :
51 : #if FD_DCHECK_STYLE>0
52 : extern FD_TL int fd_scratch_in_prepare; /* set to 1 by prepare, cleared by the other scratch operations (only exists when FD_DCHECK_STYLE>0) */
53 : #endif
54 :
55 : extern FD_TL ulong fd_scratch_private_start; /* address of the first smem byte (0UL after detach) */
56 : extern FD_TL ulong fd_scratch_private_free; /* address of the first unallocated smem byte */
57 : extern FD_TL ulong fd_scratch_private_stop; /* address just past the last smem byte (start+smax) */
58 :
59 : extern FD_TL ulong * fd_scratch_private_frame; /* fmem, frame[i] holds the value of free saved by the i-th push (NULL after detach) */
60 : extern FD_TL ulong fd_scratch_private_frame_cnt; /* number of frames currently pushed */
61 : extern FD_TL ulong fd_scratch_private_frame_max; /* depth passed to attach, doubles as the "attached" marker (0UL after detach) */
62 :
63 : FD_FN_CONST static inline int
64 3542511 : fd_scratch_private_align_is_valid( ulong align ) {
65 3542511 : return !(align & (align-1UL)); /* returns true if align is a power of 2 or zero (for zero, align-1UL wraps and the AND is still zero), compile time typically */
66 3542511 : }
67 :
68 : FD_FN_CONST static inline ulong
69 3542511 : fd_scratch_private_true_align( ulong align ) {
70 3542511 : return fd_ulong_if( !align, FD_SCRATCH_ALIGN_DEFAULT, align ); /* zero align is treated as FD_SCRATCH_ALIGN_DEFAULT, any other align is used as is, compile time typically */
71 3542511 : }
72 :
73 : /* Public APIs ********************************************************/
74 :
75 : /* Constructor APIs */
76 :
77 : /* fd_scratch_smem_{align,footprint} return the alignment and footprint
78 : of a memory region suitable for use as a scratch pad memory that can
79 : hold up to smax bytes. There are very few restrictions on the nature
80 : of this memory. It could even be just a flat address space that is
81 : not backed by an actual physical memory as far as scratch is
82 : concerned. In typical use cases though, the scratch pad memory
83 : should point to a region of huge or gigantic page backed memory on
84 : the caller's numa node.
85 :
86 : A shared memory region is fine for smem. This could be used
87 : for example to allow other threads / processes to access a scratch
88 : allocation from this thread for the lifetime of a scratch allocation.
89 :
90 : Even more generally, a shared memory region for both smem and fmem
91 : could make it theoretically possible to have a scratch pad memory
92 : that is shared across multiple threads / processes. The API is not
93 : well designed for such though (the main reason to use fmem in shared
94 : memory would be convenience and/or adding hot swapping
95 : functionality). In the common scratch scenario, every thread would
96 : attach to their local join of the shared smem and shared fmem. But
97 : since the operations below are not designed to be thread safe, the
98 : threads would have to protect against concurrent use of push and pop
99 : (and attach would probably need to be tweaked to make it easier to
100 : attach to an already in use scratch pad).
101 :
102 : Compile time allocation is possible via the FD_SCRATCH_SMEM_ALIGN
103 : define. E.g.:
104 :
105 : uchar my_smem[ MY_SMAX ] __attribute__((aligned(FD_SCRATCH_SMEM_ALIGN)));
106 :
107 : will be valid to use as a scratch smem with space for up to MY_SMAX
108 : bytes. */
109 :
110 49158 : FD_FN_CONST static inline ulong fd_scratch_smem_align( void ) { return FD_SCRATCH_SMEM_ALIGN; } /* required smem alignment in bytes */
111 :
112 : FD_FN_CONST static inline ulong
113 49155 : fd_scratch_smem_footprint( ulong smax ) {
114 49155 : return fd_ulong_align_up( smax, FD_SCRATCH_SMEM_ALIGN ); /* smax rounded up to a multiple of the smem alignment */
115 49155 : }
116 :
117 : /* fd_scratch_fmem_{align,footprint} return the alignment and footprint
118 : of a memory region suitable for holding the scratch pad memory
119 : metadata (typically very small). The scratch pad memory will be
120 : capable of holding up to depth scratch frames.
121 :
122 : Compile time allocation is possible via the FD_SCRATCH_FMEM_ALIGN
123 : define. E.g.
124 :
125 : ulong my_fmem[ MY_DEPTH ] __attribute__((aligned(FD_SCRATCH_FMEM_ALIGN)));
126 :
127 : or, even simpler:
128 :
129 : ulong my_fmem[ MY_DEPTH ];
130 :
131 : will be valid to use as a scratch fmem with space for up to depth
132 : frames. The attribute variant is not strictly necessary, just for
133 : consistency with the smem above (where it is required). */
134 :
135 9 : FD_FN_CONST static inline ulong fd_scratch_fmem_align ( void ) { return sizeof(ulong); } /* FD_SCRATCH_FMEM_ALIGN is documented as ==sizeof(ulong) */
136 51 : FD_FN_CONST static inline ulong fd_scratch_fmem_footprint( ulong depth ) { return sizeof(ulong)*depth; } /* one ulong per frame (push stores one saved free value per frame) */
137 :
138 : /* fd_scratch_attach attaches the calling thread to memory regions
139 : sufficient to hold up to smax (positive) bytes and with up to depth
140 : (positive) frames. smem/fmem should have the required alignment and
141 : footprint specified for smax/depth from the above (and be non-NULL).
142 : The caller has a read/write interest in these regions while attached
143 : (and thus the local lifetime of these regions must cover the lifetime
144 : of the attachment). Only one scratch pad memory may be attached to a
145 : caller at a time. This cannot fail from the caller's point of view
146 : (if handholding is enabled, it will abort the caller with a
147 : descriptive error message if used obviously in error). */
148 :
149 : static inline void
150 : fd_scratch_attach( void * smem,
151 : void * fmem,
152 : ulong smax,
153 15 : ulong depth ) {
154 :
155 15 : FD_DCHECK_CRIT( !fd_scratch_private_frame_max, "already attached" ); /* non-zero frame_max marks an existing attachment */
156 15 : FD_DCHECK_CRIT( !!smem, "bad smem" );
157 15 : FD_DCHECK_CRIT( !!fmem, "bad fmem" );
158 15 : FD_DCHECK_CRIT( !!smax, "bad smax" );
159 15 : FD_DCHECK_CRIT( !!depth, "bad depth" );
160 : # if FD_DCHECK_STYLE>0
161 : fd_scratch_in_prepare = 0;
162 : # endif
163 :
164 15 : fd_scratch_private_start = (ulong)smem;
165 15 : fd_scratch_private_free = fd_scratch_private_start; /* nothing allocated yet */
166 15 : fd_scratch_private_stop = fd_scratch_private_start + smax; /* NOTE(review): neither the alignment of smem nor wrap of start+smax is checked here */
167 :
168 15 : fd_scratch_private_frame = (ulong *)fmem;
169 15 : fd_scratch_private_frame_cnt = 0UL; /* no frames pushed yet */
170 15 : fd_scratch_private_frame_max = depth; /* non-zero from here on, which is what the "not attached" checks test */
171 :
172 : # if FD_HAS_DEEPASAN
173 : /* Poison the entire smem region. Underpoison the boundaries to respect
174 : alignment requirements (start is rounded up and stop is rounded down). */
175 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
176 : ulong aligned_end = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
177 : fd_asan_poison( (void*)aligned_start, aligned_end - aligned_start );
178 : # endif
179 : #if FD_HAS_MSAN
180 : /* Mark the entire smem region as uninitialized (same inward rounding as the ASAN case). */
181 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_MSAN_ALIGN );
182 : ulong aligned_end = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
183 : fd_msan_poison( (void*)aligned_start, aligned_end - aligned_start );
184 : #endif
185 15 : }
186 :
187 : /* fd_scratch_detach detaches the calling thread from its current
188 : attachment. Returns smem used on attach and, if opt_fmem is
189 : non-NULL, opt_fmem[0] will contain the fmem used on attach on return.
190 :
191 : This relinquishes the calling thread's read/write interest on these
192 : memory regions. All the caller's scratch frames are popped, any
193 : prepare in progress is canceled and all the caller's scratch
194 : allocations are freed implicitly by this.
195 :
196 : This cannot fail from the caller's point of view (if handholding is
197 : enabled, it will abort the caller with a descriptive error message if
198 : used obviously in error). */
199 :
200 : static inline void *
201 15 : fd_scratch_detach( void ** _opt_fmem ) {
202 :
203 15 : FD_DCHECK_CRIT( !!fd_scratch_private_frame_max, "not attached" );
204 : # if FD_DCHECK_STYLE>0
205 : fd_scratch_in_prepare = 0;
206 : # endif
207 :
208 : # if FD_HAS_DEEPASAN
209 : /* Unpoison the entire scratch space. There should now be an underlying
210 : allocation which has not been poisoned. NOTE(review): there is no FD_HAS_MSAN counterpart here to the fd_msan_poison done at attach -- confirm this is intentional. */
211 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
212 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
213 : fd_asan_unpoison( (void*)aligned_start, aligned_stop - aligned_start );
214 : # endif
215 :
216 15 : void * smem = (void *)fd_scratch_private_start; /* saved before the state is cleared below */
217 15 : void * fmem = (void *)fd_scratch_private_frame;
218 :
219 15 : fd_scratch_private_start = 0UL;
220 15 : fd_scratch_private_free = 0UL;
221 15 : fd_scratch_private_stop = 0UL;
222 :
223 15 : fd_scratch_private_frame = NULL;
224 15 : fd_scratch_private_frame_cnt = 0UL;
225 15 : fd_scratch_private_frame_max = 0UL; /* zero frame_max marks the thread as detached */
226 :
227 15 : if( _opt_fmem ) _opt_fmem[0] = fmem; /* fmem is only reported when the caller asked for it */
228 15 : return smem;
229 15 : }
230 :
231 : /* User APIs */
232 :
233 : /* fd_scratch_{used,free} returns the number of bytes used/free in the
234 : caller's scratch. Returns 0 if not attached. Because of alignment
235 : overheads, an allocation is guaranteed to succeed if free>=sz+align-1
236 : where align is the actual alignment required for the allocation (e.g.
237 : align==0 -> default, align<min -> min). It is guaranteed to fail if
238 : free<sz. It might succeed or fail in between depending on the
239 : alignments of previous allocations. These are freaky fast (O(3)
240 : fast asm operations under the hood). */
241 :
242 9 : static inline ulong fd_scratch_used( void ) { return fd_scratch_private_free - fd_scratch_private_start; } /* detach zeros both operands, so this is 0 after a detach */
243 9 : static inline ulong fd_scratch_free( void ) { return fd_scratch_private_stop - fd_scratch_private_free; } /* detach zeros both operands, so this is 0 after a detach */
244 :
245 : /* fd_scratch_frame_{used,free} returns the number of scratch frames
246 : used/free in the caller's scratch. Returns 0 if not attached. push
247 : is guaranteed to succeed if free is non-zero and guaranteed to fail
248 : otherwise. pop is guaranteed to succeed if used is non-zero and
249 : guaranteed to fail otherwise. These are freaky fast (O(1-3) fast asm
250 : operations under the hood). */
251 :
252 2954172 : static inline ulong fd_scratch_frame_used( void ) { return fd_scratch_private_frame_cnt; } /* frames currently pushed */
253 2999373 : static inline ulong fd_scratch_frame_free( void ) { return fd_scratch_private_frame_max - fd_scratch_private_frame_cnt; } /* frames that can still be pushed */
254 :
255 : /* fd_scratch_reset frees all allocations (if any) and pops all scratch
256 : frames (if any) such that the caller's scratch will be in the same
257 : state it was immediately after attach. The caller must be attached
258 : to a scratch memory to use. This cannot fail from the caller's point
259 : of view (if handholding is enabled, it will abort the caller with a
260 : descriptive error message if used obviously in error). This is
261 : freaky fast (O(3) fast asm operations under the hood). */
262 :
263 : static inline void
264 738 : fd_scratch_reset( void ) {
265 738 : FD_DCHECK_CRIT( !!fd_scratch_private_frame_max, "not attached" );
266 : # if FD_DCHECK_STYLE>0
267 : fd_scratch_in_prepare = 0;
268 : # endif
269 738 : fd_scratch_private_free = fd_scratch_private_start; /* frees every allocation */
270 738 : fd_scratch_private_frame_cnt = 0UL; /* pops every frame */
271 :
272 : /* Poison entire scratch space again (same inward rounding of start / stop as done at attach). */
273 : # if FD_HAS_DEEPASAN
274 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_ASAN_ALIGN );
275 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
276 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
277 : # endif
278 : # if FD_HAS_MSAN
279 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_start, FD_MSAN_ALIGN );
280 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
281 : fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
282 : # endif
283 738 : }
284 :
285 : /* fd_scratch_push creates a new scratch frame and makes it the current
286 : frame. Assumes caller is attached to a scratch with space for a new
287 : frame. This cannot fail from the caller's point of view (if
288 : handholding is enabled, it will abort the caller with a descriptive
289 : error message if used obviously in error). This is freaky fast (O(5)
290 : fast asm operations under the hood). */
291 :
292 : FD_FN_UNUSED static void /* Work around -Winline */
293 45399 : fd_scratch_push( void ) {
294 45399 : FD_DCHECK_CRIT( !!fd_scratch_private_frame_max, "not attached" );
295 45399 : FD_DCHECK_CRIT( fd_scratch_private_frame_cnt < fd_scratch_private_frame_max, "too many frames" );
296 : # if FD_DCHECK_STYLE>0
297 : fd_scratch_in_prepare = 0;
298 : # endif
299 45399 : fd_scratch_private_frame[ fd_scratch_private_frame_cnt++ ] = fd_scratch_private_free; /* remember where this frame starts so pop can restore free */
300 :
301 : /* Poison to end of scratch region to account for case of in-prep allocation
302 : getting implicitly cancelled. */
303 : # if FD_HAS_DEEPASAN
304 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_ASAN_ALIGN );
305 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
306 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
307 : # endif
308 : #if FD_HAS_MSAN
309 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_MSAN_ALIGN );
310 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
311 : fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
312 : #endif
313 45399 : }
314 :
315 : /* fd_scratch_pop frees all allocations in the current scratch frame,
316 : destroys the current scratch frame and makes the previous frame (if
317 : there is one) the current stack frame (and leaves the caller without
318 : a current frame if there is not one). Assumes the caller is attached
319 : to a scratch memory with at least one frame in use. This cannot fail
320 : from the caller's point of view (if handholding is enabled, it will
321 : abort the caller with a descriptive error message if used obviously
322 : in error). This is freaky fast (O(5) fast asm operations under the
323 : hood). */
324 :
325 : FD_FN_UNUSED static void /* Work around -Winline */
326 40749 : fd_scratch_pop( void ) {
327 40749 : FD_DCHECK_CRIT( !!fd_scratch_private_frame_max, "not attached" );
328 40749 : FD_DCHECK_CRIT( !!fd_scratch_private_frame_cnt, "unmatched pop" );
329 : # if FD_DCHECK_STYLE>0
330 : fd_scratch_in_prepare = 0;
331 : # endif
332 40749 : fd_scratch_private_free = fd_scratch_private_frame[ --fd_scratch_private_frame_cnt ]; /* restore free to the value saved by the matching push */
333 :
334 : # if FD_HAS_DEEPASAN
335 : /* On a pop() operation, the entire range from fd_scratch_private_free to the
336 : end of the scratch space can be safely poisoned. The region must be aligned
337 : to accommodate asan manual poisoning requirements. NOTE(review): if free is unaligned and stop lies in the same FD_ASAN_ALIGN granule, aligned_start exceeds aligned_stop and the size below wraps -- confirm this cannot happen in practice. */
338 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_ASAN_ALIGN );
339 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
340 : fd_asan_poison( (void*)aligned_start, aligned_stop - aligned_start );
341 : # endif
342 : #if FD_HAS_MSAN
343 : ulong aligned_start = fd_ulong_align_up( fd_scratch_private_free, FD_MSAN_ALIGN );
344 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
345 : fd_msan_poison( (void*)aligned_start, aligned_stop - aligned_start );
346 : #endif
347 40749 : }
348 :
349 : /* fd_scratch_prepare starts an allocation of unknown size and known
350 : alignment align (0 means use default alignment) in the caller's
351 : current scratch frame. Returns a pointer in the caller's address
352 : space with alignment align to the first byte of a region with
353 : fd_scratch_free() (as observed after this function returns) bytes
354 : available. The caller is free to clobber any bytes in this region.
355 :
356 : fd_scratch_publish finishes an in-progress allocation. end points at
357 : the first byte after the final allocation. Assumes there is a
358 : matching prepare. A published allocation can be subsequently
359 : trimmed.
360 :
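361 : fd_scratch_cancel cancels an in-progress allocation. It does not modify the scratch state itself; if handholding is enabled
362 : (FD_DCHECK_STYLE>0), a cancel with no prepare in progress is flagged via FD_DCHECK_CRIT. If the prepare had alignment other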
363 : than 1, it is possible that some alignment padding needed for the
364 : allocation will still be used in the caller's current scratch frame.
365 : If this is not acceptable, the prepare should use an alignment of 1
366 : and manually align the return.
367 :
368 : This allows idioms like:
369 :
370 : uchar * p = (uchar *)fd_scratch_prepare( align );
371 :
372 : if( FD_UNLIKELY( fd_scratch_free() < app_max_sz ) ) {
373 :
374 : fd_scratch_cancel();
375 :
376 : ... handle too little scratch space to handle application
377 : ... worst case needs here
378 :
379 : } else {
380 :
381 : ... populate sz bytes to p where sz is in [0,app_max_sz]
382 : p += sz;
383 :
384 : fd_scratch_publish( p );
385 :
386 : ... at this point, scratch is as though
387 : ... fd_scratch_alloc( align, sz ) was called above
388 :
389 : }
390 :
391 : Ideally every prepare should be matched with a publish or a cancel,
392 : only one prepare can be in-progress at a time on a thread and prepares
393 : cannot be nested. As such virtually all other scratch operations
394 : will implicitly cancel any in-progress prepare, including attach /
395 : detach / push / pop / prepare / alloc / trim. */
396 :
397 : FD_FN_UNUSED static void * /* Work around -Winline */
398 952026 : fd_scratch_prepare( ulong align ) {
399 :
400 952026 : FD_DCHECK_CRIT( !!fd_scratch_private_frame_cnt, "unmatched push" ); /* there must be a current frame */
401 952026 : FD_DCHECK_CRIT( fd_scratch_private_align_is_valid( align ), "bad align" );
402 :
403 : # if FD_HAS_DEEPASAN
404 : /* Need 8 byte alignment. A zero align stays zero here and still maps to the default below. */
405 : align = fd_ulong_align_up( align, FD_ASAN_ALIGN );
406 : # endif
407 952026 : ulong true_align = fd_scratch_private_true_align( align );
408 952026 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align ); /* candidate start of the allocation: free rounded up to true_align */
409 :
410 952026 : FD_DCHECK_CRIT( smem >= fd_scratch_private_free, "prepare align overflow" ); /* the round up wrapped */
411 952026 : FD_DCHECK_CRIT( smem <= fd_scratch_private_stop, "prepare overflow" ); /* smem==stop is allowed (zero bytes available) */
412 : # if FD_DCHECK_STYLE>0
413 : fd_scratch_in_prepare = 1;
414 : # endif
415 :
416 : # if FD_HAS_DEEPASAN
417 : /* At this point the user is able to clobber any bytes in the region. smem is
418 : always going to be at least 8 byte aligned. */
419 : ulong aligned_sz = fd_ulong_align_up( fd_scratch_private_stop - smem, FD_ASAN_ALIGN );
420 : fd_asan_unpoison( (void*)smem, aligned_sz );
421 : # endif
422 :
423 952026 : fd_scratch_private_free = smem; /* the alignment padding is consumed now and is not returned by a later cancel */
424 952026 : return (void *)smem;
425 952026 : }
426 :
427 : static inline void
428 761577 : fd_scratch_publish( void * _end ) {
429 761577 : ulong end = (ulong)_end; /* address of the first byte after the finished allocation */
430 :
431 : # if FD_DCHECK_STYLE>0
432 : FD_DCHECK_CRIT( !!fd_scratch_in_prepare, "unmatched prepare" );
433 : # endif
434 761577 : FD_DCHECK_CRIT( end >= fd_scratch_private_free, "publish underflow" ); /* free is where the matching prepare left the allocation start */
435 761577 : FD_DCHECK_CRIT( end <= fd_scratch_private_stop, "publish overflow" );
436 : # if FD_DCHECK_STYLE>0
437 : fd_scratch_in_prepare = 0;
438 : # endif
439 :
440 : /* Poison everything that is trimmed off. Conservatively poison potentially
441 : less than the region that is trimmed to respect alignment requirements. */
442 : # if FD_HAS_DEEPASAN
443 : ulong aligned_free = fd_ulong_align_dn( fd_scratch_private_free, FD_ASAN_ALIGN );
444 : ulong aligned_end = fd_ulong_align_up( end, FD_ASAN_ALIGN );
445 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
446 : fd_asan_poison( (void*)aligned_end, aligned_stop - aligned_end );
447 : fd_asan_unpoison( (void*)aligned_free, aligned_end - aligned_free );
448 : # endif
449 : # if FD_HAS_MSAN
450 : ulong aligned_free = fd_ulong_align_dn( fd_scratch_private_free, FD_MSAN_ALIGN ); /* MSAN granularity, not FD_ASAN_ALIGN, for consistency with aligned_stop and the other MSAN paths */
451 : ulong aligned_end = fd_ulong_align_up( end, FD_MSAN_ALIGN );
452 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
453 : fd_msan_poison( (void*)aligned_end, aligned_stop - aligned_end );
454 : fd_msan_unpoison( (void*)aligned_free, aligned_end - aligned_free );
455 : # endif
456 :
457 761577 : fd_scratch_private_free = end; /* commit: [old free,end) is now the allocation */
458 761577 : }
459 :
460 : static inline void
461 190449 : fd_scratch_cancel( void ) {
462 :
463 : # if FD_DCHECK_STYLE>0
464 : FD_DCHECK_CRIT( !!fd_scratch_in_prepare, "unmatched prepare" ); /* with handholding, a cancel without a prepare in progress is flagged */
465 : fd_scratch_in_prepare = 0; /* the only state touched: free keeps the aligned value set by prepare */
466 : # endif
467 :
468 190449 : }
469 :
470 : /* fd_scratch_alloc allocates sz bytes with alignment align in the
471 : caller's current scratch frame. There should be no prepare in
472 : progress. Note that this has same function signature as
473 : aligned_alloc (and not by accident). It does have some less
474 : restrictive behaviors though.
475 :
476 : align must be 0 or an integer power of 2. 0 will be treated as
477 : FD_SCRATCH_ALIGN_DEFAULT.
478 :
479 : sz need not be a multiple of align. Further, the underlying
480 : allocator does not implicitly round up sz to an align multiple (as
481 : such, scratch can allocate additional items in any tail padding that
482 : might have been implicitly reserved had it rounded up). That is, if
483 : you really want to round up allocations to a multiple of align, then
484 : manually align up sz ... e.g. pass fd_ulong_align_up(sz,align) when
485 : align is non-zero to this call (this could be implemented as a
486 : compile time mode with some small extra overhead if desirable).
487 :
488 : sz 0 is fine. This will currently return a properly aligned non-NULL
489 : pointer (the allocator might do some allocation under the hood to get
490 : the desired alignment and it is possible this might fail ... there is
491 : a case for returning NULL or an arbitrary but appropriately aligned
492 : non-NULL and this could be implemented as a compile time mode with
493 : some small extra overhead if desirable).
494 :
495 : This cannot fail from the caller's point of view (if handholding is
496 : enabled, it will abort the caller with a descriptive error message if
497 : used obviously in error).
498 :
499 : This is freaky fast (O(5) fast asm operations under the hood). */
500 :
501 : FD_FN_UNUSED static void * /* Work around -Winline */
502 : fd_scratch_alloc( ulong align,
503 380745 : ulong sz ) {
504 380745 : ulong smem = (ulong)fd_scratch_prepare( align ); /* aligned start of the allocation, prepare also validates align and the current frame */
505 380745 : ulong end = smem + sz; /* may wrap for a huge sz, which the first check below detects */
506 :
507 380745 : FD_DCHECK_CRIT( end >= smem, "sz overflow" ); /* smem+sz wrapped */
508 380745 : FD_DCHECK_CRIT( end <= fd_scratch_private_stop, "sz overflow" ); /* allocation would run past the end of smem */
509 :
510 380745 : fd_scratch_publish( (void *)end ); /* commits [smem,end) as the allocation */
511 380745 : return (void *)smem;
512 380745 : }
513 :
514 : /* fd_scratch_trim trims the size of the most recent scratch allocation
515 : in the current scratch frame (technically it can be used to trim the
516 : size of the entire current scratch frame but doing more than the most
517 : recent scratch allocation is strongly discouraged). Assumes there is
518 : a current scratch frame and the caller is not in a prepare. end
519 : points at the first byte to free in the most recent scratch
520 : allocation (or the first byte after the most recent scratch
521 : allocation). This allows idioms like:
522 :
523 : uchar * p = (uchar *)fd_scratch_alloc( align, max_sz );
524 :
525 : ... populate sz bytes of p where sz is in [0,max_sz]
526 : p += sz;
527 :
528 : fd_scratch_trim( p );
529 :
530 : ... now the thread's scratch is as though original call was
531 : ... p = fd_scratch_alloc( align, sz );
532 :
533 : This cannot fail from the caller's point of view (if handholding is
534 : enabled, this will abort the caller with a descriptive error message
535 : if used obviously in error).
536 :
537 : Note that an allocation can be repeatedly trimmed.
538 :
539 : Note also that trim can nest. E.g. a thread can call a function that
540 : uses scratch with its own properly matched scratch pushes and pops.
541 : On function return, trim will still work on the most recent scratch
542 : alloc in that frame by the caller.
543 :
544 : This is freaky fast (O(1) fast asm operations under the hood). */
545 :
546 : static inline void
547 761379 : fd_scratch_trim( void * _end ) {
548 761379 : ulong end = (ulong)_end; /* address of the first byte to give back */
549 :
550 761379 : FD_DCHECK_CRIT( !!fd_scratch_private_frame_cnt, "unmatched push" ); /* there must be a current frame */
551 761379 : FD_DCHECK_CRIT( end >= fd_scratch_private_frame[ fd_scratch_private_frame_cnt-1UL ], "trim underflow" ); /* cannot trim below the start of the current frame */
552 761379 : FD_DCHECK_CRIT( end <= fd_scratch_private_free, "trim overflow" ); /* trim can only shrink, never grow */
553 : # if FD_DCHECK_STYLE>0
554 : fd_scratch_in_prepare = 0;
555 : # endif
556 :
557 : # if FD_HAS_DEEPASAN
558 : /* The region to poison should be from _end to the end of the scratch's region.
559 : The same alignment considerations need to be taken into account. */
560 : ulong aligned_end = fd_ulong_align_up( end, FD_ASAN_ALIGN );
561 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_ASAN_ALIGN );
562 : fd_asan_poison( (void*)aligned_end, aligned_stop - aligned_end );
563 : # endif
564 : # if FD_HAS_MSAN
565 : ulong aligned_end = fd_ulong_align_up( end, FD_MSAN_ALIGN );
566 : ulong aligned_stop = fd_ulong_align_dn( fd_scratch_private_stop, FD_MSAN_ALIGN );
567 : fd_msan_poison( (void*)aligned_end, aligned_stop - aligned_end );
568 : # endif
569 :
570 761379 : fd_scratch_private_free = end; /* bytes in [end,old free) are available again */
571 761379 : }
572 :
573 : /* fd_scratch_*_is_safe returns false (0) if the operation is obviously
574 : unsafe to do at the time of the call or true otherwise.
575 : Specifically:
576 :
577 : fd_scratch_attach_is_safe() returns 1 if the calling thread is not
578 : already attached to scratch.
579 :
580 : fd_scratch_detach_is_safe() returns 1 if the calling thread is
581 : already attached to scratch.
582 :
583 : fd_scratch_reset_is_safe() returns 1 if the calling thread is already
584 : attached to scratch.
585 :
586 : fd_scratch_push_is_safe() returns 1 if there is at least one frame
587 : available and 0 otherwise.
588 :
589 : fd_scratch_pop_is_safe() returns 1 if there is at least one frame
590 : in use and 0 otherwise.
591 :
592 : fd_scratch_prepare_is_safe( align ) returns 1 if there is a current
593 : frame for the allocation and enough scratch pad memory to start
594 : preparing an allocation with alignment align.
595 :
596 : fd_scratch_publish_is_safe( end ) returns 1 if end is a valid
597 : location to complete an allocation in preparation. If handholding is
598 : enabled, will additionally check that there is a prepare already in
599 : progress.
600 :
601 : fd_scratch_cancel_is_safe() returns 1.
602 :
603 : fd_scratch_alloc_is_safe( align, sz ) returns 1 if there is a current
604 : frame for the allocation and enough scratch pad memory for an
605 : allocation with alignment align and size sz.
606 :
607 : fd_scratch_trim_is_safe( end ) returns 1 if there is a current frame
608 : and that current frame can be trimmed to end safely.
609 :
610 : These are safe to call at any time and are also freaky fast (a handful of
611 : assembly operations). */
612 :
613 0 : FD_FN_PURE static inline int fd_scratch_attach_is_safe( void ) { return !fd_scratch_private_frame_max; } /* zero frame_max means not attached */
614 0 : FD_FN_PURE static inline int fd_scratch_detach_is_safe( void ) { return !!fd_scratch_private_frame_max; } /* non-zero frame_max means attached */
615 0 : FD_FN_PURE static inline int fd_scratch_reset_is_safe ( void ) { return !!fd_scratch_private_frame_max; } /* non-zero frame_max means attached */
616 5998530 : FD_FN_PURE static inline int fd_scratch_push_is_safe ( void ) { return fd_scratch_private_frame_cnt<fd_scratch_private_frame_max; } /* also 0 after a detach, which zeros both cnt and max */
617 5908020 : FD_FN_PURE static inline int fd_scratch_pop_is_safe ( void ) { return !!fd_scratch_private_frame_cnt; } /* at least one frame is pushed */
618 :
619 : FD_FN_PURE static inline int
620 0 : fd_scratch_prepare_is_safe( ulong align ) {
621 0 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
622 0 : if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) return 0; /* Bad alignment, compile time typically */
623 0 : ulong true_align = fd_scratch_private_true_align( align ); /* compile time typically. NOTE(review): fd_scratch_prepare also rounds align up to FD_ASAN_ALIGN when FD_HAS_DEEPASAN, which is not mirrored here */
624 0 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align ); /* where the prepared region would start */
625 0 : if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) return 0; /* alignment overflow */
626 0 : if( FD_UNLIKELY( smem > fd_scratch_private_stop ) ) return 0; /* insufficient scratch */
627 0 : return 1;
628 0 : }
629 :
630 : FD_FN_PURE static inline int
631 0 : fd_scratch_publish_is_safe( void * _end ) {
632 0 : ulong end = (ulong)_end; /* proposed first byte after the allocation */
633 0 : # if FD_DCHECK_STYLE>0
634 0 : if( FD_UNLIKELY( !fd_scratch_in_prepare ) ) return 0; /* Not in prepare (only checked when handholding is enabled) */
635 0 : # endif
636 0 : if( FD_UNLIKELY( end < fd_scratch_private_free ) ) return 0; /* Backward (end is before the start of the prepared region) */
637 0 : if( FD_UNLIKELY( end > fd_scratch_private_stop ) ) return 0; /* Out of bounds (end is past the end of smem) */
638 0 : return 1;
639 0 : }
640 :
641 : FD_FN_CONST static inline int
642 0 : fd_scratch_cancel_is_safe( void ) {
643 0 : return 1; /* unconditional. NOTE(review): with FD_DCHECK_STYLE>0, fd_scratch_cancel itself checks that a prepare is in progress, which this does not reflect -- confirm intended */
644 0 : }
645 :
646 : FD_FN_PURE static inline int
647 : fd_scratch_alloc_is_safe( ulong align,
648 2913399 : ulong sz ) {
649 2913399 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
650 2590485 : if( FD_UNLIKELY( !fd_scratch_private_align_is_valid( align ) ) ) return 0; /* Bad align, compile time typically */
651 2590485 : ulong true_align = fd_scratch_private_true_align( align ); /* compile time typically. NOTE(review): fd_scratch_prepare also rounds align up to FD_ASAN_ALIGN when FD_HAS_DEEPASAN, which is not mirrored here */
652 2590485 : ulong smem = fd_ulong_align_up( fd_scratch_private_free, true_align ); /* where the allocation would start */
653 2590485 : if( FD_UNLIKELY( smem < fd_scratch_private_free ) ) return 0; /* align overflow */
654 2590485 : ulong free = smem + sz; /* what fd_scratch_private_free would become after the allocation */
655 2590485 : if( FD_UNLIKELY( free < smem ) ) return 0; /* sz overflow */
656 2590485 : if( FD_UNLIKELY( free > fd_scratch_private_stop ) ) return 0; /* too little space */
657 761379 : return 1;
658 2590485 : }
659 :
660 : FD_FN_PURE static inline int
661 0 : fd_scratch_trim_is_safe( void * _end ) {
662 0 : ulong end = (ulong)_end; /* proposed new value of fd_scratch_private_free */
663 0 : if( FD_UNLIKELY( !fd_scratch_private_frame_cnt ) ) return 0; /* No current frame */
664 0 : if( FD_UNLIKELY( end < fd_scratch_private_frame[ fd_scratch_private_frame_cnt-1UL ] ) ) return 0; /* Trim underflow (end is before the start of the current frame) */
665 0 : if( FD_UNLIKELY( end > fd_scratch_private_free ) ) return 0; /* Trim overflow (end is past what is currently allocated) */
666 0 : return 1;
667 0 : }
668 :
669 : /* FD_SCRATCH_SCOPE_{BEGIN,END} create a `do { ... } while(0);` scope in
670 : which a temporary scratch frame is available. Nested scopes are
671 : permitted. This scratch frame is automatically destroyed when
672 : exiting the scope normally (e.g. by 'break', 'return', or reaching
673 : the end). Uses a dummy variable with a cleanup attribute under the
674 : hood. U.B. if scope is left abnormally (e.g. longjmp(), exception,
675 : abort(), etc.). Use as follows:
676 :
677 : FD_SCRATCH_SCOPE_BEGIN {
678 : ...
679 : fd_scratch_alloc( ... );
680 : ...
681 : }
682 : FD_SCRATCH_SCOPE_END; */
683 :
684 : FD_FN_UNUSED static inline void
685 63 : fd_scratch_scoped_pop_private( void * _unused ) {
686 63 : (void)_unused;
687 63 : fd_scratch_pop();
688 63 : }
689 :
/* FD_SCRATCH_PRIVATE_SCOPE_CAT concatenates a and b after macro
   expanding them.  The extra level of indirection is required because
   operands of ## are not macro expanded (pasting __LINE__ directly
   yields the literal identifier suffix "__LINE__" instead of the line
   number). */

#define FD_SCRATCH_PRIVATE_SCOPE_CAT_(a,b) a ## b
#define FD_SCRATCH_PRIVATE_SCOPE_CAT(a,b)  FD_SCRATCH_PRIVATE_SCOPE_CAT_(a,b)

/* FD_SCRATCH_SCOPE_BEGIN opens a do { block, does a fd_scratch_push and
   declares a guard variable whose cleanup handler is
   fd_scratch_scoped_pop_private (which does the matching
   fd_scratch_pop).  The guard name embeds the line number of the macro
   invocation so that nested scopes do not shadow each other's guard.
   The trailing "do" is completed by the user supplied { ... } body and
   FD_SCRATCH_SCOPE_END. */

#define FD_SCRATCH_SCOPE_BEGIN do {                                  \
  fd_scratch_push();                                                 \
  int FD_SCRATCH_PRIVATE_SCOPE_CAT( __fd_scratch_guard_, __LINE__ )  \
    __attribute__((cleanup(fd_scratch_scoped_pop_private)))          \
    __attribute__((unused)) = 0;                                     \
  do

/* FD_SCRATCH_SCOPE_END closes the inner "do" and the outer "do {"
   opened by FD_SCRATCH_SCOPE_BEGIN.  The caller supplies the final
   semicolon. */

#define FD_SCRATCH_SCOPE_END while(0); } while(0)
698 :
699 : /* fd_alloca is a variant of alloca that works like aligned_alloc. That
700 : is, it returns an allocation of sz bytes with an alignment of at
701 : least align. Like alloca, this allocation will be in the stack frame
702 : of the calling function with a lifetime of until the calling function
703 : returns. Stack overflow handling is likewise identical to alloca
704 : (stack overflows will overlap the top stack guard, typically
705 : triggering a seg fault when the overflow region is touched that will
706 : be caught and handled by the logger to terminate the calling thread
707 : group). As such, like alloca, these really should only be used for
708 : smallish (<< few KiB) quick allocations in bounded recursion depth
709 : circumstances.
710 :
711 : Like fd_scratch_alloc, align must be 0 or a non-negative integer
712 : power of 2. 0 will be treated as align_default. align smaller than
713 : align_min will be bumped up to align_min.
714 :
715 : The caller promises request will not overflow the stack. This has to
716 : be implemented as a macro for linguistic reasons and align should be
717 : safe against multiple evaluation and, due to compiler limitations,
718 : must be a compile time constant. Returns non-NULL on success and
719 : NULL on failure (in most situations, can never fail from the caller's
720 : POV). sz==0 is okay (and will return non-NULL). */
721 :
722 : #if FD_HAS_ALLOCA
723 :
/* Work around compiler limitations: this is a macro so that the result
   is usable where the compiler needs a compile time constant.  It
   yields FD_SCRATCH_ALIGN_DEFAULT when align is 0 and align itself
   otherwise.  align is evaluated more than once. */

#define FD_SCRATCH_PRIVATE_TRUE_ALIGN( align ) ((!(align)) ? FD_SCRATCH_ALIGN_DEFAULT : (align))

/* The size passed to the builtin is max(sz,1) and the builtin takes
   its alignment in bits, hence the byte alignment is scaled by 8. */

#define fd_alloca(align,sz)                                                          \
  __builtin_alloca_with_align( fd_ulong_max( (sz), 1UL ),                            \
                               FD_SCRATCH_PRIVATE_TRUE_ALIGN( (align) )*8UL /*bits*/ )
729 :
730 : /* fd_alloca_check does fd_alloca but it will FD_LOG_CRIT with a
731 : detailed message if the request would cause a stack overflow or leave
732 : so little available free stack that subsequent normal thread
733 : operations would be at risk.
734 :
735 : Note that returning NULL on failure is not an option as this would no
736 : longer be a drop-in instrumented replacement for fd_alloca (this
737 : would also require even more linguistic hacks to keep the fd_alloca
738 : at the appropriate scope). Likewise, testing the allocated region is
739 : within the stack post allocation is not an option as the FD_LOG_CRIT
740 : invocation would then try to use stack with the already overflowed
741 : allocation in it (there is no easy portable way to guarantee an
742 : alloca has been freed short of returning from the function in which
743 : the alloca was performed). Using FD_LOG_ERR instead of FD_LOG_CRIT
744 : is a potentially viable alternative error handling behavior though.
745 :
746 : This has to be implemented as a macro for linguistic reasons. It is
747 : recommended this only be used for development / debugging / testing
748 : purposes (e.g. if you are doing alloca in production that are large
749 : enough you are worried about stack overflow, you probably should be
750 : using fd_scratch, fd_alloc or fd_wksp depending on performance and
751 : persistence needs or, better still, architecting to not need any
752 : temporary memory allocations at all). If the caller's stack
753 : diagnostics could not be successfully initialized (this is logged),
754 : this will always FD_LOG_CRIT. */
755 :
756 : #if !FD_HAS_ASAN
757 :
758 : extern FD_TL ulong fd_alloca_check_private_sz;
759 :
760 : #define fd_alloca_check( align, sz ) \
761 3 : ( fd_alloca_check_private_sz = (sz), \
762 3 : (__extension__({ \
763 3 : ulong _fd_alloca_check_private_pad_max = FD_SCRATCH_PRIVATE_TRUE_ALIGN( (align) ) - 1UL; \
764 3 : ulong _fd_alloca_check_private_footprint = fd_alloca_check_private_sz + _fd_alloca_check_private_pad_max; \
765 3 : if( FD_UNLIKELY( (_fd_alloca_check_private_footprint < _fd_alloca_check_private_pad_max ) | \
766 3 : (_fd_alloca_check_private_footprint > (31UL*(fd_tile_stack_est_free() >> 5))) ) ) \
767 3 : FD_LOG_CRIT(( "fd_alloca_check( " #align ", " #sz " ) stack overflow" )); \
768 3 : })), \
769 3 : fd_alloca( (align), fd_alloca_check_private_sz ) )
770 :
771 : #else /* FD_HAS_ASAN */
772 :
/* AddressSanitizer provides its own alloca safety instrumentation,
   which is more powerful than the fd_alloca_check heuristics used in
   non-ASAN builds, so under ASAN fd_alloca_check simply forwards to
   fd_alloca. */

#define fd_alloca_check( align, sz ) fd_alloca( align, sz )
777 :
778 : #endif /* FD_HAS_ASAN */
779 : #endif /* FD_HAS_ALLOCA */
780 :
781 : FD_PROTOTYPES_END
782 :
783 : #endif /* HEADER_fd_src_util_scratch_fd_scratch_h */
|