Line data Source code
1 : #ifndef HEADER_fd_src_util_wksp_fd_wksp_private_h
2 : #define HEADER_fd_src_util_wksp_fd_wksp_private_h
3 :
4 : #include "fd_wksp.h"
5 :
6 : /* If FD_WKSP_LOCK_RECLAIM==0, do not try to recover the lock from
7 : dead processes. This is useful, for example, if we know that the
8 : lock will not get acquired by another process, or that if another
9 : acquiring process dies that all potential users will get exited. It
10 : prevents a syscall on various common workspace paths (eg, alloc). */
11 :
12 : #ifndef FD_WKSP_LOCK_RECLAIM
13 : #define FD_WKSP_LOCK_RECLAIM 0
14 : #endif
15 :
16 : /* FD_WKSP_PRIVATE_PINFO_IDX_NULL is the pinfo index value used to
17 : indicate NULL */
18 :
19 26418427791 : #define FD_WKSP_PRIVATE_PINFO_IDX_NULL ((ulong)UINT_MAX)
20 :
21 : /* A fd_wksp_private_pinfo_t specifies details about a partition in a
22 : workspace and its relationship to other partitions in that workspace.
23 :
24 : FD_WKSP_PRIVATE_PINFO_ALIGN is an integer power of 2 and
25 : FD_WKSP_PRIVATE_PINFO_FOOTPRINT will be a multiple of align.
26 :
27 : If a partition is not idle:
28 :
29 : - [gaddr_lo,gaddr_hi) specify the range of offsets covered by this
30 : partition.
31 :
32 : wksp_gaddr_lo <= gaddr_lo < gaddr_hi <= wksp_gaddr_hi
33 :
34 : such that partitions always have at least 1 byte and are contained
35 : within the workspace's data region.
36 :
37 : - tag==0 indicates this partition is space in the workspace free
38 : for use and the partition is in the free treap, fast findable by
39 : its size. Otherwise, this partition is allocated and the partition
40 : is in the used treap, fast O(1) findable by any in range gaddr.
41 :
42 : - prev_cidx, next_cidx give the index (in compressed form) of the
43 : previous / next partition if present or IDX_NULL if this is the
44 : workspace partition head / tail partition (in which case gaddr_lo /
45 : gaddr_hi will be wksp->gaddr_lo / wksp->gaddr_hi). That is, for
46 : partition idx where idx is in [0,wksp->part_max):
47 :
48 : ulong prev_idx = fd_wksp_private_pinfo_idx( pinfo[ idx ].prev_cidx );
49 : if( fd_wksp_private_pinfo_idx_is_null( prev_idx ) ) {
50 : ... at this point:
51 : ... idx == fd_wksp_private_pinfo_idx( wksp->part_head_cidx ) );
52 : ... pinfo[ idx ].gaddr_lo == wksp->gaddr_lo );
53 : } else {
54 : ... at this point:
55 : ... idx ==_fd_wksp_private_pinfo_idx( pinfo[ prev_idx ].next_cidx );
56 : ... pinfo[ idx ].gaddr_lo == pinfo[ prev_idx ].gaddr_hi;
57 : }
58 :
59 : and:
60 :
61 : ulong next_idx = fd_wksp_private_pinfo_idx( pinfo[ idx ].next_cidx );
62 : if( fd_wksp_private_pinfo_idx_is_null( next_idx ) ) {
63 : ... at this point:
64 : ... idx == fd_wksp_private_pinfo_idx( wksp->part_tail_cidx ) );
65 : ... pinfo[ idx ].gaddr_hi == wksp->gaddr_hi );
66 : } else {
67 : ... at this point:
68 : ... idx ==_fd_wksp_private_pinfo_idx( pinfo[ next_idx ].prev_cidx );
69 : ... pinfo[ idx ].gaddr_hi == pinfo[ next_idx ].gaddr_lo;
70 : }
71 :
72 : - If partition idx is in the used treap, {left,right}_cidx specify
73 : the partition indices of the root of the left and right subtrees.
74 :
75 : The used treap obeys the binary search tree property that all
76 : partitions in the left/right subtree (if any) cover a range of
77 : offsets strictly lower/higher than range covered by idx.
78 :
79 : parent_cidx specifies idx's parent tree (if any). If idx is the
80 : wksp used tree root, parent_cidx will specify IDX_NULL and
81 : wksp->part_used_cidx will specify idx.
82 :
83 : The used treap also obeys the heap property to make it well
84 : balanced on average. Specifically, the idx's parent's heap
85 : priority will be at least idx's heap priority.
86 :
87 : in_same will be 0 and same_cidx will specify IDX_NULL as no used
88 : partitions can overlap.
89 :
90 : - If partition idx is in the free treap, if partition idx is not
91 : in a list of same sized partitions, in_same will be 0 and
92 : {left,right}_cidx specify the partition indices of the root of the
93 : left and right subtrees.
94 :
95 : The free treap obeys the binary search tree property that all
96 : partitions in the left/right subtree (if any) have partition sizes
97 : strictly lower/higher than partition idx's size.
98 :
99 : parent_cidx specifies idx's parent tree (if any). If idx is the
100 : wksp free tree root, parent_cidx will specify IDX_NULL and
101 : wksp->part_free_cidx will specify idx.
102 :
103 : The free treap also obeys the heap property to make it well
104 : balanced on average. Specifically, the idx's parent's heap
105 : priority will be at least idx's heap priority.
106 :
107 : If there are additional partitions of the same size to partition
108 : idx, same_cidx will refer to the next partition of the same size.
109 :
110 : If partition idx is in a list of same sized partitions, in_same
111 : will be 1 and parent_cidx / same_cidx will specify the prev / next
112 : index of additional partitions of the same size. same_cidx will
113 : specify IDX_NULL if no more.
114 :
115 : - heap_prio is a random value used as described above.
116 :
117 : - stack_cidx and cycle_tag are for internal use */
118 :
119 : /* TODO: Consider align 32/ footprint 96 without compressed indices if
120 : ever needing more than ~4B partitions. */
121 :
122 : #define FD_WKSP_PRIVATE_PINFO_ALIGN (64UL) /* At most FD_WKSP_ALIGN */
123 : #define FD_WKSP_PRIVATE_PINFO_FOOTPRINT (64UL)
124 :
125 : struct __attribute__((aligned(FD_WKSP_PRIVATE_PINFO_ALIGN))) fd_wksp_private_pinfo {
126 : ulong gaddr_lo; /* If in idle stack, 0 */
127 : ulong gaddr_hi; /* ", 0 */
128 : ulong tag; /* ", 0 */
129 : uint heap_prio : 31; /* 30 bit priority and 1 bit free to use for infinite priority bulk tree ops */
130 : uint in_same : 1; /* 1 if in a same list and 0 otherwise */
131 : uint prev_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
132 : uint next_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
133 : uint left_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
134 : uint right_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
135 : uint parent_cidx; /* ", cidx of next idle or fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) if no more */
136 : uint same_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
137 : uint stack_cidx; /* internal use */
138 : ulong cycle_tag; /* internal use */
139 : };
140 :
141 : typedef struct fd_wksp_private_pinfo fd_wksp_private_pinfo_t;
142 :
143 : /* FD_WKSP_MAGIC is an ideally unique number that specifies the precise
144 : memory layout of a fd_wksp. */
145 :
146 531 : #define FD_WKSP_MAGIC (0xF17EDA2C3731C591UL) /* F17E=FIRE,DA2C/3R<>DANCER,31/C59<>WKSP,0<>0 --> FIRE DANCER WKSP VERSION 1 */
147 :
148 : /* fd_wksp_private specifies the detailed layout of the internals of a
149 : fd_wksp_t */
150 :
151 : struct fd_wksp_private {
152 :
153 : /* This point is FD_WKSP_ALIGN aligned */
154 :
155 : /* This fields are static and mostly in the first cache line */
156 :
157 : ulong magic; /* ==FD_WKSP_MAGIC */
158 : ulong part_max; /* Max wksp partitions */
159 : ulong data_max; /* Data region */
160 : ulong gaddr_lo; /* ==fd_wksp_private_data_off( part_max ), data region covers offsets [gaddr_lo,gaddr_hi) */
161 : ulong gaddr_hi; /* ==gaddr_lo + data_max, offset gaddr_hi is to 1 byte footer */
162 : char name[ FD_SHMEM_NAME_MAX ]; /* (Convenience) backing fd_shmem region cstr name */
163 : uint seed; /* Heap priority random number seed, arbitrary */
164 :
165 : /* These fields are dynamic and in the adjacent cache line */
166 :
167 : uint idle_top_cidx; /* Stack of partition infos not in use, parent_idx is next pointer */
168 : uint part_head_cidx; /* Index for info about the leftmost partition */
169 : uint part_tail_cidx; /* Index for info about the rightmost partition */
170 : uint part_used_cidx; /* Treap of partitions that are currently used (tag!=0), searchable by gaddr */
171 : uint part_free_cidx; /* Treap of partitions that are currently free (tag==0), searchable by size */
172 : ulong cycle_tag; /* Used for cycle detection */
173 : ulong owner; /* thread group id of the owner or NULL otherwise */
174 :
175 : /* IMPORTANT! The "single-source-of-truth" for what is currently
176 : used (and its tags) is the set of non-zero tagged partitions in the
177 : partition info array. The idle stack, partition list, used treap
178 : and free treap are auxiliary data structuring that can be
179 : reconstructed at any time from this single source of truth.
180 :
181 : Conversely, if there accidental or deliberate data corruption of
182 : the wksp metadata resulting in a conflict between what is stored
183 : in the partition info array and the auxiliary data structures,
184 : the partition info array governs. */
185 :
186 : /* Padding to FD_WKSP_PRIVATE_PINFO_ALIGN here */
187 :
188 : /* part_max pinfo here */
189 : /* data_max byte data region here */
190 : /* 1 footer byte here */
191 : /* Padding to FD_WKSP_ALIGN here */
192 : };
193 :
194 : FD_PROTOTYPES_BEGIN
195 :
196 : /* fd_wksp_private_pinfo_sz returns the size of a partition in bytes.
197 : Assumes pinfo points to the pinfo of a partition in a current local
198 : join. Will be positive. */
199 :
200 : FD_FN_PURE static inline ulong
201 7136493510 : fd_wksp_private_pinfo_sz( fd_wksp_private_pinfo_t const * pinfo ) {
202 7136493510 : return pinfo->gaddr_hi - pinfo->gaddr_lo;
203 7136493510 : }
204 :
205 : /* fd_wksp_private_{part,data}_off return the wksp offset of the
206 : pinfo array and the data region. data_off assumes part_max is a
207 : value that will not overflow. */
208 :
209 : FD_FN_CONST static inline ulong
210 0 : fd_wksp_private_pinfo_off( void ) {
211 0 : return 128UL; /* fd_ulong_align_up( sizeof(fd_wksp_t), FD_WKSP_PRIVATE_PINFO_ALIGN ); */
212 0 : }
213 :
214 : FD_FN_CONST static inline ulong
215 5364 : fd_wksp_private_data_off( ulong part_max ) {
216 5364 : return fd_wksp_private_pinfo_off() + part_max*sizeof(fd_wksp_private_pinfo_t);
217 5364 : }
218 :
219 : /* fd_wksp_private_pinfo returns the location of wksp pinfo array in the
220 : caller's address space. Assumes wksp is a current local join.
221 : fd_wksp_private_pinfo_const is a const-correct version. */
222 :
223 : FD_FN_CONST static inline fd_wksp_private_pinfo_t *
224 587977926 : fd_wksp_private_pinfo( fd_wksp_t * wksp ) {
225 587977926 : return (fd_wksp_private_pinfo_t *)(((ulong)wksp) + fd_wksp_private_pinfo_off());
226 587977926 : }
227 :
228 : FD_FN_CONST static inline fd_wksp_private_pinfo_t const *
229 0 : fd_wksp_private_pinfo_const( fd_wksp_t const * wksp ) {
230 0 : return (fd_wksp_private_pinfo_t const *)(((ulong)wksp) + fd_wksp_private_pinfo_off());
231 0 : }
232 :
233 : /* fd_wksp_private_pinfo_{cidx,idx} compresses / uncompresses a pinfo index */
234 :
235 8698208668 : static inline uint fd_wksp_private_pinfo_cidx( ulong idx ) { return (uint) idx; }
236 17330034808 : static inline ulong fd_wksp_private_pinfo_idx ( uint cidx ) { return (ulong)cidx; }
237 :
238 : /* fd_wksp_private_pinfo_idx_is_null returns 1 if idx is
239 : FD_WKSP_PRIVATE_PINFO_IDX_NULL and 0 otherwise */
240 :
241 22318511025 : static inline int fd_wksp_private_pinfo_idx_is_null( ulong idx ) { return idx==FD_WKSP_PRIVATE_PINFO_IDX_NULL; }
242 :
243 : /* pinfo idle stack APIs **********************************************/
244 :
245 : /* fd_wksp_private_idle_stack_is_empty returns 1 if there are no idle
246 : partitions and 0 otherwise. Also returns 1 if corruption is
247 : detected. Assumes wksp is a current local join. */
248 :
249 : static inline int
250 7446 : fd_wksp_private_idle_stack_is_empty( fd_wksp_t * wksp ) {
251 7446 : return fd_wksp_private_pinfo_idx( wksp->idle_top_cidx ) >= wksp->part_max;
252 7446 : }
253 :
254 : /* fd_wksp_private_idle_stack_pop pops an idle partition off wksp's idle
255 : stack. Assumes the caller knows idle stack is not empty. The caller
256 : is promised that the popped partition has [gaddr_lo,gaddr_hi) = [0,0)
257 : tag 0, {prev, next, left, right, same, parent}_cidx specify IDX_NULL.
258 : Further, heap_prio should have been assigned a random value.
259 : stack_idx and cycle_tag are for internal use. */
260 :
261 : static inline ulong /* Assumes in [0,part_max) */
262 : fd_wksp_private_idle_stack_pop( fd_wksp_t * wksp, /* Assumes current local join */
263 116699511 : fd_wksp_private_pinfo_t * pinfo ) { /* == fd_wksp_private_pinfo( wksp ) */
264 116699511 : ulong i = fd_wksp_private_pinfo_idx( wksp->idle_top_cidx );
265 : # if FD_HAS_DEEPASAN
266 : fd_asan_unpoison( &pinfo[ i ], FD_WKSP_PRIVATE_PINFO_FOOTPRINT );
267 : # endif
268 116699511 : wksp->idle_top_cidx = pinfo[ i ].parent_cidx;
269 116699511 : pinfo[ i ].parent_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
270 116699511 : return i;
271 116699511 : }
272 :
273 : /* fd_wksp_private_idle_stack_push pushes partition i onto the idle
274 : stack. Assumes the caller knows i is not currently in the idle
275 : stack, partitioning, used treap or free treap. */
276 :
277 : static inline void
278 : fd_wksp_private_idle_stack_push( ulong i, /* Assumes in [0,part_max) */
279 : fd_wksp_t * wksp, /* Assumes current local join */
280 152579889 : fd_wksp_private_pinfo_t * pinfo ) { /* == fd_wksp_private_pinfo( wksp ) */
281 152579889 : pinfo[ i ].gaddr_lo = 0UL;
282 152579889 : pinfo[ i ].gaddr_hi = 0UL;
283 152579889 : pinfo[ i ].tag = 0U;
284 152579889 : pinfo[ i ].in_same = 0U;
285 152579889 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
286 152579889 : pinfo[ i ].next_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
287 152579889 : pinfo[ i ].left_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
288 152579889 : pinfo[ i ].right_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
289 152579889 : pinfo[ i ].same_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
290 152579889 : pinfo[ i ].parent_cidx = wksp->idle_top_cidx;
291 152579889 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( i );
292 :
293 : # if FD_HAS_DEEPASAN
294 : fd_asan_poison( &pinfo[ i ], FD_WKSP_PRIVATE_PINFO_FOOTPRINT );
295 : # endif
296 152579889 : }
297 :
298 : /* pinfo used treap APIs **********************************************/
299 :
300 : /* fd_wksp_private_used_treap_query queries wksp's used treap for the
301 : used partition that holds gaddr. On success, returns the requested
302 : partition idx, in [0,part_max), and, on failure, returns IDX_NULL.
303 : Reasons for failure include gaddr is not in a used partition and
304 : internal treap corruption detected. Might consume a wksp cycle tag
305 : and clobber partition cycle tags. Reasonably fast O(lg N) where N is
306 : the number of used partitions. */
307 :
308 : ulong
309 : fd_wksp_private_used_treap_query( ulong gaddr,
310 : fd_wksp_t * wksp,
311 : fd_wksp_private_pinfo_t * pinfo );
312 :
313 : /* fd_wksp_private_used_treap_insert inserts partition n into wksp's
314 : used treap. Assumes n is not in the idle stack, used treap or free
315 : treap. Does not care if n is in the partitioning or not. Reasonably
316 : fast O(lg N) where N is the number of used partitions.
317 :
318 : Partition n should have [gaddr_lo,gaddr_hi) and heap_prio initialized
319 : on entry (heap_prio should be a random value). tag need not be
320 : initialized but it is assumed that the caller will set the tag to its
321 : final value on success to make the partition officially used. This
322 : will initialize {in_same, left, right, same, parent}_cidx. This will
323 : ignore {prev,next}_cidx. This might consume a wksp cycle tag and
324 : clobber partition stack_cidx and cycle_tag fields.
325 :
326 : Returns FD_WKSP_SUCCESS (zero) on success and a FD_WKSP_ERR_*
327 : (negative) on failure (logs details for failure). Reasons for
328 : failure include n is not in [0,part_max), n's range is not in wksp
329 : data region, n was detected as already inserted (this detection is
330 : not guaranteed), treap internal connectivity issues were detected
331 : (complete detection not guaranteed), and n overlaps with at least one
332 : element already inserted into the treap.
333 :
334 : On failure n and the treap itself were not modified (except possibly
335 : clobbering of stack_cidx and cycle_tag). Note that failure reasons
336 : are either user error or memory corruption. This cannot fail in
337 : normal operating circumstances. */
338 :
339 : int
340 : fd_wksp_private_used_treap_insert( ulong n,
341 : fd_wksp_t * wksp, /* Assumes current local join */
342 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
343 :
344 : /* fd_wksp_private_used_treap_remove removes partition d from wksp's
345 : used treap. Assumes d in the used treap, not in the free treap, not
346 : in the idle stack. Does not care if d is in the partitioning.
347 : Reasonably fast O(lg N) where N is the number of used partitions.
348 : This might consume a wksp cycle tag and clobber partition stack_cidx
349 : and cycle_tag fields.
350 :
351 : Returns FD_WKSP_SUCCESS (zero) on success and a FD_WKSP_ERR_*
352 : (negative) on failure (logs details for failure). Reasons for
353 : failure include d is not in [0,part_max) and treap internal
354 : connectivity issues were detected (complete detection not
355 : guaranteed).
356 :
357 : Note that failure reasons are either user error or memory corruption.
358 : This cannot fail in normal operating circumstances. */
359 :
360 : int
361 : fd_wksp_private_used_treap_remove( ulong d,
362 : fd_wksp_t * wksp, /* Assumes current local join */
363 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
364 :
365 : /* pinfo free treap APIs **********************************************/
366 :
367 : /* fd_wksp_private_free_treap_query queries wksp's free treap for the
368 : smallest partition of at least sz. On success, returns the index of
369 : a partition in the free treap suitable for sz, in [0,part_max), and,
370 : on failure, returns IDX_NULL. Reasons for failure include sz zero,
371 : sz is larger than any free partition, and internal treap corruption
372 : was detected. Might consume a wksp cycle tag and clobber partition
373 : cycle tags. Reasonably fast O(lg N) where N is the number of used
374 : partitions. */
375 :
376 : ulong
377 : fd_wksp_private_free_treap_query( ulong sz,
378 : fd_wksp_t * wksp, /* Assumes current local join */
379 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
380 :
381 : /* fd_wksp_private_free_treap_insert inserts partition n into wksp's
382 : free treap. Assumes n is not in the idle stack, used treap or free
383 : treap. Does not care if n is in the partitioning or not. Reasonably
384 : fast O(lg N) where N is the number of partitions in the free treap.
385 :
386 : Partition n should have [gaddr_lo,gaddr_hi) and heap_prio initialized
387 : on entry (heap_prio should be a random value). tag need not be
388 : initialized but it is assumed that the caller will zero the tag
389 : beforehand to make the partition officially free. This will
390 : initialize {in_same, left, right, same, parent}_cidx. This will
391 : ignore {prev,next}_cidx. This might consume a wksp cycle tag and
392 : clobber the partition stack_cidx and cycle_tag fields.
393 :
394 : Returns FD_WKSP_SUCCESS (zero) on success and a FD_WKSP_ERR_*
395 : (negative) on failure (logs details for failure). Reasons for
396 : failure include n is not in [0,part_max), n's range is not in wksp
397 : data region, n's tag is not zero, n was detected as already inserted
398 : (this detection is not guaranteed), treap internal connectivity
399 : issues were detected (complete detection not guaranteed).
400 :
401 : If n's size exactly matches the size of partition already in the
402 : treap, n will be pushed onto that partition's same stack rather than
403 : inserted into the treap.
404 :
405 : On failure n and the treap itself were not modified (except possibly
406 : clobbering of stack_cidx and cycle_tag). Note that failures reasons
407 : are either user error or memory corruption. This has no failures in
408 : normal operating circumstances. */
409 :
410 : int
411 : fd_wksp_private_free_treap_insert( ulong n,
412 : fd_wksp_t * wksp, /* Assumes current local join */
413 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
414 :
415 : /* fd_wksp_private_free_treap_same_is_empty returns 1 if the same list
416 : for d is empty and 0 if not. Returns 1 if corruption in detected.
417 : Assumes d is in the free treap. */
418 :
419 : static inline int
420 : fd_wksp_private_free_treap_same_is_empty( ulong d,
421 : fd_wksp_t * wksp, /* Assumes current local join */
422 80603721 : fd_wksp_private_pinfo_t * pinfo ) { /* == fd_wksp_private_pinfo( wksp ) */
423 80603721 : ulong part_max = wksp->part_max;
424 80603721 : return fd_wksp_private_pinfo_idx( pinfo[ d ].same_cidx )>=part_max;
425 80603721 : }
426 :
427 : /* fd_wksp_private_free_treap_same_remove removes the first partition
428 : from d's same list. Assumes the caller knows d's same list is not
429 : empty. The caller is promised that returned partition has the same
430 : size as d. */
431 :
432 : static inline ulong
433 : fd_wksp_private_free_treap_same_remove( ulong d,
434 : fd_wksp_t * wksp, /* Assumes current local join */
435 3604728 : fd_wksp_private_pinfo_t * pinfo ) { /* == fd_wksp_private_pinfo( wksp ) */
436 3604728 : ulong part_max = wksp->part_max;
437 3604728 : ulong i = fd_wksp_private_pinfo_idx( pinfo[ d ].same_cidx );
438 3604728 : ulong j = fd_wksp_private_pinfo_idx( pinfo[ i ].same_cidx );
439 3604728 : /**/ pinfo[ d ].same_cidx = fd_wksp_private_pinfo_cidx( j );
440 3604728 : if( j<part_max ) pinfo[ j ].parent_cidx = fd_wksp_private_pinfo_cidx( d );
441 3604728 : pinfo[ i ].in_same = 0U;
442 3604728 : pinfo[ i ].same_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
443 3604728 : pinfo[ i ].parent_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
444 3604728 : return i;
445 3604728 : }
446 :
447 : /* fd_wksp_private_free_treap_remove removes partition d from wksp's
448 : free treap. Assumes d in the free treap, not in the used treap, not
449 : in the idle stack. Does not care if d is in the partitioning.
450 : Reasonably fast O(lg N) where N is the number of free partitions.
451 : This might consume a wksp cycle tag and clobber partition stack_cidx
452 : and cycle_tag fields. There is an edge case where d's can be swapped
453 : with another same sized partition.
454 :
455 : Returns FD_WKSP_SUCCESS (zero) on success and a FD_WKSP_ERR_*
456 : (negative) on failure (logs details for failure). Reasons for
457 : failure include d is not in [0,part_max) and treap internal
458 : connectivity issues were detected (complete detection not
459 : guaranteed).
460 :
461 : Note that failure reasons are either user error or memory corruption.
462 : This cannot fail in normal operating circumstances. */
463 :
464 : int
465 : fd_wksp_private_free_treap_remove( ulong d,
466 : fd_wksp_t * wksp, /* Assumes current local join */
467 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
468 :
469 : /* private admin APIs *************************************************/
470 :
471 : /* fd_wksp_private_lock locks wksp. Assumes wksp is a current local
472 : join. If wksp is already locked, this will wait for the caller. If
473 : this detects that the caller died while holding the lock, it will try
474 : to steal the lock from the dead caller and cleanup any incomplete
475 : operation the caller was doing. Returns FD_WKSP_SUCCESS (0) if the
476 : lock was acquired or FD_WKSP_ERR_CORRUPT if the lock could not be
477 : obtained because memory corruption was detected while trying to
478 : recover from a dead caller that corrupted the wksp memory. */
479 :
480 : int
481 : fd_wksp_private_lock( fd_wksp_t * wksp );
482 :
483 : /* fd_wksp_private_unlock unlocks a locked wksp. Assumes wksp is a
484 : current local join and the caller has the lock */
485 :
486 : static inline void
487 587977248 : fd_wksp_private_unlock( fd_wksp_t * wksp ) {
488 587977248 : FD_COMPILER_MFENCE();
489 587977248 : FD_VOLATILE( wksp->owner ) = ULONG_MAX;
490 587977248 : FD_COMPILER_MFENCE();
491 587977248 : }
492 :
493 : /* private checkpt/restore APIs ***************************************/
494 : /* FIXME: MOVE THIS TO PUBLIC HEADER? */
495 :
496 : /* FD_WKSP_CHECKPT_{V1,V2}_{BINFO,UINFO}_MAX give the maximum byte size
497 : (including the terminating '\0') of a decompressed {v1,v2} checkpt
498 : {build,user} info cstr. */
499 :
500 9 : #define FD_WKSP_CHECKPT_V1_BINFO_MAX (16384UL)
501 9 : #define FD_WKSP_CHECKPT_V1_UINFO_MAX (16384UL)
502 :
503 : #define FD_WKSP_CHECKPT_V2_BINFO_MAX (16384UL)
504 : #define FD_WKSP_CHECKPT_V2_UINFO_MAX (16384UL)
505 :
506 : /* A fd_wksp_checkpt_v2_hdr_t gives the byte layout of frame 0 of a wksp
507 : v2 checkpt. This frame contains the style, compression algo used for
508 : the info, cgroup and appendix frames and fd_wksp_preview information
509 : uncompressed. */
510 :
511 : struct fd_wksp_checkpt_v2_hdr {
512 : ulong magic; /* Must be first, ==FD_WKSP_MAGIC */
513 : int style; /* Must be second, wksp checkpt style */
514 : int frame_style_compressed; /* frame style used for compressed frames */
515 : uint reserved; /* header padding */
516 : char name[ FD_SHMEM_NAME_MAX ]; /* cstr holding the original wksp name (note: FD_SHMEM_NAME_MAX==FD_LOG_NAME_MAX==40) */
517 : uint seed; /* wksp seed when checkpointed (probably same used to construct) */
518 : ulong part_max; /* part_max used to construct the wksp */
519 : ulong data_max; /* data_max used to construct the wksp */
520 : };
521 :
522 : typedef struct fd_wksp_checkpt_v2_hdr fd_wksp_checkpt_v2_hdr_t;
523 :
524 : /* A fd_wksp_checkpt_v2_info_t gives the byte layout of frame 1 of a
525 : wksp v2 checkpt. frame 1 immediately follows frame 0 and this frame
526 : contains the info structure followed compactly by the corresponding
527 : cstr (including the terminating '\0') stored consecutively in the
528 : same order. The size fields indicate the buffer layout. This frame
529 : is compressed according hdr/ftr specification. */
530 :
531 : struct fd_wksp_checkpt_v2_info {
532 : ulong mode;
533 : long wallclock;
534 : ulong app_id;
535 : ulong thread_id;
536 : ulong host_id;
537 : ulong cpu_id;
538 : ulong group_id;
539 : ulong tid;
540 : ulong user_id;
541 : /* FIXME: CONSIDER MAKING THESE ALL UCHAR / USHORT / 4 BYTE RESERVED */
542 : ulong sz_app; /* in [1,FD_LOG_NAME_MAX ~ 40B] */
543 : ulong sz_thread; /* " */
544 : ulong sz_host; /* " */
545 : ulong sz_cpu; /* " */
546 : ulong sz_group; /* " */
547 : ulong sz_user; /* " */
548 : ulong sz_path; /* in [1,PATH_MAX ~ 4KiB] */
549 : ulong sz_binfo; /* in [1,FD_WKSP_CHECKPT_V2_BINFO_MAX ~ 16KiB] */
550 : ulong sz_uinfo; /* in [1,FD_WKSP_CHECKPT_V2_UINFO_MAX ~ 16KiB] */
551 : };
552 :
553 : typedef struct fd_wksp_checkpt_v2_info fd_wksp_checkpt_v2_info_t;
554 :
555 : /* A v2 info frame is followed by zero or more volumes. A volume
556 : consists of zero or more cgroup frames and an appendix frame.
557 : Volumes are followed by a frame with a footer command and then an
558 : uncompressed footer frame.
559 :
560 : A cgroup frame starts with a zero or more meta commands that describe
561 : the allocations it contains followed by a data command that indicates
562 : the cgroup data section follows.
563 :
564 : An appendix frame starts with an appendix command, giving the number
565 : of cgroup frames it covers and the offset to the previous appendix
566 : frame (0 if the first appendix frame). This is followed by a ulong
567 : array with checkpt offsets to those cgroup frames followed a ulong
568 : array with the number of allocations in each cgroup frame. An
569 : appendix covers all cgroup frames between it and the previous
570 : appendix frame (or info frame if the first appendix).
571 :
572 : The last volume is followed by a compressed frame with a sole volumes
573 : command. The volumes command gives the offset of the appendix of the
574 : last volume (or 0 if there are no volumes). (This allows the final
575 : frame to be uncompressed while all the volumes can be compressed.)
576 :
577 : An uncompressed footer frame follows indicating the v2 checkpt is
578 : done. The command gives the total number of cgroup frames in the
579 : checkpt and the offset to the last volume's appendix (or 0 if no
580 : volumes).
581 :
582 : A fd_wksp_checkpt_v2_cmd_t supports writing an arbitrarily large
583 : checkpt single pass with only small upfront bounded allocation while
584 : supporting both streaming and parallel restore of those frames. */
585 :
586 : union fd_wksp_checkpt_v2_cmd {
587 : struct { ulong tag; /* > 0 */ ulong gaddr_lo; ulong gaddr_hi; } meta;
588 : struct { ulong tag; /* ==0 */ ulong cgroup_cnt; /* ==ULONG_MAX */ ulong frame_off; /* ==ULONG_MAX */ } data;
589 : struct { ulong tag; /* ==0 */ ulong cgroup_cnt; /* < ULONG_MAX */ ulong frame_off; /* < ULONG_MAX */ } appendix;
590 : struct { ulong tag; /* ==0 */ ulong cgroup_cnt; /* ==ULONG_MAX */ ulong frame_off; /* < ULONG_MAX */ } volumes;
591 : };
592 :
593 : typedef union fd_wksp_checkpt_v2_cmd fd_wksp_checkpt_v2_cmd_t;
594 :
595 : FD_FN_PURE static inline int
596 75 : fd_wksp_checkpt_v2_cmd_is_meta( fd_wksp_checkpt_v2_cmd_t const * cmd ) {
597 75 : return cmd->meta.tag > 0UL;
598 75 : }
599 :
600 : FD_FN_PURE static inline int
601 21 : fd_wksp_checkpt_v2_cmd_is_data( fd_wksp_checkpt_v2_cmd_t const * cmd ) {
602 21 : return (cmd->data.tag==0UL) & (cmd->data.cgroup_cnt==ULONG_MAX) & (cmd->data.frame_off==ULONG_MAX);
603 21 : }
604 :
605 : FD_FN_PURE static inline int
606 63 : fd_wksp_checkpt_v2_cmd_is_appendix( fd_wksp_checkpt_v2_cmd_t const * cmd ) {
607 63 : return (cmd->appendix.tag==0UL) & (cmd->appendix.cgroup_cnt<ULONG_MAX) & (cmd->appendix.frame_off<ULONG_MAX);
608 63 : }
609 :
610 : FD_FN_PURE static inline int
611 0 : fd_wksp_checkpt_v2_cmd_is_volumes( fd_wksp_checkpt_v2_cmd_t const * cmd ) {
612 0 : return (cmd->volumes.tag==0UL) & (cmd->volumes.cgroup_cnt==ULONG_MAX) & (cmd->volumes.frame_off<ULONG_MAX);
613 0 : }
614 :
615 : /* A fd_wksp_checkpt_v2_ftr_t gives the byte layout of the final frame
616 : of a wksp v2 checkpt. This frame contains this footer uncompressed.
617 : This is wksp checkpt header backwards plus some additional
618 : information to allow users to seek from the end of the checkpt to the
619 : header (checkpt_sz), to the appendix frame (frame_off_appendix) and
620 : do any allocations upfront necessary to completely unpack the
621 : checkpt. */
622 :
623 : struct fd_wksp_checkpt_v2_ftr {
624 : ulong alloc_cnt; /* total number of allocations in checkpt */
625 : ulong cgroup_cnt; /* total number of cgroups in checkpt */
626 : ulong volume_cnt; /* total number of volumes in checkpt */
627 : ulong frame_off; /* byte offset (relative to header initial byte) of the volumes command */
628 : ulong checkpt_sz; /* checkpt byte size, from header initial byte to the footer final byte inclusive (note that
629 : this can be used to convert offsets relative to header initial byte to offsets relative to
630 : the end-of-file / the one past the final footer byte) */
631 : ulong data_max; /* should match header */
632 : ulong part_max; /* " */
633 : uint seed; /* " */
634 : char name[ FD_SHMEM_NAME_MAX ]; /* " */
635 : uint reserved; /* " */
636 : int frame_style_compressed; /* " */
637 : int style; /* " */
638 : ulong unmagic; /* ==~FD_WKSP_MAGIC */
639 : };
640 :
641 : typedef struct fd_wksp_checkpt_v2_ftr fd_wksp_checkpt_v2_ftr_t;
642 :
643 : /* fd_wksp_private_{checkpt,restore,printf}_v1 provide the v1
644 : implementations of {checkpt,restore,printf}. That is, checkpt_v1
645 : will only write a v1 style checkpt while the {restore,printt}_v1 can
646 : assume that the path exclusively contains a v1 style checkpt. These
647 : can assume that the input arguments have been validated by their
648 : caller. The printf implementation can further assume verbose is
649 : positive and the verbose 0 information has already been printed. For
650 : checkpt/restore, if tpool is non-NULL, the operation will be
651 : parallelized over tpool threads [t0,t1). Assumes the caller is
652 : thread t0 and threads (t0,t1) are available for thread dispatch. */
653 :
654 : int
655 : fd_wksp_private_checkpt_v1( fd_tpool_t * tpool,
656 : ulong t0,
657 : ulong t1,
658 : fd_wksp_t * wksp,
659 : char const * path,
660 : ulong mode,
661 : char const * uinfo );
662 :
663 : int
664 : fd_wksp_private_restore_v1( fd_tpool_t * tpool,
665 : ulong t0,
666 : ulong t1,
667 : fd_wksp_t * wksp,
668 : char const * path,
669 : uint new_seed );
670 :
671 : int
672 : fd_wksp_private_printf_v1( int fd,
673 : char const * path,
674 : int verbose );
675 :
676 : /* Similarly for v2. Note that style==FD_WKSP_CHECKPT_STYLE_V3 in the
677 : fd_wksp_checkpt function becomes a FD_WKSP_CHECKPT_STYLE_V2 with a
678 : FD_CHECKPT_FRAME_STYLE_LZ4 cgroup frames in the checkpt itself. */
679 :
680 : int
681 : fd_wksp_private_checkpt_v2( fd_tpool_t * tpool,
682 : ulong t0,
683 : ulong t1,
684 : fd_wksp_t * wksp,
685 : char const * path,
686 : ulong mode,
687 : char const * uinfo,
688 : int frame_style_compresed );
689 :
690 : int
691 : fd_wksp_private_restore_v2( fd_tpool_t * tpool,
692 : ulong t0,
693 : ulong t1,
694 : fd_wksp_t * wksp,
695 : char const * path,
696 : uint new_seed );
697 :
698 : int
699 : fd_wksp_private_printf_v2( int fd,
700 : char const * path,
701 : int verbose );
702 :
703 : FD_PROTOTYPES_END
704 :
705 : #endif /* HEADER_fd_src_util_wksp_fd_wksp_private_h */
|