Line data Source code
1 : #ifndef HEADER_fd_src_util_wksp_fd_wksp_private_h
2 : #define HEADER_fd_src_util_wksp_fd_wksp_private_h
3 :
4 : #include "fd_wksp.h"
5 :
6 : /* If FD_WKSP_LOCK_RECLAIM==0, do not try to recover the lock from
7 : dead processes. This is useful, for example, if we know that the
8 : lock will not get acquired by another process, or that if another
9 : acquiring process dies that all potential users will get exited. It
10 : prevents a syscall on various common workspace paths (eg, alloc). */
11 :
12 : #ifndef FD_WKSP_LOCK_RECLAIM
13 : #define FD_WKSP_LOCK_RECLAIM 0
14 : #endif
15 :
16 : /* FD_WKSP_PRIVATE_PINFO_IDX_NULL is the pinfo index value used to
17 : indicate NULL */
18 :
19 25892195931 : #define FD_WKSP_PRIVATE_PINFO_IDX_NULL ((ulong)UINT_MAX)
20 :
21 : /* A fd_wksp_private_pinfo_t specifies details about a partition in a
22 : workspace and its relationship to other partitions in that workspace.
23 :
24 : FD_WKSP_PRIVATE_PINFO_ALIGN is an integer power of 2 and
25 : FD_WKSP_PRIVATE_PINFO_FOOTPRINT will be a multiple of align.
26 :
27 : If a partition is not idle:
28 :
29 : - [gaddr_lo,gaddr_hi) specify the range of offsets covered by this
30 : partition.
31 :
32 : wksp_gaddr_lo <= gaddr_lo < gaddr_hi <= wksp_gaddr_hi
33 :
34 : such that partitions always have at least 1 byte and are contained
35 : within the workspace's data region.
36 :
37 : - tag==0 indicates this partition is space in the workspace free
38 : for use and the partition is in the free treap, fast findable by
39 : its size. Otherwise, this partition is allocated and the partition
40 : is in the used treap, fast O(1) findable by any in range gaddr.
41 :
42 : - prev_cidx, next_cidx give the index (in compressed form) of the
43 : previous / next partition if present or IDX_NULL if this is the
44 : workspace partition head / tail partition (in which case gaddr_lo /
45 : gaddr_hi will be wksp->gaddr_lo / wksp->gaddr_hi). That is, for
46 : partition idx where idx is in [0,wksp->part_max):
47 :
48 : ulong prev_idx = fd_wksp_private_pinfo_idx( pinfo[ idx ].prev_cidx );
49 : if( fd_wksp_private_pinfo_idx_is_null( prev_idx ) ) {
50 : ... at this point:
51 : ... idx == fd_wksp_private_pinfo_idx( wksp->part_head_cidx ) );
52 : ... pinfo[ idx ].gaddr_lo == wksp->gaddr_lo );
53 : } else {
54 : ... at this point:
55 : ... idx ==_fd_wksp_private_pinfo_idx( pinfo[ prev_idx ].next_cidx );
56 : ... pinfo[ idx ].gaddr_lo == pinfo[ prev_idx ].gaddr_hi;
57 : }
58 :
59 : and:
60 :
61 : ulong next_idx = fd_wksp_private_pinfo_idx( pinfo[ idx ].next_cidx );
62 : if( fd_wksp_private_pinfo_idx_is_null( next_idx ) ) {
63 : ... at this point:
64 : ... idx == fd_wksp_private_pinfo_idx( wksp->part_tail_cidx ) );
65 : ... pinfo[ idx ].gaddr_hi == wksp->gaddr_hi );
66 : } else {
67 : ... at this point:
68 : ... idx ==_fd_wksp_private_pinfo_idx( pinfo[ next_idx ].prev_cidx );
69 : ... pinfo[ idx ].gaddr_hi == pinfo[ next_idx ].gaddr_lo;
70 : }
71 :
72 : - If partition idx is in the used treap, {left,right}_cidx specify
73 : the partition indices of the root of the left and right subtrees.
74 :
75 : The used treap obeys the binary search tree property that all
76 : partitions in the left/right subtree (if any) cover a range of
77 : offsets strictly lower/higher than range covered by idx.
78 :
79 : parent_cidx specifies idx's parent tree (if any). If idx is the
80 : wksp used tree root, parent_cidx will specify IDX_NULL and
81 : wksp->part_used_cidx will specify idx.
82 :
83 : The used treap also obeys the heap property to make it well
84 : balanced on average. Specifically, the idx's parent's heap
85 : priority will be at least idx's heap priority.
86 :
87 : in_same will be 0 and same_cidx will specify IDX_NULL as no used
88 : partitions can overlap.
89 :
90 : - If partition idx is in the free treap, if partition idx is not
91 : in a list of same sized partitions, in_same will be 0 and
92 : {left,right}_cidx specify the partition indices of the root of the
93 : left and right subtrees.
94 :
95 : The free treap obeys the binary search tree property that all
96 : partitions in the left/right subtree (if any) have partition sizes
97 : strictly lower/higher than partition idx's size.
98 :
99 : parent_cidx specifies idx's parent tree (if any). If idx is the
100 : wksp free tree root, parent_cidx will specify IDX_NULL and
101 : wksp->part_free_cidx will specify idx.
102 :
103 : The free treap also obeys the heap property to make it well
104 : balanced on average. Specifically, the idx's parent's heap
105 : priority will be at least idx's heap priority.
106 :
107 : If there are additional partitions of the same size to partition
108 : idx, same_cidx will refer to the next partition of the same size.
109 :
110 : If partition idx is in a list of same sized partitions, in_same
111 : will be 1 and parent_cidx / same_cidx will specify the prev / next
112 : index of additional partitions of the same size. same_cidx will
113 : specify IDX_NULL if no more.
114 :
115 : - heap_prio is a random value used as described above.
116 :
117 : - stack_cidx and cycle_tag are for internal use */
118 :
119 : /* TODO: Consider align 32/ footprint 96 without compressed indices if
120 : ever needing more than ~4B partitions. */
121 :
122 : #define FD_WKSP_PRIVATE_PINFO_ALIGN (64UL) /* At most FD_WKSP_ALIGN */
123 : #define FD_WKSP_PRIVATE_PINFO_FOOTPRINT (64UL)
124 :
125 : struct __attribute__((aligned(FD_WKSP_PRIVATE_PINFO_ALIGN))) fd_wksp_private_pinfo {
126 : ulong gaddr_lo; /* If in idle stack, 0 */
127 : ulong gaddr_hi; /* ", 0 */
128 : ulong tag; /* ", 0 */
129 : uint heap_prio : 31; /* 30 bit priority and 1 bit free to use for infinite priority bulk tree ops */
130 : uint in_same : 1; /* 1 if in a same list and 0 otherwise */
131 : uint prev_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
132 : uint next_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
133 : uint left_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
134 : uint right_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
135 : uint parent_cidx; /* ", cidx of next idle or fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) if no more */
136 : uint same_cidx; /* ", fd_wksp_private_pinfo_cidx( FD_WKSP_INFO_IDX_NULL ) */
137 : uint stack_cidx; /* internal use */
138 : ulong cycle_tag; /* internal use */
139 : };
140 :
141 : typedef struct fd_wksp_private_pinfo fd_wksp_private_pinfo_t;
142 :
143 : /* FD_WKSP_MAGIC is an ideally unique number that specifies the precise
144 : memory layout of a fd_wksp. */
145 :
146 108 : #define FD_WKSP_MAGIC (0xF17EDA2C3731C591UL) /* F17E=FIRE,DA2C/3R<>DANCER,31/C59<>WKSP,0<>0 --> FIRE DANCER WKSP VERSION 1 */
147 :
148 : /* fd_wksp_private specifies the detailed layout of the internals of a
149 : fd_wksp_t */
150 :
151 : struct fd_wksp_private {
152 :
153 : /* This point is FD_WKSP_ALIGN aligned */
154 :
155 : /* This fields are static and mostly in the first cache line */
156 :
157 : ulong magic; /* ==FD_WKSP_MAGIC */
158 : ulong part_max; /* Max wksp partitions */
159 : ulong data_max; /* Data region */
160 : ulong gaddr_lo; /* ==fd_wksp_private_data_off( part_max ), data region covers offsets [gaddr_lo,gaddr_hi) */
161 : ulong gaddr_hi; /* ==gaddr_lo + data_max, offset gaddr_hi is to 1 byte footer */
162 : char name[ FD_SHMEM_NAME_MAX ]; /* (Convenience) backing fd_shmem region cstr name */
163 : uint seed; /* Heap priority random number seed, arbitrary */
164 :
165 : /* These fields are dynamic and in the adjacent cache line */
166 :
167 : uint idle_top_cidx; /* Stack of partition infos not in use, parent_idx is next pointer */
168 : uint part_head_cidx; /* Index for info about the leftmost partition */
169 : uint part_tail_cidx; /* Index for info about the rightmost partition */
170 : uint part_used_cidx; /* Treap of partitions that are currently used (tag!=0), searchable by gaddr */
171 : uint part_free_cidx; /* Treap of partitions that are currently free (tag==0), searchable by size */
172 : ulong cycle_tag; /* Used for cycle detection */
173 : ulong owner; /* thread group id of the owner or NULL otherwise */
174 :
175 : /* IMPORTANT! The "single-source-of-truth" for what is currently
176 : used (and its tags) is the set of non-zero tagged partitions in the
177 : partition info array. The idle stack, partition list, used treap
178 : and free treap are auxiliary data structuring that can be
179 : reconstructed at any time from this single source of truth.
180 :
181 : Conversely, if there accidental or deliberate data corruption of
182 : the wksp metadata resulting in a conflict between what is stored
183 : in the partition info array and the auxiliary data structures,
184 : the partition info array governs. */
185 :
186 : /* Padding to FD_WKSP_PRIVATE_PINFO_ALIGN here */
187 :
188 : /* part_max pinfo here */
189 : /* data_max byte data region here */
190 : /* 1 footer byte here */
191 : /* Padding to FD_WKSP_ALIGN here */
192 : };
193 :
194 : FD_PROTOTYPES_BEGIN
195 :
196 : /* fd_wksp_private_pinfo_sz returns the size of a partition in bytes.
197 : Assumes pinfo points to the pinfo of a partition in a current local
198 : join. Will be positive. */
199 :
200 : FD_FN_PURE static inline ulong
201 7126867671 : fd_wksp_private_pinfo_sz( fd_wksp_private_pinfo_t const * pinfo ) {
202 7126867671 : return pinfo->gaddr_hi - pinfo->gaddr_lo;
203 7126867671 : }
204 :
205 : /* fd_wksp_private_{part,data}_off return the wksp offset of the
206 : pinfo array and the data region. data_off assumes part_max is a
207 : value that will not overflow. */
208 :
209 : FD_FN_CONST static inline ulong
210 0 : fd_wksp_private_pinfo_off( void ) {
211 0 : return 128UL; /* fd_ulong_align_up( sizeof(fd_wksp_t), FD_WKSP_PRIVATE_PINFO_ALIGN ); */
212 0 : }
213 :
214 : FD_FN_CONST static inline ulong
215 2856 : fd_wksp_private_data_off( ulong part_max ) {
216 2856 : return fd_wksp_private_pinfo_off() + part_max*sizeof(fd_wksp_private_pinfo_t);
217 2856 : }
218 :
219 : /* fd_wksp_private_pinfo returns the location of wksp pinfo array in the
220 : caller's address space. Assumes wksp is a current local join.
221 : fd_wksp_private_pinfo_const is a const-correct version. */
222 :
223 : FD_FN_CONST static inline fd_wksp_private_pinfo_t *
224 545653362 : fd_wksp_private_pinfo( fd_wksp_t * wksp ) {
225 545653362 : return (fd_wksp_private_pinfo_t *)(((ulong)wksp) + fd_wksp_private_pinfo_off());
226 545653362 : }
227 :
228 : FD_FN_CONST static inline fd_wksp_private_pinfo_t const *
229 0 : fd_wksp_private_pinfo_const( fd_wksp_t const * wksp ) {
230 0 : return (fd_wksp_private_pinfo_t const *)(((ulong)wksp) + fd_wksp_private_pinfo_off());
231 0 : }
232 :
233 : /* fd_wksp_private_pinfo_{cidx,idx} compresses / uncompresses a pinfo index */
234 :
235 8340952356 : static inline uint fd_wksp_private_pinfo_cidx( ulong idx ) { return (uint) idx; }
236 17130567966 : static inline ulong fd_wksp_private_pinfo_idx ( uint cidx ) { return (ulong)cidx; }
237 :
238 : /* fd_wksp_private_pinfo_idx_is_null returns 1 if idx is
239 : FD_WKSP_PRIVATE_PINFO_IDX_NULL and 0 otherwise */
240 :
241 22059570471 : static inline int fd_wksp_private_pinfo_idx_is_null( ulong idx ) { return idx==FD_WKSP_PRIVATE_PINFO_IDX_NULL; }
242 :
243 : /* pinfo idle stack APIs **********************************************/
244 :
245 : /* fd_wksp_private_idle_stack_is_empty returns 1 if there are no idle
246 : partitions and 0 otherwise. Also returns 1 if corruption is
247 : detected. Assumes wksp is a current local join. */
248 :
249 : static inline int
250 7023 : fd_wksp_private_idle_stack_is_empty( fd_wksp_t * wksp ) {
251 7023 : return fd_wksp_private_pinfo_idx( wksp->idle_top_cidx ) >= wksp->part_max;
252 7023 : }
253 :
254 : /* fd_wksp_private_idle_stack_pop pops an idle partition off wksp's idle
255 : stack. Assumes the caller knows idle stack is not empty. The caller
256 : is promised that the popped partition has [gaddr_lo,gaddr_hi) = [0,0)
257 : tag 0, {prev, next, left, right, same, parent}_cidx specify IDX_NULL.
258 : Further, heap_prio should have been assigned a random value.
259 : stack_idx and cycle_tag are for internal use. */
260 :
261 : static inline ulong /* Assumes in [0,part_max) */
262 : fd_wksp_private_idle_stack_pop( fd_wksp_t * wksp, /* Assumes current local join */
263 113798706 : fd_wksp_private_pinfo_t * pinfo ) { /* == fd_wksp_private_pinfo( wksp ) */
264 113798706 : ulong i = fd_wksp_private_pinfo_idx( wksp->idle_top_cidx );
265 113798706 : wksp->idle_top_cidx = pinfo[ i ].parent_cidx;
266 113798706 : pinfo[ i ].parent_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
267 113798706 : return i;
268 113798706 : }
269 :
270 : /* fd_wksp_private_idle_stack_push pushes partition i onto the idle
271 : stack. Assumes the caller knows i is not currently in the idle
272 : stack, partitioning, used treap or free treap. */
273 :
274 : static inline void
275 : fd_wksp_private_idle_stack_push( ulong i, /* Assumes in [0,part_max) */
276 : fd_wksp_t * wksp, /* Assumes current local join */
277 118694886 : fd_wksp_private_pinfo_t * pinfo ) { /* == fd_wksp_private_pinfo( wksp ) */
278 118694886 : pinfo[ i ].gaddr_lo = 0UL;
279 118694886 : pinfo[ i ].gaddr_hi = 0UL;
280 118694886 : pinfo[ i ].tag = 0U;
281 118694886 : pinfo[ i ].in_same = 0U;
282 118694886 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
283 118694886 : pinfo[ i ].next_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
284 118694886 : pinfo[ i ].left_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
285 118694886 : pinfo[ i ].right_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
286 118694886 : pinfo[ i ].same_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
287 118694886 : pinfo[ i ].parent_cidx = wksp->idle_top_cidx;
288 118694886 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( i );
289 118694886 : }
290 :
291 : /* pinfo used treap APIs **********************************************/
292 :
293 : /* fd_wksp_private_used_treap_query queries wksp's used treap for the
294 : used partition that holds gaddr. On success, returns the requested
295 : partition idx, in [0,part_max), and, on failure, returns IDX_NULL.
296 : Reasons for failure include gaddr is not in a used partition and
297 : internal treap corruption detected. Might consume a wksp cycle tag
298 : and clobber partition cycle tags. Reasonably fast O(lg N) where N is
299 : the number of used partitions. */
300 :
301 : ulong
302 : fd_wksp_private_used_treap_query( ulong gaddr,
303 : fd_wksp_t * wksp,
304 : fd_wksp_private_pinfo_t * pinfo );
305 :
306 : /* fd_wksp_private_used_treap_insert inserts partition n into wksp's
307 : used treap. Assumes n is not in the idle stack, used treap or free
308 : treap. Does not care if n is in the partitioning or not. Reasonably
309 : fast O(lg N) where N is the number of used partitions.
310 :
311 : Partition n should have [gaddr_lo,gaddr_hi) and heap_prio initialized
312 : on entry (heap_prio should be a random value). tag need not be
313 : initialized but it is assumed that the caller will set the tag to its
314 : final value on success to make the partition officially used. This
315 : will initialize {in_same, left, right, same, parent}_cidx. This will
316 : ignore {prev,next}_cidx. This might consume a wksp cycle tag and
317 : clobber partition stack_cidx and cycle_tag fields.
318 :
319 : Returns FD_WKSP_SUCCESS (zero) on success and a FD_WKSP_ERR_*
320 : (negative) on failure (logs details for failure). Reasons for
321 : failure include n is not in [0,part_max), n's range is not in wksp
322 : data region, n was detected as already inserted (this detection is
323 : not guaranteed), treap internal connectivity issues were detected
324 : (complete detection not guaranteed), and n overlaps with at least one
325 : element already inserted into the treap.
326 :
327 : On failure n and the treap itself were not modified (except possibly
328 : clobbering of stack_cidx and cycle_tag). Note that failure reasons
329 : are either user error or memory corruption. This cannot fail in
330 : normal operating circumstances. */
331 :
332 : int
333 : fd_wksp_private_used_treap_insert( ulong n,
334 : fd_wksp_t * wksp, /* Assumes current local join */
335 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
336 :
337 : /* fd_wksp_private_used_treap_remove removes partition d from wksp's
338 : used treap. Assumes d in the used treap, not in the free treap, not
339 : in the idle stack. Does not care if d is in the partitioning.
340 : Reasonably fast O(lg N) where N is the number of used partitions.
341 : This might consume a wksp cycle tag and clobber partition stack_cidx
342 : and cycle_tag fields.
343 :
344 : Returns FD_WKSP_SUCCESS (zero) on success and a FD_WKSP_ERR_*
345 : (negative) on failure (logs details for failure). Reasons for
346 : failure include d is not in [0,part_max) and treap internal
347 : connectivity issues were detected (complete detection not
348 : guaranteed).
349 :
350 : Note that failure reasons are either user error or memory corruption.
351 : This cannot fail in normal operating circumstances. */
352 :
353 : int
354 : fd_wksp_private_used_treap_remove( ulong d,
355 : fd_wksp_t * wksp, /* Assumes current local join */
356 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
357 :
358 : /* pinfo free treap APIs **********************************************/
359 :
360 : /* fd_wksp_private_free_treap_query queries wksp's free treap for the
361 : smallest partition of at least sz. On success, returns the index of
362 : a partition in the free treap suitable for sz, in [0,part_max), and,
363 : on failure, returns IDX_NULL. Reasons for failure include sz zero,
364 : sz is larger than any free partition, and internal treap corruption
365 : was detected. Might consume a wksp cycle tag and clobber partition
366 : cycle tags. Reasonably fast O(lg N) where N is the number of used
367 : partitions. */
368 :
369 : ulong
370 : fd_wksp_private_free_treap_query( ulong sz,
371 : fd_wksp_t * wksp, /* Assumes current local join */
372 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
373 :
374 : /* fd_wksp_private_free_treap_insert inserts partition n into wksp's
375 : free treap. Assumes n is not in the idle stack, used treap or free
376 : treap. Does not care if n is in the partitioning or not. Reasonably
377 : fast O(lg N) where N is the number of partitions in the free treap.
378 :
379 : Partition n should have [gaddr_lo,gaddr_hi) and heap_prio initialized
380 : on entry (heap_prio should be a random value). tag need not be
381 : initialized but it is assumed that the caller will zero the tag
382 : beforehand to make the partition officially free. This will
383 : initialize {in_same, left, right, same, parent}_cidx. This will
384 : ignore {prev,next}_cidx. This might consume a wksp cycle tag and
385 : clobber the partition stack_cidx and cycle_tag fields.
386 :
387 : Returns FD_WKSP_SUCCESS (zero) on success and a FD_WKSP_ERR_*
388 : (negative) on failure (logs details for failure). Reasons for
389 : failure include n is not in [0,part_max), n's range is not in wksp
390 : data region, n's tag is not zero, n was detected as already inserted
391 : (this detection is not guaranteed), treap internal connectivity
392 : issues were detected (complete detection not guaranteed).
393 :
394 : If n's size exactly matches the size of partition already in the
395 : treap, n will be pushed onto that partition's same stack rather than
396 : inserted into the treap.
397 :
398 : On failure n and the treap itself were not modified (except possibly
399 : clobbering of stack_cidx and cycle_tag). Note that failures reasons
400 : are either user error or memory corruption. This has no failures in
401 : normal operating circumstances. */
402 :
403 : int
404 : fd_wksp_private_free_treap_insert( ulong n,
405 : fd_wksp_t * wksp, /* Assumes current local join */
406 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
407 :
408 : /* fd_wksp_private_free_treap_same_is_empty returns 1 if the same list
409 : for d is empty and 0 if not. Returns 1 if corruption in detected.
410 : Assumes d is in the free treap. */
411 :
412 : static inline int
413 : fd_wksp_private_free_treap_same_is_empty( ulong d,
414 : fd_wksp_t * wksp, /* Assumes current local join */
415 78520242 : fd_wksp_private_pinfo_t * pinfo ) { /* == fd_wksp_private_pinfo( wksp ) */
416 78520242 : ulong part_max = wksp->part_max;
417 78520242 : return fd_wksp_private_pinfo_idx( pinfo[ d ].same_cidx )>=part_max;
418 78520242 : }
419 :
420 : /* fd_wksp_private_free_treap_same_remove removes the first partition
421 : from d's same list. Assumes the caller knows d's same list is not
422 : empty. The caller is promised that returned partition has the same
423 : size as d. */
424 :
425 : static inline ulong
426 : fd_wksp_private_free_treap_same_remove( ulong d,
427 : fd_wksp_t * wksp, /* Assumes current local join */
428 3604728 : fd_wksp_private_pinfo_t * pinfo ) { /* == fd_wksp_private_pinfo( wksp ) */
429 3604728 : ulong part_max = wksp->part_max;
430 3604728 : ulong i = fd_wksp_private_pinfo_idx( pinfo[ d ].same_cidx );
431 3604728 : ulong j = fd_wksp_private_pinfo_idx( pinfo[ i ].same_cidx );
432 3604728 : /**/ pinfo[ d ].same_cidx = fd_wksp_private_pinfo_cidx( j );
433 3604728 : if( j<part_max ) pinfo[ j ].parent_cidx = fd_wksp_private_pinfo_cidx( d );
434 3604728 : pinfo[ i ].in_same = 0U;
435 3604728 : pinfo[ i ].same_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
436 3604728 : pinfo[ i ].parent_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
437 3604728 : return i;
438 3604728 : }
439 :
440 : /* fd_wksp_private_free_treap_remove removes partition d from wksp's
441 : free treap. Assumes d in the free treap, not in the used treap, not
442 : in the idle stack. Does not care if d is in the partitioning.
443 : Reasonably fast O(lg N) where N is the number of free partitions.
444 : This might consume a wksp cycle tag and clobber partition stack_cidx
445 : and cycle_tag fields. There is an edge case where d's can be swapped
446 : with another same sized partition.
447 :
448 : Returns FD_WKSP_SUCCESS (zero) on success and a FD_WKSP_ERR_*
449 : (negative) on failure (logs details for failure). Reasons for
450 : failure include d is not in [0,part_max) and treap internal
451 : connectivity issues were detected (complete detection not
452 : guaranteed).
453 :
454 : Note that failure reasons are either user error or memory corruption.
455 : This cannot fail in normal operating circumstances. */
456 :
457 : int
458 : fd_wksp_private_free_treap_remove( ulong d,
459 : fd_wksp_t * wksp, /* Assumes current local join */
460 : fd_wksp_private_pinfo_t * pinfo ); /* == fd_wksp_private_pinfo( wksp ) */
461 :
462 : /* private admin APIs *************************************************/
463 :
464 : /* fd_wksp_private_lock locks wksp. Assumes wksp is a current local
465 : join. If wksp is already locked, this will wait for the caller. If
466 : this detects that the caller died while holding the lock, it will try
467 : to steal the lock from the dead caller and cleanup any incomplete
468 : operation the caller was doing. Returns FD_WKSP_SUCCESS (0) if the
469 : lock was acquired or FD_WKSP_ERR_CORRUPT if the lock could not be
470 : obtained because memory corruption was detected while trying to
471 : recover from a dead caller that corrupted the wksp memory. */
472 :
473 : int
474 : fd_wksp_private_lock( fd_wksp_t * wksp );
475 :
476 : /* fd_wksp_private_unlock unlocks a locked wksp. Assumes wksp is a
477 : current local join and the caller has the lock */
478 :
479 : static inline void
480 545652684 : fd_wksp_private_unlock( fd_wksp_t * wksp ) {
481 545652684 : FD_COMPILER_MFENCE();
482 545652684 : FD_VOLATILE( wksp->owner ) = ULONG_MAX;
483 545652684 : FD_COMPILER_MFENCE();
484 545652684 : }
485 :
486 : /* private checkpt/restore APIs ***************************************/
487 : /* FIXME: MOVE THIS TO PUBLIC HEADER? */
488 :
489 : /* FD_WKSP_CHECKPT_{V1,V2}_{BINFO,UINFO}_MAX give the maximum byte size
490 : (including the terminating '\0') of a decompressed {v1,v2} checkpt
491 : {build,user} info cstr. */
492 :
493 9 : #define FD_WKSP_CHECKPT_V1_BINFO_MAX (16384UL)
494 9 : #define FD_WKSP_CHECKPT_V1_UINFO_MAX (16384UL)
495 :
496 : #define FD_WKSP_CHECKPT_V2_BINFO_MAX (16384UL)
497 : #define FD_WKSP_CHECKPT_V2_UINFO_MAX (16384UL)
498 :
499 : /* A fd_wksp_checkpt_v2_hdr_t gives the byte layout of frame 0 of a wksp
500 : v2 checkpt. This frame contains the style, compression algo used for
501 : the info, cgroup and appendix frames and fd_wksp_preview information
502 : uncompressed. */
503 :
504 : struct fd_wksp_checkpt_v2_hdr {
505 : ulong magic; /* Must be first, ==FD_WKSP_MAGIC */
506 : int style; /* Must be second, wksp checkpt style */
507 : int frame_style_compressed; /* frame style used for compressed frames */
508 : uint reserved; /* header padding */
509 : char name[ FD_SHMEM_NAME_MAX ]; /* cstr holding the original wksp name (note: FD_SHMEM_NAME_MAX==FD_LOG_NAME_MAX==40) */
510 : uint seed; /* wksp seed when checkpointed (probably same used to construct) */
511 : ulong part_max; /* part_max used to construct the wksp */
512 : ulong data_max; /* data_max used to construct the wksp */
513 : };
514 :
515 : typedef struct fd_wksp_checkpt_v2_hdr fd_wksp_checkpt_v2_hdr_t;
516 :
517 : /* A fd_wksp_checkpt_v2_info_t gives the byte layout of frame 1 of a
518 : wksp v2 checkpt. frame 1 immediately follows frame 0 and this frame
519 : contains the info structure followed compactly by the corresponding
520 : cstr (including the terminating '\0') stored consecutively in the
521 : same order. The size fields indicate the buffer layout. This frame
522 : is compressed according hdr/ftr specification. */
523 :
524 : struct fd_wksp_checkpt_v2_info {
525 : ulong mode;
526 : long wallclock;
527 : ulong app_id;
528 : ulong thread_id;
529 : ulong host_id;
530 : ulong cpu_id;
531 : ulong group_id;
532 : ulong tid;
533 : ulong user_id;
534 : /* FIXME: CONSIDER MAKING THESE ALL UCHAR / USHORT / 4 BYTE RESERVED */
535 : ulong sz_app; /* in [1,FD_LOG_NAME_MAX ~ 40B] */
536 : ulong sz_thread; /* " */
537 : ulong sz_host; /* " */
538 : ulong sz_cpu; /* " */
539 : ulong sz_group; /* " */
540 : ulong sz_user; /* " */
541 : ulong sz_path; /* in [1,PATH_MAX ~ 4KiB] */
542 : ulong sz_binfo; /* in [1,FD_WKSP_CHECKPT_V2_BINFO_MAX ~ 16KiB] */
543 : ulong sz_uinfo; /* in [1,FD_WKSP_CHECKPT_V2_UINFO_MAX ~ 16KiB] */
544 : };
545 :
546 : typedef struct fd_wksp_checkpt_v2_info fd_wksp_checkpt_v2_info_t;
547 :
548 : /* A v2 info frame is followed by zero or more volumes. A volume
549 : consists of zero or more cgroup frames and an appendix frame.
550 : Volumes are followed by a frame with a footer command and then an
551 : uncompressed footer frame.
552 :
553 : A cgroup frame starts with a zero or more meta commands that describe
554 : the allocations it contains followed by a data command that indicates
555 : the cgroup data section follows.
556 :
557 : An appendix frame starts with an appendix command, giving the number
558 : of cgroup frames it covers and the offset to the previous appendix
559 : frame (0 if the first appendix frame). This is followed by a ulong
560 : array with checkpt offsets to those cgroup frames followed a ulong
561 : array with the number of allocations in each cgroup frame. An
562 : appendix covers all cgroup frames between it and the previous
563 : appendix frame (or info frame if the first appendix).
564 :
565 : The last volume is followed by a compressed frame with a sole volumes
566 : command. The volumes command gives the offset of the appendix of the
567 : last volume (or 0 if there are no volumes). (This allows the final
568 : frame to be uncompressed while all the volumes can be compressed.)
569 :
570 : An uncompressed footer frame follows indicating the v2 checkpt is
571 : done. The command gives the total number of cgroup frames in the
572 : checkpt and the offset to the last volume's appendix (or 0 if no
573 : volumes).
574 :
575 : A fd_wksp_checkpt_v2_cmd_t supports writing an arbitrarily large
576 : checkpt single pass with only small upfront bounded allocation while
577 : supporting both streaming and parallel restore of those frames. */
578 :
579 : union fd_wksp_checkpt_v2_cmd {
580 : struct { ulong tag; /* > 0 */ ulong gaddr_lo; ulong gaddr_hi; } meta;
581 : struct { ulong tag; /* ==0 */ ulong cgroup_cnt; /* ==ULONG_MAX */ ulong frame_off; /* ==ULONG_MAX */ } data;
582 : struct { ulong tag; /* ==0 */ ulong cgroup_cnt; /* < ULONG_MAX */ ulong frame_off; /* < ULONG_MAX */ } appendix;
583 : struct { ulong tag; /* ==0 */ ulong cgroup_cnt; /* ==ULONG_MAX */ ulong frame_off; /* < ULONG_MAX */ } volumes;
584 : };
585 :
586 : typedef union fd_wksp_checkpt_v2_cmd fd_wksp_checkpt_v2_cmd_t;
587 :
588 : FD_FN_PURE static inline int
589 75 : fd_wksp_checkpt_v2_cmd_is_meta( fd_wksp_checkpt_v2_cmd_t const * cmd ) {
590 75 : return cmd->meta.tag > 0UL;
591 75 : }
592 :
593 : FD_FN_PURE static inline int
594 21 : fd_wksp_checkpt_v2_cmd_is_data( fd_wksp_checkpt_v2_cmd_t const * cmd ) {
595 21 : return (cmd->data.tag==0UL) & (cmd->data.cgroup_cnt==ULONG_MAX) & (cmd->data.frame_off==ULONG_MAX);
596 21 : }
597 :
598 : FD_FN_PURE static inline int
599 63 : fd_wksp_checkpt_v2_cmd_is_appendix( fd_wksp_checkpt_v2_cmd_t const * cmd ) {
600 63 : return (cmd->appendix.tag==0UL) & (cmd->appendix.cgroup_cnt<ULONG_MAX) & (cmd->appendix.frame_off<ULONG_MAX);
601 63 : }
602 :
603 : FD_FN_PURE static inline int
604 0 : fd_wksp_checkpt_v2_cmd_is_volumes( fd_wksp_checkpt_v2_cmd_t const * cmd ) {
605 0 : return (cmd->volumes.tag==0UL) & (cmd->volumes.cgroup_cnt==ULONG_MAX) & (cmd->volumes.frame_off<ULONG_MAX);
606 0 : }
607 :
608 : /* A fd_wksp_checkpt_v2_ftr_t gives the byte layout of the final frame
609 : of a wksp v2 checkpt. This frame contains this footer uncompressed.
610 : This is wksp checkpt header backwards plus some additional
611 : information to allow users to seek from the end of the checkpt to the
612 : header (checkpt_sz), to the appendix frame (frame_off_appendix) and
613 : do any allocations upfront necessary to completely unpack the
614 : checkpt. */
615 :
616 : struct fd_wksp_checkpt_v2_ftr {
617 : ulong alloc_cnt; /* total number of allocations in checkpt */
618 : ulong cgroup_cnt; /* total number of cgroups in checkpt */
619 : ulong volume_cnt; /* total number of volumes in checkpt */
620 : ulong frame_off; /* byte offset (relative to header initial byte) of the volumes command */
621 : ulong checkpt_sz; /* checkpt byte size, from header initial byte to the footer final byte inclusive (note that
622 : this can be used to convert offsets relative to header initial byte to offsets relative to
623 : the end-of-file / the one past the final footer byte) */
624 : ulong data_max; /* should match header */
625 : ulong part_max; /* " */
626 : uint seed; /* " */
627 : char name[ FD_SHMEM_NAME_MAX ]; /* " */
628 : uint reserved; /* " */
629 : int frame_style_compressed; /* " */
630 : int style; /* " */
631 : ulong unmagic; /* ==~FD_WKSP_MAGIC */
632 : };
633 :
634 : typedef struct fd_wksp_checkpt_v2_ftr fd_wksp_checkpt_v2_ftr_t;
635 :
636 : /* fd_wksp_private_{checkpt,restore,printf}_v1 provide the v1
637 : implementations of {checkpt,restore,printf}. That is, checkpt_v1
638 : will only write a v1 style checkpt while the {restore,printt}_v1 can
639 : assume that the path exclusively contains a v1 style checkpt. These
640 : can assume that the input arguments have been validated by their
641 : caller. The printf implementation can further assume verbose is
642 : positive and the verbose 0 information has already been printed. For
643 : checkpt/restore, if tpool is non-NULL, the operation will be
644 : parallelized over tpool threads [t0,t1). Assumes the caller is
645 : thread t0 and threads (t0,t1) are available for thread dispatch. */
646 :
647 : int
648 : fd_wksp_private_checkpt_v1( fd_tpool_t * tpool,
649 : ulong t0,
650 : ulong t1,
651 : fd_wksp_t * wksp,
652 : char const * path,
653 : ulong mode,
654 : char const * uinfo );
655 :
656 : int
657 : fd_wksp_private_restore_v1( fd_tpool_t * tpool,
658 : ulong t0,
659 : ulong t1,
660 : fd_wksp_t * wksp,
661 : char const * path,
662 : uint new_seed );
663 :
664 : int
665 : fd_wksp_private_printf_v1( int fd,
666 : char const * path,
667 : int verbose );
668 :
669 : /* Similarly for v2. Note that style==FD_WKSP_CHECKPT_STYLE_V3 in the
670 : fd_wksp_checkpt function becomes a FD_WKSP_CHECKPT_STYLE_V2 with a
671 : FD_CHECKPT_FRAME_STYLE_LZ4 cgroup frames in the checkpt itself. */
672 :
673 : int
674 : fd_wksp_private_checkpt_v2( fd_tpool_t * tpool,
675 : ulong t0,
676 : ulong t1,
677 : fd_wksp_t * wksp,
678 : char const * path,
679 : ulong mode,
680 : char const * uinfo,
681 : int frame_style_compresed );
682 :
683 : int
684 : fd_wksp_private_restore_v2( fd_tpool_t * tpool,
685 : ulong t0,
686 : ulong t1,
687 : fd_wksp_t * wksp,
688 : char const * path,
689 : uint new_seed );
690 :
691 : int
692 : fd_wksp_private_printf_v2( int fd,
693 : char const * path,
694 : int verbose );
695 :
696 : FD_PROTOTYPES_END
697 :
698 : #endif /* HEADER_fd_src_util_wksp_fd_wksp_private_h */
|