Line data Source code
1 : #include "fd_wksp_private.h"
2 :
3 : int
4 60391680 : fd_wksp_private_lock( fd_wksp_t * wksp ) {
5 : # if FD_WKSP_LOCK_RECLAIM
6 : int warning = 0;
7 : #endif
8 60391680 : ulong me = fd_log_group_id();
9 :
10 60391680 : ulong * _owner = &wksp->owner;
11 60391680 : for(;;) {
12 :
13 : /* Note that we emulate CAS on platforms without FD_HAS_ATOMIC
14 : to minimize the amount of code differences we have to test. On
15 : platforms without FD_HAS_ATOMIC, a workspace should not be used
16 : concurrently though. */
17 :
18 60391680 : FD_COMPILER_MFENCE();
19 60391680 : # if FD_HAS_ATOMIC
20 60391680 : ulong pid = FD_ATOMIC_CAS( _owner, ULONG_MAX, me );
21 : # else
22 : ulong pid = FD_VOLATILE_CONST( *_owner );
23 : if( pid==ULONG_MAX ) FD_VOLATILE( *_owner ) = me;
24 : # endif
25 60391680 : FD_COMPILER_MFENCE();
26 :
27 60391680 : if( FD_LIKELY( pid==ULONG_MAX ) ) return FD_WKSP_SUCCESS;
28 :
29 : # if FD_WKSP_LOCK_RECLAIM
30 : int status = fd_log_group_id_query( pid );
31 : if( FD_UNLIKELY( status==FD_LOG_GROUP_ID_QUERY_DEAD ) ) { /* A process died while holding the lock, try to recover the lock */
32 :
33 : FD_COMPILER_MFENCE();
34 : # if FD_HAS_ATOMIC
35 : ulong cur = FD_ATOMIC_CAS( _owner, pid, me );
36 : # else
37 : ulong cur = FD_VOLATILE_CONST( *_owner );
38 : if( cur==pid ) FD_VOLATILE( *_owner ) = me;
39 : # endif
40 : FD_COMPILER_MFENCE();
41 :
42 : if( FD_LIKELY( cur==pid ) ) { /* We recovered the lock from the dead pid, try to fix up incomplete ops */
43 :
44 : FD_LOG_WARNING(( "Process %lu died in an operation on wksp %s; verifying", pid, wksp->name ));
45 : if( FD_LIKELY( !fd_wksp_verify( wksp ) ) ) { /* logs details of issues detected */
46 : FD_LOG_NOTICE(( "wksp verified" ));
47 : return FD_WKSP_SUCCESS;
48 : }
49 :
50 : FD_LOG_WARNING(( "Issues detected; rebuilding" ));
51 : if( FD_UNLIKELY( fd_wksp_rebuild( wksp, wksp->seed ) ) ) { /* Rebuild failed (logs details of issues detected) */
52 : /* Return control of the lock to the previous owner */
53 : FD_COMPILER_MFENCE();
54 : FD_VOLATILE( *_owner ) = pid;
55 : FD_COMPILER_MFENCE();
56 : FD_LOG_WARNING(( "corrupt wksp detected" ));
57 : return FD_WKSP_ERR_CORRUPT;
58 : }
59 :
60 : FD_LOG_NOTICE(( "wksp rebuilt" ));
61 : return FD_WKSP_SUCCESS;
62 :
63 : }
64 :
65 : /* Somebody beat us to recovering the lock ... try again */
66 :
67 : } else if( FD_UNLIKELY( status!=FD_LOG_GROUP_ID_QUERY_LIVE ) ) { /* Unclear pid status ... issue a warning and try again */
68 :
69 : if( FD_UNLIKELY( !warning ) ) {
70 : FD_LOG_WARNING(( "wksp %s is owned by unknown pid %li; attempting to recover", wksp->name, pid ));
71 : warning = 1;
72 : }
73 :
74 : }
75 :
76 : /* At this point, either another thread in this process has the
77 : lock, another active thread in another process has the lock,
78 : another unknown status thread in other process has the lock or
79 : another thread beat us to reclaim the lock from a dead process.
80 : In any case, we don't have the lock. Wait a while to limit O/S
81 : contention and try again. */
82 :
83 : FD_YIELD();
84 : # else
85 :
86 : /* If we are running without FD_WKSP_LOCK_RECLAIM then it is assumed
87 : that the contention is caused by a tile pinned to another core,
88 : and that this core is itself pinned so spin locking is best. */
89 0 : FD_SPIN_PAUSE();
90 :
91 0 : #endif
92 0 : }
93 :
94 : /* never get here */
95 60391680 : }
96 :
97 : /* Public APIs ********************************************************/
98 :
99 : ulong
100 : fd_wksp_part_max_est( ulong footprint,
101 303 : ulong sz_typical ) {
102 303 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
103 303 : ulong data_end = footprint - 1UL;
104 303 : ulong pinfo_off = fd_wksp_private_pinfo_off();
105 303 : ulong consumed = sizeof(fd_wksp_private_pinfo_t) + sz_typical;
106 303 : ulong part_max = (data_end - pinfo_off) / (consumed + (ulong)!consumed); /* avoid div-by-zero */
107 303 : if( FD_UNLIKELY( (!footprint) | (!sz_typical) | (sz_typical>consumed) | (pinfo_off>data_end) ) ) return 0UL;
108 294 : return fd_ulong_min( part_max, FD_WKSP_PRIVATE_PINFO_IDX_NULL );
109 303 : }
110 :
111 : ulong
112 : fd_wksp_data_max_est( ulong footprint,
113 276 : ulong part_max ) {
114 276 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
115 276 : ulong data_end = footprint - 1UL;
116 276 : ulong data_off = fd_wksp_private_data_off( part_max );
117 276 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) |
118 276 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* covered above */
119 276 : (!footprint) | (data_off>=data_end) ) ) return 0UL;
120 258 : return data_end - data_off;
121 276 : }
122 :
123 : ulong
124 3 : fd_wksp_align( void ) {
125 3 : return FD_WKSP_ALIGN;
126 3 : }
127 :
128 : ulong
129 : fd_wksp_footprint( ulong part_max,
130 1461 : ulong data_max ) {
131 1461 : ulong data_off = fd_wksp_private_data_off( part_max );
132 1461 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) | (!data_max) |
133 1461 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* Covered above */
134 1461 : (data_max > (ULONG_MAX - FD_WKSP_ALIGN + 1UL - data_off - 1UL) ) ) ) return 0UL;
135 1392 : return fd_ulong_align_up( data_off + data_max + 1UL, FD_WKSP_ALIGN );
136 1461 : }
137 :
138 : void *
139 : fd_wksp_new( void * shmem,
140 : char const * name,
141 : uint seed,
142 : ulong part_max,
143 261 : ulong data_max ) {
144 261 : fd_wksp_t * wksp = (fd_wksp_t *)shmem;
145 :
146 261 : if( FD_UNLIKELY( !wksp ) ) {
147 3 : FD_LOG_WARNING(( "NULL shmem" ));
148 3 : return NULL;
149 3 : }
150 :
151 258 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
152 3 : FD_LOG_WARNING(( "bad align" ));
153 3 : return NULL;
154 3 : }
155 :
156 255 : ulong name_len = fd_shmem_name_len( name );
157 255 : if( FD_UNLIKELY( !name_len ) ) {
158 3 : FD_LOG_WARNING(( "bad name" ));
159 3 : return NULL;
160 3 : }
161 :
162 252 : ulong footprint = fd_wksp_footprint( part_max, data_max );
163 252 : if( FD_UNLIKELY( !footprint ) ) {
164 12 : FD_LOG_WARNING(( "bad part_max and/or data_max" ));
165 12 : return NULL;
166 12 : }
167 :
168 240 : fd_memset( wksp, 0, fd_wksp_footprint( part_max, 1UL ) );
169 :
170 240 : wksp->part_max = part_max;
171 240 : wksp->data_max = data_max;
172 240 : wksp->gaddr_lo = fd_wksp_private_data_off( part_max );
173 240 : wksp->gaddr_hi = wksp->gaddr_lo + data_max;
174 240 : fd_memcpy( wksp->name, name, name_len+1UL );
175 240 : wksp->seed = seed;
176 240 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
177 240 : wksp->part_head_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
178 240 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
179 240 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
180 240 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
181 240 : wksp->cycle_tag = 4UL; /* Verify uses tags 0-3 */
182 240 : wksp->owner = 0UL; /* Mark as locked and in construction */
183 :
184 : /* Note that wksp->owner was set to zero above, "locking" the wksp by
185 : group_id 0. And the memset above set all the partition tags to
186 : zero such that there are no allocated partitions. So once we set
187 : magic below, we can finish the initialization by rebuilding and
188 : unlocking. Since fd_log_group_id is non-zero, the zero owner
189 : indicates to any remote observer of the shared memory region that
190 : the wksp is being built for the first time. */
191 :
192 240 : FD_COMPILER_MFENCE();
193 240 : FD_VOLATILE( wksp->magic ) = FD_WKSP_MAGIC;
194 240 : FD_COMPILER_MFENCE();
195 :
196 240 : int err = fd_wksp_rebuild( wksp, seed );
197 240 : if( FD_UNLIKELY( err ) ) { /* Should be impossible at this point */
198 :
199 0 : FD_COMPILER_MFENCE();
200 0 : FD_VOLATILE( wksp->magic ) = 0UL;
201 0 : FD_COMPILER_MFENCE();
202 :
203 0 : FD_LOG_WARNING(( "fd_wksp_rebuild failed (%i-%s)", err, fd_wksp_strerror( err ) ));
204 0 : return NULL;
205 0 : }
206 :
207 : #if FD_HAS_DEEPASAN
208 : /* Poison entire workspace except wksp header and the pinfo array. */
209 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
210 : fd_asan_poison( wksp_data, footprint - fd_wksp_private_pinfo_off() );
211 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
212 : fd_asan_unpoison( pinfo, FD_WKSP_PRIVATE_PINFO_FOOTPRINT * part_max );
213 : #endif
214 :
215 240 : fd_wksp_private_unlock( wksp );
216 :
217 240 : return wksp;
218 240 : }
219 :
220 : fd_wksp_t *
221 2052 : fd_wksp_join( void * shwksp ) {
222 2052 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
223 :
224 2052 : if( FD_UNLIKELY( !wksp ) ) {
225 3 : FD_LOG_WARNING(( "NULL shwksp" ));
226 3 : return NULL;
227 3 : }
228 :
229 2049 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
230 3 : FD_LOG_WARNING(( "bad align" ));
231 3 : return NULL;
232 3 : }
233 :
234 2046 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
235 3 : FD_LOG_WARNING(( "bad magic" ));
236 3 : return NULL;
237 3 : }
238 :
239 2043 : return wksp;
240 2046 : }
241 :
242 : void *
243 1836 : fd_wksp_leave( fd_wksp_t * wksp ) {
244 1836 : if( FD_UNLIKELY( !wksp ) ) {
245 3 : FD_LOG_WARNING(( "NULL wksp" ));
246 3 : return NULL;
247 3 : }
248 :
249 1833 : return (void *)wksp;
250 1836 : }
251 :
252 : void *
253 126 : fd_wksp_delete( void * shwksp ) {
254 126 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
255 :
256 126 : if( FD_UNLIKELY( !wksp ) ) {
257 3 : FD_LOG_WARNING(( "NULL shwksp" ));
258 3 : return NULL;
259 3 : }
260 :
261 123 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
262 3 : FD_LOG_WARNING(( "bad align" ));
263 3 : return NULL;
264 3 : }
265 :
266 120 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
267 3 : FD_LOG_WARNING(( "bad magic" ));
268 3 : return NULL;
269 3 : }
270 :
271 : /* TODO: consider testing owner */
272 :
273 117 : FD_COMPILER_MFENCE();
274 117 : FD_VOLATILE( wksp->magic ) = 0UL;
275 117 : FD_COMPILER_MFENCE();
276 :
277 : # if FD_HAS_DEEPASAN
278 : /* Unpoison entire wksp region. */
279 : ulong footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
280 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
281 : fd_asan_unpoison( wksp_data, footprint - fd_wksp_private_pinfo_off());
282 : # endif
283 :
284 117 : return wksp;
285 120 : }
286 :
287 3 : char const * fd_wksp_name ( fd_wksp_t const * wksp ) { return wksp->name; }
288 111 : uint fd_wksp_seed ( fd_wksp_t const * wksp ) { return wksp->seed; }
289 6 : ulong fd_wksp_part_max( fd_wksp_t const * wksp ) { return wksp->part_max; }
290 6 : ulong fd_wksp_data_max( fd_wksp_t const * wksp ) { return wksp->data_max; }
291 6 : ulong fd_wksp_gaddr_lo( fd_wksp_t const * wksp ) { return wksp->gaddr_lo; }
292 153 : ulong fd_wksp_gaddr_hi( fd_wksp_t const * wksp ) { return wksp->gaddr_hi; }
293 :
294 : ulong
295 0 : fd_wksp_owner( fd_wksp_t const * wksp ) {
296 0 : FD_COMPILER_MFENCE();
297 0 : ulong owner = FD_VOLATILE_CONST( wksp->owner );
298 0 : FD_COMPILER_MFENCE();
299 0 : return owner;
300 0 : }
301 :
302 : char const *
303 48 : fd_wksp_strerror( int err ) {
304 48 : switch( err ) {
305 3 : case FD_WKSP_SUCCESS: return "success";
306 12 : case FD_WKSP_ERR_INVAL: return "inval";
307 30 : case FD_WKSP_ERR_FAIL: return "fail";
308 3 : case FD_WKSP_ERR_CORRUPT: return "corrupt";
309 0 : default: break;
310 48 : }
311 0 : return "unknown";
312 48 : }
313 :
314 : int
315 24 : fd_wksp_verify( fd_wksp_t * wksp ) {
316 :
317 402399 : # define TEST(c) do { \
318 402399 : if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return FD_WKSP_ERR_CORRUPT; } \
319 402399 : } while(0)
320 :
321 : /* Validate metadata */
322 :
323 24 : TEST( wksp );
324 24 : TEST( wksp->magic==FD_WKSP_MAGIC );
325 :
326 24 : ulong part_max = wksp->part_max;
327 24 : ulong data_max = wksp->data_max;
328 24 : TEST( fd_wksp_footprint( part_max, data_max ) );
329 :
330 24 : ulong gaddr_lo = wksp->gaddr_lo; TEST( gaddr_lo==fd_wksp_private_data_off( part_max ) );
331 24 : ulong gaddr_hi = wksp->gaddr_hi; TEST( gaddr_hi==gaddr_lo+data_max );
332 :
333 24 : TEST( fd_shmem_name_len( wksp->name ) );
334 :
335 : /* seed is arbitrary */
336 :
337 24 : TEST( wksp->cycle_tag >= 4UL );
338 :
339 : /* TODO: consider verifying owner */
340 :
341 24 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
342 :
343 : /* Clear out cycle tags */
344 :
345 197952 : for( ulong i=0UL; i<part_max; i++ ) pinfo[ i ].cycle_tag = 0UL;
346 :
347 : /* Verify the idle stack */
348 :
349 24 : ulong idle_cnt = 0UL;
350 :
351 24 : do {
352 24 : ulong i = fd_wksp_private_pinfo_idx( wksp->idle_top_cidx );
353 197421 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
354 :
355 : /* Visit i. Note that i has not been validated yet. */
356 :
357 197397 : TEST( i<part_max ); /* Validate i */
358 197397 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
359 197397 : pinfo[ i ].cycle_tag = 1UL; /* Mark as visited in idle stack */
360 197397 : idle_cnt++; /* Update the idle cnt */
361 :
362 : /* Advance to the next idle */
363 :
364 197397 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx );
365 197397 : }
366 24 : } while(0);
367 :
368 : /* Idle stack looks intact, verify partitioning */
369 :
370 24 : ulong free_cnt = 0UL;
371 24 : ulong used_cnt = 0UL;
372 :
373 24 : do {
374 24 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
375 24 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_head_cidx );
376 24 : ulong g = gaddr_lo;
377 :
378 24 : int last_free = 0;
379 :
380 555 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
381 :
382 : /* At this point, we last visited j. Visit i. Note that j has
383 : been validated but i has not. */
384 :
385 531 : TEST( i<part_max ); /* Validate i */
386 531 : TEST( pinfo[ i ].gaddr_lo==g ); /* Make sure partition is tightly adjacent to previous */
387 531 : TEST( pinfo[ i ].gaddr_hi> g ); /* Make sure partition size is non-zero */
388 531 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].prev_cidx )==j ); /* Make sure correct prev partition */
389 531 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
390 531 : pinfo[ i ].cycle_tag = 2UL; /* Mark as visited in partitioning */
391 :
392 531 : g = pinfo[ i ].gaddr_hi; /* Extract where the next partition should start */
393 531 : int is_free = !pinfo[ i ].tag; /* Determine if this partition is free or used */
394 531 : TEST( !(last_free & is_free) ); /* Make sure no adjacent free partitions */
395 531 : free_cnt += (ulong) is_free; /* Update the free cnt */
396 531 : used_cnt += (ulong)!is_free; /* Update the used cnt */
397 :
398 : /* Advance to the next partition */
399 :
400 531 : last_free = is_free;
401 :
402 531 : j = i;
403 531 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].next_cidx );
404 531 : }
405 :
406 24 : TEST( fd_wksp_private_pinfo_idx( wksp->part_tail_cidx )==j ); /* Make sure correct partition tail */
407 24 : TEST( g==gaddr_hi ); /* Make sure complete partitioning */
408 24 : TEST( (idle_cnt + free_cnt + used_cnt)==part_max ); /* Make sure no lost idle partitions */
409 24 : } while(0);
410 :
411 : /* Idle stack and partitioning look intact, validate used treap */
412 :
413 24 : do {
414 24 : ulong visit_cnt = 0UL;
415 :
416 24 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
417 24 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
418 24 : ulong g = gaddr_lo;
419 :
420 24 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
421 12 : TEST( i<part_max ); /* Validate i */
422 12 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
423 12 : }
424 :
425 630 : for(;;) {
426 :
427 : /* At this point i is and everything on stack is validated */
428 :
429 630 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
430 327 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
431 :
432 : /* Pop stack */
433 :
434 303 : i = s;
435 303 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
436 :
437 : /* Visit i */
438 :
439 303 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
440 :
441 303 : TEST( pinfo[ i ].gaddr_lo>=g ); /* Make sure this starts after last visited */
442 303 : TEST( pinfo[ i ].tag ); /* Make sure tagged as a used partition */
443 303 : TEST( pinfo[ i ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
444 303 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) /* Make sure heap property satisfied */
445 291 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
446 :
447 303 : TEST( !pinfo[ i ].in_same ); /* Make sure unique */
448 303 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ i ].same_cidx ) ) ); /* " */
449 :
450 303 : pinfo[ i ].cycle_tag = 3UL; /* Mark as visited this traversal */
451 303 : visit_cnt++; /* Update the visit cnt */
452 303 : g = pinfo[ i ].gaddr_hi; /* Get minimum start for next partition */
453 :
454 : /* Traverse the right subtree */
455 :
456 303 : p = i;
457 303 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
458 303 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
459 150 : TEST( i<part_max ); /* Validate i */
460 150 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
461 150 : }
462 :
463 303 : } else {
464 :
465 : /* At this point i and everything on the stack is validated.
466 : Push i to the stack and recurse on the left subtree. */
467 :
468 303 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
469 303 : s = i;
470 303 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
471 303 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
472 141 : TEST( i<part_max ); /* Validate i */
473 141 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
474 141 : }
475 :
476 303 : }
477 630 : }
478 :
479 24 : TEST( visit_cnt==used_cnt ); /* Make sure all used partitions in used treap */
480 24 : } while(0);
481 :
482 : /* Idle stack, partitioning and used treap look intact, validate the
483 : free treap. */
484 :
485 24 : do {
486 24 : ulong visit_cnt = 0UL;
487 :
488 24 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_free_cidx );
489 24 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
490 24 : ulong sz = 0UL;
491 :
492 24 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
493 24 : TEST( i<part_max ); /* Validate i */
494 24 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
495 24 : }
496 :
497 318 : for(;;) {
498 :
499 : /* At this point i and everything on the stack is validated */
500 :
501 318 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
502 171 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
503 :
504 : /* Pop stack */
505 :
506 147 : i = s;
507 147 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
508 :
509 : /* Visit i */
510 :
511 147 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
512 147 : ulong isz = fd_wksp_private_pinfo_sz( pinfo + i ); /* Extract the size */
513 :
514 147 : TEST( isz>sz ); /* Make sure this partition i larger than previous */
515 147 : TEST( !pinfo[ i ].tag ); /* Make sure tagged as a free partition */
516 147 : TEST( !pinfo[ i ].in_same ); /* Make sure marked as not in same */
517 :
518 147 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) { /* Make sure heap property satisfied */
519 123 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
520 123 : }
521 :
522 147 : sz = isz; /* Update largest size partition seen so far */
523 :
524 : /* Traverse all same sized partitions */
525 :
526 147 : ulong j = i;
527 228 : for(;;) {
528 :
529 : /* At this point, j is validated */
530 :
531 228 : TEST( pinfo[ j ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
532 228 : pinfo[ j ].cycle_tag = 3UL; /* Mark as visited this traversal */
533 228 : visit_cnt++;
534 :
535 228 : ulong k = fd_wksp_private_pinfo_idx( pinfo[ j ].same_cidx ); /* Get the next same sized */
536 228 : if( fd_wksp_private_pinfo_idx_is_null( k ) ) break; /* If no more, we are done with this node */
537 81 : TEST( k<part_max ); /* Make sure valid index */
538 81 : TEST( fd_wksp_private_pinfo_sz( pinfo + k )==sz ); /* Make sure same size */
539 81 : TEST( pinfo[ k ].in_same ); /* Make sure marked as in same */
540 81 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].left_cidx ) ) );
541 81 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].right_cidx ) ) );
542 81 : TEST( fd_wksp_private_pinfo_idx( pinfo[ k ].parent_cidx )==j ); /* Make sure correct parent */
543 81 : j = k;
544 81 : }
545 :
546 : /* Recurse on the right subtree */
547 :
548 147 : p = i;
549 147 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
550 147 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
551 60 : TEST( i<part_max ); /* Validate i */
552 60 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
553 60 : }
554 :
555 147 : } else {
556 :
557 147 : TEST( i<part_max ); /* Validate i */
558 :
559 : /* At this point i and everything on the stack is validated.
560 : Push i to the stack and recurse on the left subtree. */
561 :
562 147 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
563 147 : s = i;
564 147 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
565 147 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
566 63 : TEST( i<part_max ); /* Validate i */
567 63 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
568 63 : }
569 147 : }
570 318 : }
571 :
572 24 : TEST( visit_cnt==free_cnt ); /* Make sure all free partitions in free treap */
573 :
574 24 : } while(0);
575 :
576 24 : # undef TEST
577 :
578 24 : return FD_WKSP_SUCCESS;
579 24 : }
580 :
581 : int
582 : fd_wksp_rebuild( fd_wksp_t * wksp,
583 480 : uint seed ) {
584 :
585 : /* Load the wksp metadata, don't rebuild if any of it looks even
586 : slightly off. */
587 :
588 480 : if( FD_UNLIKELY( !wksp ) ) {
589 0 : FD_LOG_WARNING(( "NULL wksp" ));
590 0 : return FD_WKSP_ERR_CORRUPT;
591 0 : }
592 :
593 480 : ulong magic = wksp->magic;
594 480 : ulong part_max = wksp->part_max;
595 480 : ulong data_max = wksp->data_max;
596 480 : ulong gaddr_lo = wksp->gaddr_lo;
597 480 : ulong gaddr_hi = wksp->gaddr_hi;
598 480 : ulong cycle_tag = wksp->cycle_tag;
599 :
600 : /* TODO: consider verifying owner */
601 :
602 480 : ulong footprint = fd_wksp_footprint( part_max, data_max );
603 480 : ulong gaddr_lo_exp = fd_wksp_private_data_off( part_max );
604 480 : ulong gaddr_hi_exp = gaddr_lo_exp + data_max;
605 480 : if( FD_UNLIKELY( (magic!=FD_WKSP_MAGIC) | (!footprint) | (!fd_shmem_name_len( wksp->name )) |
606 480 : (gaddr_lo!=gaddr_lo_exp) | (gaddr_hi!=gaddr_hi_exp) | (cycle_tag<4UL) ) ) {
607 0 : FD_LOG_WARNING(( "bad metadata\n\t"
608 0 : "magic %016lx (exp %016lx)\n\t"
609 0 : "part_max %lu data_max %lu (footprint %lu)\n\t"
610 0 : "gaddr_lo %lu (exp %lu)\n\t"
611 0 : "gaddr_hi %lu (exp %lu)\n\t"
612 0 : "cycle_tag %lu (exp>=4)",
613 0 : magic, FD_WKSP_MAGIC, part_max, data_max, footprint,
614 0 : gaddr_lo, gaddr_lo_exp, gaddr_hi, gaddr_hi_exp, cycle_tag ));
615 0 : return FD_WKSP_ERR_CORRUPT;
616 0 : }
617 :
618 : /* Scan the wksp pinfo and insert any used partitions into the used
619 : treap and put the rest on the idle stack. If there is any sign of
620 : corruption (empty, bad range or overlap between used partitions),
621 : we abort the rebuild (this is almost certainly data corruption of
622 : some form and we don't have enough info to resolve a conflict
623 : without potentially making the situation worse). We do the scan in
624 : reverse order to rebuild the idle stack in forward order.
625 :
626 : Note that we don't ever change the gaddr_lo,gaddr_hi of any tagged
627 : partitions such that operation is guaranteed to never change the
628 : single source of truth. As such, this operation can be interrupted
629 : and restarted arbitrarily safely.*/
630 :
631 480 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
632 :
633 480 : do {
634 480 : wksp->seed = seed;
635 480 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush idle stack */
636 480 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush used treap */
637 480 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush free treap */
638 :
639 480 : ulong i = part_max;
640 12770673 : while( i ) {
641 12770193 : i--;
642 :
643 : /* Ideally, heap priorities should just be a shuffling of the
644 : integers [0,part_max). fd_uint_hash will generate such a
645 : shuffling for part_max = 2^32. Using the lower 30 bits
646 : (reserving bit 31 for bulk operations) will yield something
647 : very close. We use seed to mix it up some more. */
648 :
649 12770193 : pinfo[ i ].in_same = 0U;
650 12770193 : pinfo[ i ].heap_prio = fd_uint_hash( seed ^ (uint)i ) & ((1U<<30)-1U);
651 12770193 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
652 12770193 : pinfo[ i ].cycle_tag = 0U;
653 :
654 12770193 : ulong tag = pinfo[ i ].tag;
655 12770193 : if( !tag ) { /* Not used ... make it available for reuse below */
656 12767994 : fd_wksp_private_idle_stack_push( i, wksp, pinfo );
657 12767994 : continue;
658 12767994 : }
659 :
660 2199 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
661 2199 : pinfo[ i ].next_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
662 :
663 2199 : if( FD_UNLIKELY( fd_wksp_private_used_treap_insert( i, wksp, pinfo ) ) ) return FD_WKSP_ERR_CORRUPT; /* Logs details */
664 2199 : }
665 480 : } while(0);
666 :
667 : /* At this point, a partition is either in the idle stack or used
668 : treap. Further, we have:
669 :
670 : | used | idle
671 : ----------+----------------------------+--------
672 : gaddr_* | non-empty range | 0
673 : | no overlap with other used | 0
674 : tag | non-zero | 0
675 : in_same | 0 | 0
676 : heap_prio | randomized | randomized
677 : prev | NULL | NULL
678 : next | NULL | NULL
679 : left | used treap managed | NULL
680 : right | used treap managed | NULL
681 : same | used treap managed (NULL) | NULL
682 : parent | used treap managed | idle stack managed
683 : stack | wksp managed | wksp managed
684 : cycle_tag | wksp managed | wksp managed
685 :
686 : In-order traverse the used treap to rebuild the partitioning and
687 : the free treap. */
688 :
689 480 : do {
690 480 : uint * j_next_cidx_ptr = &wksp->part_head_cidx; /* Location of most recently added partition next link */
691 :
692 480 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL; /* Most recently added partition */
693 480 : ulong g0 = gaddr_lo; /* Most recently added partition end */
694 :
695 480 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
696 480 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
697 4878 : for(;;) {
698 4878 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
699 2679 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
700 :
701 : /* Pop traversal stack */
702 :
703 2199 : i = s;
704 2199 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
705 :
706 : /* Visit i */
707 :
708 2199 : ulong g1 = pinfo[ i ].gaddr_lo;
709 2199 : if( g1 > g0 ) { /* There's a gap between i and the most recently added partition */
710 :
711 : /* Acquire an idle partition to hold the gap */
712 :
713 1506 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
714 0 : FD_LOG_WARNING(( "part_max (%lu) too small to fill gap before partition %lu (tag %lu gaddr_lo %lu gaddr_hi %lu)",
715 0 : part_max, i, pinfo[i].tag, pinfo[i].gaddr_lo, pinfo[i].gaddr_hi ));
716 0 : return FD_WKSP_ERR_CORRUPT;
717 0 : }
718 1506 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
719 :
720 : /* Populate the acquired partition with the gap details,
721 : append it to the wksp partitioning and insert it into the
722 : free treap. Note that stack_push/pop reset gaddr_lo,
723 : gaddr_hi, tag, in_same, {prev, next, left, right, same,
724 : parent}_cidx. It preserved heap_prio from its original
725 : assignment and didn't touch stack_cidx or cycle_tag. */
726 :
727 1506 : pinfo[ k ].gaddr_lo = g0;
728 1506 : pinfo[ k ].gaddr_hi = g1;
729 1506 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
730 1506 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
731 1506 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
732 1506 : j = k;
733 1506 : g0 = g1;
734 :
735 1506 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
736 1506 : }
737 :
738 : /* Add i to the partitioning. */
739 :
740 2199 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
741 2199 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( i );
742 2199 : j_next_cidx_ptr = &pinfo[ i ].next_cidx;
743 2199 : j = i;
744 2199 : g0 = pinfo[ i ].gaddr_hi;
745 :
746 : /* Traverse the right subtree */
747 :
748 2199 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
749 :
750 2199 : } else {
751 :
752 : /* Push i to the stack and recurse on the left subtree. */
753 :
754 2199 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
755 2199 : s = i;
756 2199 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
757 :
758 2199 : }
759 4878 : }
760 :
761 480 : if( g0 < gaddr_hi ) { /* Have final gap to fill */
762 :
763 : /* This works the same as the above */
764 :
765 480 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
766 0 : FD_LOG_WARNING(( "part_max (%lu) too small to complete partitioning", part_max ));
767 0 : return FD_WKSP_ERR_CORRUPT;
768 0 : }
769 480 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
770 :
771 480 : pinfo[ k ].gaddr_lo = g0;
772 480 : pinfo[ k ].gaddr_hi = gaddr_hi;
773 480 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
774 480 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
775 480 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
776 480 : j = k;
777 : //g0 = gaddr_hi;
778 :
779 480 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
780 480 : }
781 :
782 480 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( j );
783 :
784 480 : } while(0);
785 :
786 480 : return FD_WKSP_SUCCESS;
787 480 : }
|