Line data Source code
1 : #include "fd_wksp_private.h"
2 : #include <sys/mman.h>
3 : #include <errno.h>
4 :
5 : int
6 587976717 : fd_wksp_private_lock( fd_wksp_t * wksp ) {
7 : # if FD_WKSP_LOCK_RECLAIM
8 : int warning = 0;
9 : #endif
10 587976717 : ulong me = fd_log_group_id();
11 :
12 587976717 : ulong * _owner = &wksp->owner;
13 587976717 : for(;;) {
14 :
15 : /* Note that we emulate CAS on platforms without FD_HAS_ATOMIC
16 : to minimize the amount of code differences we have to test. On
17 : platforms without FD_HAS_ATOMIC, a workspace should not be used
18 : concurrently though. */
19 :
20 587976717 : FD_COMPILER_MFENCE();
21 587976717 : # if FD_HAS_ATOMIC
22 587976717 : ulong pid = FD_ATOMIC_CAS( _owner, ULONG_MAX, me );
23 : # else
24 : ulong pid = FD_VOLATILE_CONST( *_owner );
25 : if( pid==ULONG_MAX ) FD_VOLATILE( *_owner ) = me;
26 : # endif
27 587976717 : FD_COMPILER_MFENCE();
28 :
29 587976717 : if( FD_LIKELY( pid==ULONG_MAX ) ) return FD_WKSP_SUCCESS;
30 :
31 : # if FD_WKSP_LOCK_RECLAIM
32 : int status = fd_log_group_id_query( pid );
33 : if( FD_UNLIKELY( status==FD_LOG_GROUP_ID_QUERY_DEAD ) ) { /* A process died while holding the lock, try to recover the lock */
34 :
35 : FD_COMPILER_MFENCE();
36 : # if FD_HAS_ATOMIC
37 : ulong cur = FD_ATOMIC_CAS( _owner, pid, me );
38 : # else
39 : ulong cur = FD_VOLATILE_CONST( *_owner );
40 : if( cur==pid ) FD_VOLATILE( *_owner ) = me;
41 : # endif
42 : FD_COMPILER_MFENCE();
43 :
44 : if( FD_LIKELY( cur==pid ) ) { /* We recovered the lock from the dead pid, try to fix up incomplete ops */
45 :
46 : FD_LOG_WARNING(( "Process %lu died in an operation on wksp %s; verifying", pid, wksp->name ));
47 : if( FD_LIKELY( !fd_wksp_verify( wksp ) ) ) { /* logs details of issues detected */
48 : FD_LOG_NOTICE(( "wksp verified" ));
49 : return FD_WKSP_SUCCESS;
50 : }
51 :
52 : FD_LOG_WARNING(( "Issues detected; rebuilding" ));
53 : if( FD_UNLIKELY( fd_wksp_rebuild( wksp, wksp->seed ) ) ) { /* Rebuild failed (logs details of issues detected) */
54 : /* Return control of the lock to the previous owner */
55 : FD_COMPILER_MFENCE();
56 : FD_VOLATILE( *_owner ) = pid;
57 : FD_COMPILER_MFENCE();
58 : FD_LOG_WARNING(( "corrupt wksp detected" ));
59 : return FD_WKSP_ERR_CORRUPT;
60 : }
61 :
62 : FD_LOG_NOTICE(( "wksp rebuilt" ));
63 : return FD_WKSP_SUCCESS;
64 :
65 : }
66 :
67 : /* Somebody beat us to recovering the lock ... try again */
68 :
69 : } else if( FD_UNLIKELY( status!=FD_LOG_GROUP_ID_QUERY_LIVE ) ) { /* Unclear pid status ... issue a warning and try again */
70 :
71 : if( FD_UNLIKELY( !warning ) ) {
72 : FD_LOG_WARNING(( "wksp %s is owned by unknown pid %li; attempting to recover", wksp->name, pid ));
73 : warning = 1;
74 : }
75 :
76 : }
77 :
78 : /* At this point, either another thread in this process has the
79 : lock, another active thread in another process has the lock,
80 : another unknown status thread in other process has the lock or
81 : another thread beat us to reclaim the lock from a dead process.
82 : In any case, we don't have the lock. Wait a while to limit O/S
83 : contention and try again. */
84 :
85 : FD_YIELD();
86 : # else
87 :
88 : /* If we are running without FD_WKSP_LOCK_RECLAIM then it is assumed
89 : that the contention is caused by a tile pinned to another core,
90 : and that this core is itself pinned so spin locking is best. */
91 0 : FD_SPIN_PAUSE();
92 :
93 0 : #endif
94 0 : }
95 :
96 : /* never get here */
97 587976717 : }
98 :
99 : /* Public APIs ********************************************************/
100 :
101 : ulong
102 : fd_wksp_part_max_est( ulong footprint,
103 558 : ulong sz_typical ) {
104 558 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
105 558 : ulong data_end = footprint - 1UL;
106 558 : ulong pinfo_off = fd_wksp_private_pinfo_off();
107 558 : ulong consumed = sizeof(fd_wksp_private_pinfo_t) + sz_typical;
108 558 : ulong part_max = (data_end - pinfo_off) / (consumed + (ulong)!consumed); /* avoid div-by-zero */
109 558 : if( FD_UNLIKELY( (!footprint) | (!sz_typical) | (sz_typical>consumed) | (pinfo_off>data_end) ) ) return 0UL;
110 549 : return fd_ulong_min( part_max, FD_WKSP_PRIVATE_PINFO_IDX_NULL );
111 558 : }
112 :
113 : ulong
114 : fd_wksp_data_max_est( ulong footprint,
115 567 : ulong part_max ) {
116 567 : footprint = fd_ulong_align_dn( footprint, FD_WKSP_ALIGN );
117 567 : ulong data_end = footprint - 1UL;
118 567 : ulong data_off = fd_wksp_private_data_off( part_max );
119 567 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) |
120 567 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* covered above */
121 567 : (!footprint) | (data_off>=data_end) ) ) return 0UL;
122 549 : return data_end - data_off;
123 567 : }
124 :
125 : ulong
126 3 : fd_wksp_align( void ) {
127 3 : return FD_WKSP_ALIGN;
128 3 : }
129 :
130 : ulong
131 : fd_wksp_footprint( ulong part_max,
132 2850 : ulong data_max ) {
133 2850 : ulong data_off = fd_wksp_private_data_off( part_max );
134 2850 : if( FD_UNLIKELY( (!part_max) | (part_max>FD_WKSP_PRIVATE_PINFO_IDX_NULL) | (!data_max) |
135 2850 : (part_max > ((ULONG_MAX - fd_wksp_private_pinfo_off())/sizeof(fd_wksp_private_pinfo_t))) | /* Covered above */
136 2850 : (data_max > (ULONG_MAX - FD_WKSP_ALIGN + 1UL - data_off - 1UL) ) ) ) return 0UL;
137 2781 : return fd_ulong_align_up( data_off + data_max + 1UL, FD_WKSP_ALIGN );
138 2850 : }
139 :
140 : void *
141 : fd_wksp_new( void * shmem,
142 : char const * name,
143 : uint seed,
144 : ulong part_max,
145 552 : ulong data_max ) {
146 552 : fd_wksp_t * wksp = (fd_wksp_t *)shmem;
147 :
148 552 : if( FD_UNLIKELY( !wksp ) ) {
149 3 : FD_LOG_WARNING(( "NULL shmem" ));
150 3 : return NULL;
151 3 : }
152 :
153 549 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
154 3 : FD_LOG_WARNING(( "bad align" ));
155 3 : return NULL;
156 3 : }
157 :
158 546 : ulong name_len = fd_shmem_name_len( name );
159 546 : if( FD_UNLIKELY( !name_len ) ) {
160 3 : FD_LOG_WARNING(( "bad name" ));
161 3 : return NULL;
162 3 : }
163 :
164 543 : ulong footprint = fd_wksp_footprint( part_max, data_max );
165 543 : if( FD_UNLIKELY( !footprint ) ) {
166 12 : FD_LOG_WARNING(( "bad part_max and/or data_max" ));
167 12 : return NULL;
168 12 : }
169 :
170 531 : fd_memset( wksp, 0, fd_wksp_footprint( part_max, 1UL ) );
171 :
172 531 : wksp->part_max = part_max;
173 531 : wksp->data_max = data_max;
174 531 : wksp->gaddr_lo = fd_wksp_private_data_off( part_max );
175 531 : wksp->gaddr_hi = wksp->gaddr_lo + data_max;
176 531 : fd_memcpy( wksp->name, name, name_len+1UL );
177 531 : wksp->seed = seed;
178 531 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
179 531 : wksp->part_head_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
180 531 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
181 531 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
182 531 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
183 531 : wksp->cycle_tag = 4UL; /* Verify uses tags 0-3 */
184 531 : wksp->owner = 0UL; /* Mark as locked and in construction */
185 :
186 : /* Note that wksp->owner was set to zero above, "locking" the wksp by
187 : group_id 0. And the memset above set all the partition tags to
188 : zero such that there are no allocated partitions. So once we set
189 : magic below, we can finish the initialization by rebuilding and
190 : unlocking. Since fd_log_group_id is non-zero, the zero owner
191 : indicates to any remote observer of the shared memory region that
192 : the wksp is being built for the first time. */
193 :
194 531 : FD_COMPILER_MFENCE();
195 531 : FD_VOLATILE( wksp->magic ) = FD_WKSP_MAGIC;
196 531 : FD_COMPILER_MFENCE();
197 :
198 531 : int err = fd_wksp_rebuild( wksp, seed );
199 531 : if( FD_UNLIKELY( err ) ) { /* Should be impossible at this point */
200 :
201 0 : FD_COMPILER_MFENCE();
202 0 : FD_VOLATILE( wksp->magic ) = 0UL;
203 0 : FD_COMPILER_MFENCE();
204 :
205 0 : FD_LOG_WARNING(( "fd_wksp_rebuild failed (%i-%s)", err, fd_wksp_strerror( err ) ));
206 0 : return NULL;
207 0 : }
208 :
209 : #if FD_HAS_DEEPASAN
210 : /* Poison entire workspace except wksp header and the pinfo array. */
211 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
212 : fd_asan_poison( wksp_data, footprint - fd_wksp_private_pinfo_off() );
213 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
214 : for( ulong i=0; i<part_max; i++ ) {
215 : fd_asan_unpoison( &pinfo[ i ], FD_WKSP_PRIVATE_PINFO_FOOTPRINT );
216 : }
217 : #endif
218 :
219 531 : fd_wksp_private_unlock( wksp );
220 :
221 531 : return wksp;
222 531 : }
223 :
224 : fd_wksp_t *
225 2346 : fd_wksp_join( void * shwksp ) {
226 2346 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
227 :
228 2346 : if( FD_UNLIKELY( !wksp ) ) {
229 3 : FD_LOG_WARNING(( "NULL shwksp" ));
230 3 : return NULL;
231 3 : }
232 :
233 2343 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
234 3 : FD_LOG_WARNING(( "bad align" ));
235 3 : return NULL;
236 3 : }
237 :
238 2340 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
239 3 : FD_LOG_WARNING(( "bad magic" ));
240 3 : return NULL;
241 3 : }
242 :
243 2337 : return wksp;
244 2340 : }
245 :
246 : void *
247 2241 : fd_wksp_leave( fd_wksp_t * wksp ) {
248 2241 : if( FD_UNLIKELY( !wksp ) ) {
249 3 : FD_LOG_WARNING(( "NULL wksp" ));
250 3 : return NULL;
251 3 : }
252 :
253 2238 : return (void *)wksp;
254 2241 : }
255 :
256 : void *
257 531 : fd_wksp_delete( void * shwksp ) {
258 531 : fd_wksp_t * wksp = (fd_wksp_t *)shwksp;
259 :
260 531 : if( FD_UNLIKELY( !wksp ) ) {
261 3 : FD_LOG_WARNING(( "NULL shwksp" ));
262 3 : return NULL;
263 3 : }
264 :
265 528 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
266 3 : FD_LOG_WARNING(( "bad align" ));
267 3 : return NULL;
268 3 : }
269 :
270 525 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
271 3 : FD_LOG_WARNING(( "bad magic" ));
272 3 : return NULL;
273 3 : }
274 :
275 : /* TODO: consider testing owner */
276 :
277 522 : FD_COMPILER_MFENCE();
278 522 : FD_VOLATILE( wksp->magic ) = 0UL;
279 522 : FD_COMPILER_MFENCE();
280 :
281 : # if FD_HAS_DEEPASAN
282 : /* Unpoison entire wksp region. */
283 : ulong footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
284 : void * wksp_data = (void*)((ulong)wksp + fd_wksp_private_pinfo_off());
285 : fd_asan_unpoison( wksp_data, footprint - fd_wksp_private_pinfo_off());
286 : # endif
287 :
288 522 : return wksp;
289 525 : }
290 :
291 3 : char const * fd_wksp_name ( fd_wksp_t const * wksp ) { return wksp->name; }
292 111 : uint fd_wksp_seed ( fd_wksp_t const * wksp ) { return wksp->seed; }
293 3 : ulong fd_wksp_part_max( fd_wksp_t const * wksp ) { return wksp->part_max; }
294 3 : ulong fd_wksp_data_max( fd_wksp_t const * wksp ) { return wksp->data_max; }
295 :
296 : ulong
297 0 : fd_wksp_owner( fd_wksp_t const * wksp ) {
298 0 : FD_COMPILER_MFENCE();
299 0 : ulong owner = FD_VOLATILE_CONST( wksp->owner );
300 0 : FD_COMPILER_MFENCE();
301 0 : return owner;
302 0 : }
303 :
304 : char const *
305 48 : fd_wksp_strerror( int err ) {
306 48 : switch( err ) {
307 3 : case FD_WKSP_SUCCESS: return "success";
308 12 : case FD_WKSP_ERR_INVAL: return "inval";
309 30 : case FD_WKSP_ERR_FAIL: return "fail";
310 3 : case FD_WKSP_ERR_CORRUPT: return "corrupt";
311 0 : default: break;
312 48 : }
313 0 : return "unknown";
314 48 : }
315 :
316 : int
317 51 : fd_wksp_verify( fd_wksp_t * wksp ) {
318 :
319 430491 : # define TEST(c) do { \
320 430491 : if( FD_UNLIKELY( !(c) ) ) { FD_LOG_WARNING(( "FAIL: %s", #c )); return FD_WKSP_ERR_CORRUPT; } \
321 430491 : } while(0)
322 :
323 : /* Validate metadata */
324 :
325 51 : TEST( wksp );
326 51 : TEST( wksp->magic==FD_WKSP_MAGIC );
327 :
328 51 : ulong part_max = wksp->part_max;
329 51 : ulong data_max = wksp->data_max;
330 51 : TEST( fd_wksp_footprint( part_max, data_max ) );
331 :
332 51 : ulong gaddr_lo = wksp->gaddr_lo; TEST( gaddr_lo==fd_wksp_private_data_off( part_max ) );
333 51 : ulong gaddr_hi = wksp->gaddr_hi; TEST( gaddr_hi==gaddr_lo+data_max );
334 :
335 51 : TEST( fd_shmem_name_len( wksp->name ) );
336 :
337 : /* seed is arbitrary */
338 :
339 51 : TEST( wksp->cycle_tag >= 4UL );
340 :
341 : /* TODO: consider verifying owner */
342 :
343 51 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
344 :
345 : /* Clear out cycle tags */
346 :
347 201381 : for( ulong i=0UL; i<part_max; i++ ) pinfo[ i ].cycle_tag = 0UL;
348 :
349 : /* Verify the idle stack */
350 :
351 51 : ulong idle_cnt = 0UL;
352 :
353 51 : do {
354 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->idle_top_cidx );
355 199071 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
356 :
357 : /* Visit i. Note that i has not been validated yet. */
358 :
359 199020 : TEST( i<part_max ); /* Validate i */
360 199020 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
361 199020 : pinfo[ i ].cycle_tag = 1UL; /* Mark as visited in idle stack */
362 199020 : idle_cnt++; /* Update the idle cnt */
363 :
364 : /* Advance to the next idle */
365 :
366 199020 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx );
367 199020 : }
368 51 : } while(0);
369 :
370 : /* Idle stack looks intact, verify partitioning */
371 :
372 51 : ulong free_cnt = 0UL;
373 51 : ulong used_cnt = 0UL;
374 :
375 51 : do {
376 51 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
377 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_head_cidx );
378 51 : ulong g = gaddr_lo;
379 :
380 51 : int last_free = 0;
381 :
382 2361 : while( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
383 :
384 : /* At this point, we last visited j. Visit i. Note that j has
385 : been validated but i has not. */
386 :
387 2310 : TEST( i<part_max ); /* Validate i */
388 2310 : TEST( pinfo[ i ].gaddr_lo==g ); /* Make sure partition is tightly adjacent to previous */
389 2310 : TEST( pinfo[ i ].gaddr_hi> g ); /* Make sure partition size is non-zero */
390 2310 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].prev_cidx )==j ); /* Make sure correct prev partition */
391 2310 : TEST( !pinfo[ i ].cycle_tag ); /* Make sure not visited before */
392 2310 : pinfo[ i ].cycle_tag = 2UL; /* Mark as visited in partitioning */
393 :
394 2310 : g = pinfo[ i ].gaddr_hi; /* Extract where the next partition should start */
395 2310 : int is_free = !pinfo[ i ].tag; /* Determine if this partition is free or used */
396 2310 : TEST( !(last_free & is_free) ); /* Make sure no adjacent free partitions */
397 2310 : free_cnt += (ulong) is_free; /* Update the free cnt */
398 2310 : used_cnt += (ulong)!is_free; /* Update the used cnt */
399 :
400 : /* Advance to the next partition */
401 :
402 2310 : last_free = is_free;
403 :
404 2310 : j = i;
405 2310 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].next_cidx );
406 2310 : }
407 :
408 51 : TEST( fd_wksp_private_pinfo_idx( wksp->part_tail_cidx )==j ); /* Make sure correct partition tail */
409 51 : TEST( g==gaddr_hi ); /* Make sure complete partitioning */
410 51 : TEST( (idle_cnt + free_cnt + used_cnt)==part_max ); /* Make sure no lost idle partitions */
411 51 : } while(0);
412 :
413 : /* Idle stack and partitioning look intact, validate used treap */
414 :
415 51 : do {
416 51 : ulong visit_cnt = 0UL;
417 :
418 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
419 51 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
420 51 : ulong g = gaddr_lo;
421 :
422 51 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
423 39 : TEST( i<part_max ); /* Validate i */
424 39 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
425 39 : }
426 :
427 2691 : for(;;) {
428 :
429 : /* At this point i is and everything on stack is validated */
430 :
431 2691 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
432 1371 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
433 :
434 : /* Pop stack */
435 :
436 1320 : i = s;
437 1320 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
438 :
439 : /* Visit i */
440 :
441 1320 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
442 :
443 1320 : TEST( pinfo[ i ].gaddr_lo>=g ); /* Make sure this starts after last visited */
444 1320 : TEST( pinfo[ i ].tag ); /* Make sure tagged as a used partition */
445 1320 : TEST( pinfo[ i ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
446 1320 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) /* Make sure heap property satisfied */
447 1281 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
448 :
449 1320 : TEST( !pinfo[ i ].in_same ); /* Make sure unique */
450 1320 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ i ].same_cidx ) ) ); /* " */
451 :
452 1320 : pinfo[ i ].cycle_tag = 3UL; /* Mark as visited this traversal */
453 1320 : visit_cnt++; /* Update the visit cnt */
454 1320 : g = pinfo[ i ].gaddr_hi; /* Get minimum start for next partition */
455 :
456 : /* Traverse the right subtree */
457 :
458 1320 : p = i;
459 1320 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
460 1320 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
461 672 : TEST( i<part_max ); /* Validate i */
462 672 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
463 672 : }
464 :
465 1320 : } else {
466 :
467 : /* At this point i and everything on the stack is validated.
468 : Push i to the stack and recurse on the left subtree. */
469 :
470 1320 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
471 1320 : s = i;
472 1320 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
473 1320 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
474 609 : TEST( i<part_max ); /* Validate i */
475 609 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
476 609 : }
477 :
478 1320 : }
479 2691 : }
480 :
481 51 : TEST( visit_cnt==used_cnt ); /* Make sure all used partitions in used treap */
482 51 : } while(0);
483 :
484 : /* Idle stack, partitioning and used treap look intact, validate the
485 : free treap. */
486 :
487 51 : do {
488 51 : ulong visit_cnt = 0UL;
489 :
490 51 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_free_cidx );
491 51 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
492 51 : ulong sz = 0UL;
493 :
494 51 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
495 51 : TEST( i<part_max ); /* Validate i */
496 51 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
497 51 : }
498 :
499 1209 : for(;;) {
500 :
501 : /* At this point i and everything on the stack is validated */
502 :
503 1209 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
504 630 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
505 :
506 : /* Pop stack */
507 :
508 579 : i = s;
509 579 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
510 :
511 : /* Visit i */
512 :
513 579 : ulong p = fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx ); /* Extract the parent */
514 579 : ulong isz = fd_wksp_private_pinfo_sz( pinfo + i ); /* Extract the size */
515 :
516 579 : TEST( isz>sz ); /* Make sure this partition i larger than previous */
517 579 : TEST( !pinfo[ i ].tag ); /* Make sure tagged as a free partition */
518 579 : TEST( !pinfo[ i ].in_same ); /* Make sure marked as not in same */
519 :
520 579 : if( !fd_wksp_private_pinfo_idx_is_null( p ) ) { /* Make sure heap property satisfied */
521 528 : TEST( pinfo[ p ].heap_prio >= pinfo[ i ].heap_prio );
522 528 : }
523 :
524 579 : sz = isz; /* Update largest size partition seen so far */
525 :
526 : /* Traverse all same sized partitions */
527 :
528 579 : ulong j = i;
529 990 : for(;;) {
530 :
531 : /* At this point, j is validated */
532 :
533 990 : TEST( pinfo[ j ].cycle_tag==2UL ); /* Make sure in partitioning and not visited yet this traversal */
534 990 : pinfo[ j ].cycle_tag = 3UL; /* Mark as visited this traversal */
535 990 : visit_cnt++;
536 :
537 990 : ulong k = fd_wksp_private_pinfo_idx( pinfo[ j ].same_cidx ); /* Get the next same sized */
538 990 : if( fd_wksp_private_pinfo_idx_is_null( k ) ) break; /* If no more, we are done with this node */
539 411 : TEST( k<part_max ); /* Make sure valid index */
540 411 : TEST( fd_wksp_private_pinfo_sz( pinfo + k )==sz ); /* Make sure same size */
541 411 : TEST( pinfo[ k ].in_same ); /* Make sure marked as in same */
542 411 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].left_cidx ) ) );
543 411 : TEST( fd_wksp_private_pinfo_idx_is_null( fd_wksp_private_pinfo_idx( pinfo[ k ].right_cidx ) ) );
544 411 : TEST( fd_wksp_private_pinfo_idx( pinfo[ k ].parent_cidx )==j ); /* Make sure correct parent */
545 411 : j = k;
546 411 : }
547 :
548 : /* Recurse on the right subtree */
549 :
550 579 : p = i;
551 579 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
552 579 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
553 270 : TEST( i<part_max ); /* Validate i */
554 270 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==p ); /* Validate parent */
555 270 : }
556 :
557 579 : } else {
558 :
559 579 : TEST( i<part_max ); /* Validate i */
560 :
561 : /* At this point i and everything on the stack is validated.
562 : Push i to the stack and recurse on the left subtree. */
563 :
564 579 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
565 579 : s = i;
566 579 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
567 579 : if( !fd_wksp_private_pinfo_idx_is_null( i ) ) {
568 258 : TEST( i<part_max ); /* Validate i */
569 258 : TEST( fd_wksp_private_pinfo_idx( pinfo[ i ].parent_cidx )==s ); /* Validate parent */
570 258 : }
571 579 : }
572 1209 : }
573 :
574 51 : TEST( visit_cnt==free_cnt ); /* Make sure all free partitions in free treap */
575 :
576 51 : } while(0);
577 :
578 51 : # undef TEST
579 :
580 51 : return FD_WKSP_SUCCESS;
581 51 : }
582 :
583 : int
584 : fd_wksp_rebuild( fd_wksp_t * wksp,
585 1215 : uint seed ) {
586 :
587 : /* Load the wksp metadata, don't rebuild if any of it looks even
588 : slightly off. */
589 :
590 1215 : if( FD_UNLIKELY( !wksp ) ) {
591 0 : FD_LOG_WARNING(( "NULL wksp" ));
592 0 : return FD_WKSP_ERR_CORRUPT;
593 0 : }
594 :
595 1215 : ulong magic = wksp->magic;
596 1215 : ulong part_max = wksp->part_max;
597 1215 : ulong data_max = wksp->data_max;
598 1215 : ulong gaddr_lo = wksp->gaddr_lo;
599 1215 : ulong gaddr_hi = wksp->gaddr_hi;
600 1215 : ulong cycle_tag = wksp->cycle_tag;
601 :
602 : /* TODO: consider verifying owner */
603 :
604 1215 : ulong footprint = fd_wksp_footprint( part_max, data_max );
605 1215 : ulong gaddr_lo_exp = fd_wksp_private_data_off( part_max );
606 1215 : ulong gaddr_hi_exp = gaddr_lo_exp + data_max;
607 1215 : if( FD_UNLIKELY( (magic!=FD_WKSP_MAGIC) | (!footprint) | (!fd_shmem_name_len( wksp->name )) |
608 1215 : (gaddr_lo!=gaddr_lo_exp) | (gaddr_hi!=gaddr_hi_exp) | (cycle_tag<4UL) ) ) {
609 0 : FD_LOG_WARNING(( "bad metadata\n\t"
610 0 : "magic %016lx (exp %016lx)\n\t"
611 0 : "part_max %lu data_max %lu (footprint %lu)\n\t"
612 0 : "gaddr_lo %lu (exp %lu)\n\t"
613 0 : "gaddr_hi %lu (exp %lu)\n\t"
614 0 : "cycle_tag %lu (exp>=4)",
615 0 : magic, FD_WKSP_MAGIC, part_max, data_max, footprint,
616 0 : gaddr_lo, gaddr_lo_exp, gaddr_hi, gaddr_hi_exp, cycle_tag ));
617 0 : return FD_WKSP_ERR_CORRUPT;
618 0 : }
619 :
620 : /* Scan the wksp pinfo and insert any used partitions into the used
621 : treap and put the rest on the idle stack. If there is any sign of
622 : corruption (empty, bad range or overlap between used partitions),
623 : we abort the rebuild (this is almost certainly data corruption of
624 : some form and we don't have enough info to resolve a conflict
625 : without potentially making the situation worse). We do the scan in
626 : reverse order to rebuild the idle stack in forward order.
627 :
628 : Note that we don't ever change the gaddr_lo,gaddr_hi of any tagged
629 : partitions such that operation is guaranteed to never change the
630 : single source of truth. As such, this operation can be interrupted
631 : and restarted arbitrarily safely.*/
632 :
633 1215 : fd_wksp_private_pinfo_t * pinfo = fd_wksp_private_pinfo( wksp );
634 :
635 1215 : do {
636 1215 : wksp->seed = seed;
637 1215 : wksp->idle_top_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush idle stack */
638 1215 : wksp->part_used_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush used treap */
639 1215 : wksp->part_free_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL ); /* Flush free treap */
640 :
641 1215 : ulong i = part_max;
642 35916186 : while( i ) {
643 35914971 : i--;
644 :
645 : /* Ideally, heap priorities should just be a shuffling of the
646 : integers [0,part_max). fd_uint_hash will generate such a
647 : shuffling for part_max = 2^32. Using the lower 30 bits
648 : (reserving bit 31 for bulk operations) will yield something
649 : very close. We use seed to mix it up some more. */
650 :
651 35914971 : pinfo[ i ].in_same = 0U;
652 35914971 : pinfo[ i ].heap_prio = fd_uint_hash( seed ^ (uint)i ) & ((1U<<30)-1U);
653 35914971 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
654 35914971 : pinfo[ i ].cycle_tag = 0U;
655 :
656 35914971 : ulong tag = pinfo[ i ].tag;
657 35914971 : if( !tag ) { /* Not used ... make it available for reuse below */
658 35906103 : fd_wksp_private_idle_stack_push( i, wksp, pinfo );
659 35906103 : continue;
660 35906103 : }
661 :
662 8868 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
663 8868 : pinfo[ i ].next_cidx = fd_wksp_private_pinfo_cidx( FD_WKSP_PRIVATE_PINFO_IDX_NULL );
664 :
665 8868 : if( FD_UNLIKELY( fd_wksp_private_used_treap_insert( i, wksp, pinfo ) ) ) return FD_WKSP_ERR_CORRUPT; /* Logs details */
666 8868 : }
667 1215 : } while(0);
668 :
669 : /* At this point, a partition is either in the idle stack or used
670 : treap. Further, we have:
671 :
672 : | used | idle
673 : ----------+----------------------------+--------
674 : gaddr_* | non-empty range | 0
675 : | no overlap with other used | 0
676 : tag | non-zero | 0
677 : in_same | 0 | 0
678 : heap_prio | randomized | randomized
679 : prev | NULL | NULL
680 : next | NULL | NULL
681 : left | used treap managed | NULL
682 : right | used treap managed | NULL
683 : same | used treap managed (NULL) | NULL
684 : parent | used treap managed | idle stack managed
685 : stack | wksp managed | wksp managed
686 : cycle_tag | wksp managed | wksp managed
687 :
688 : In-order traverse the used treap to rebuild the partitioning and
689 : the free treap. */
690 :
691 1215 : do {
692 1215 : uint * j_next_cidx_ptr = &wksp->part_head_cidx; /* Location of most recently added partition next link */
693 :
694 1215 : ulong j = FD_WKSP_PRIVATE_PINFO_IDX_NULL; /* Most recently added partition */
695 1215 : ulong g0 = gaddr_lo; /* Most recently added partition end */
696 :
697 1215 : ulong i = fd_wksp_private_pinfo_idx( wksp->part_used_cidx );
698 1215 : ulong s = FD_WKSP_PRIVATE_PINFO_IDX_NULL;
699 18951 : for(;;) {
700 18951 : if( fd_wksp_private_pinfo_idx_is_null( i ) ) {
701 10083 : if( fd_wksp_private_pinfo_idx_is_null( s ) ) break; /* Done */
702 :
703 : /* Pop traversal stack */
704 :
705 8868 : i = s;
706 8868 : s = fd_wksp_private_pinfo_idx( pinfo[ i ].stack_cidx );
707 :
708 : /* Visit i */
709 :
710 8868 : ulong g1 = pinfo[ i ].gaddr_lo;
711 8868 : if( g1 > g0 ) { /* There's a gap between i and the most recently added partition */
712 :
713 : /* Acquire an idle partition to hold the gap */
714 :
715 6231 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
716 0 : FD_LOG_WARNING(( "part_max (%lu) too small to fill gap before partition %lu (tag %lu gaddr_lo %lu gaddr_hi %lu)",
717 0 : part_max, i, pinfo[i].tag, pinfo[i].gaddr_lo, pinfo[i].gaddr_hi ));
718 0 : return FD_WKSP_ERR_CORRUPT;
719 0 : }
720 6231 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
721 :
722 : /* Populate the acquired partition with the gap details,
723 : append it to the wksp partitioning and insert it into the
724 : free treap. Note that stack_push/pop reset gaddr_lo,
725 : gaddr_hi, tag, in_same, {prev, next, left, right, same,
726 : parent}_cidx. It preserved heap_prio from its original
727 : assignment and didn't touch stack_cidx or cycle_tag. */
728 :
729 6231 : pinfo[ k ].gaddr_lo = g0;
730 6231 : pinfo[ k ].gaddr_hi = g1;
731 6231 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
732 6231 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
733 6231 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
734 6231 : j = k;
735 6231 : g0 = g1;
736 :
737 6231 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
738 6231 : }
739 :
740 : /* Add i to the partitioning. */
741 :
742 8868 : pinfo[ i ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
743 8868 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( i );
744 8868 : j_next_cidx_ptr = &pinfo[ i ].next_cidx;
745 8868 : j = i;
746 8868 : g0 = pinfo[ i ].gaddr_hi;
747 :
748 : /* Traverse the right subtree */
749 :
750 8868 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].right_cidx );
751 :
752 8868 : } else {
753 :
754 : /* Push i to the stack and recurse on the left subtree. */
755 :
756 8868 : pinfo[ i ].stack_cidx = fd_wksp_private_pinfo_cidx( s );
757 8868 : s = i;
758 8868 : i = fd_wksp_private_pinfo_idx( pinfo[ i ].left_cidx );
759 :
760 8868 : }
761 18951 : }
762 :
763 1215 : if( g0 < gaddr_hi ) { /* Have final gap to fill */
764 :
765 : /* This works the same as the above */
766 :
767 1215 : if( FD_UNLIKELY( fd_wksp_private_idle_stack_is_empty( wksp ) ) ) {
768 0 : FD_LOG_WARNING(( "part_max (%lu) too small to complete partitioning", part_max ));
769 0 : return FD_WKSP_ERR_CORRUPT;
770 0 : }
771 1215 : ulong k = fd_wksp_private_idle_stack_pop( wksp, pinfo );
772 :
773 1215 : pinfo[ k ].gaddr_lo = g0;
774 1215 : pinfo[ k ].gaddr_hi = gaddr_hi;
775 1215 : pinfo[ k ].prev_cidx = fd_wksp_private_pinfo_cidx( j );
776 1215 : *j_next_cidx_ptr = fd_wksp_private_pinfo_cidx( k );
777 1215 : j_next_cidx_ptr = &pinfo[ k ].next_cidx;
778 1215 : j = k;
779 : //g0 = gaddr_hi;
780 :
781 1215 : fd_wksp_private_free_treap_insert( j, wksp, pinfo );
782 1215 : }
783 :
784 1215 : wksp->part_tail_cidx = fd_wksp_private_pinfo_cidx( j );
785 :
786 1215 : } while(0);
787 :
788 1215 : return FD_WKSP_SUCCESS;
789 1215 : }
790 :
791 : void
792 0 : fd_wksp_mprotect( fd_wksp_t * wksp, int flag ) {
793 0 : if( FD_UNLIKELY( !wksp ) ) {
794 0 : FD_LOG_WARNING(( "NULL wksp" ));
795 0 : return;
796 0 : }
797 :
798 0 : if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)wksp, FD_WKSP_ALIGN ) ) ) {
799 0 : FD_LOG_WARNING(( "bad align" ));
800 0 : return;
801 0 : }
802 :
803 0 : if( FD_UNLIKELY( wksp->magic!=FD_WKSP_MAGIC ) ) {
804 0 : FD_LOG_WARNING(( "bad magic" ));
805 0 : return;
806 0 : }
807 :
808 0 : ulong wksp_footprint = fd_wksp_footprint( wksp->part_max, wksp->data_max );
809 0 : if( FD_UNLIKELY( mprotect( (void*)wksp, wksp_footprint, ( flag ? PROT_READ : (PROT_READ|PROT_WRITE) ) ) ) ) {
810 0 : FD_LOG_WARNING(( "mprotect failed (%i-%s)", errno, fd_io_strerror( errno ) ));
811 0 : return;
812 0 : }
813 0 : }
|